# 1. Imports
# ref: https://pyod.readthedocs.io/en/latest/pyod.models.html#all-models
import datetime
import os
import pickle
import time
import warnings

import numpy as np
import pandas as pd
import sklearn
# Silence library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Script-safe spelling of the notebook cell `%run ../functions_pyod2.py`
# (requires IPython). Names used later that are not defined in this file
# (`evaluate` and the detector classes) presumably come from that script
# -- TODO confirm.
get_ipython().run_line_magic('run', '../functions_pyod2.py')

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)

# Eight training sets and one shared test set.
df_train1 = pd.read_csv('~/Dropbox/Data/df_train1.csv')
df_train2 = pd.read_csv('~/Dropbox/Data/df_train2.csv')
df_train3 = pd.read_csv('~/Dropbox/Data/df_train3.csv')
df_train4 = pd.read_csv('~/Dropbox/Data/df_train4.csv')
df_train5 = pd.read_csv('~/Dropbox/Data/df_train5.csv')
df_train6 = pd.read_csv('~/Dropbox/Data/df_train6.csv')
df_train7 = pd.read_csv('~/Dropbox/Data/df_train7.csv')
df_train8 = pd.read_csv('~/Dropbox/Data/df_train8.csv')
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')

# Each training set stacked on top of the test set.
_df1 = pd.concat([df_train1, df_test])
_df2 = pd.concat([df_train2, df_test])
_df3 = pd.concat([df_train3, df_test])
_df4 = pd.concat([df_train4, df_test])
_df5 = pd.concat([df_train5, df_test])
_df6 = pd.concat([df_train6, df_test])
_df7 = pd.concat([df_train7, df_test])
_df8 = pd.concat([df_train8, df_test])

# Mean of `is_fraud` over each combined (train + test) frame; these values are
# later passed to the experiment as `throw_rate`.
_df1_mean = _df1.is_fraud.mean()
_df2_mean = _df2.is_fraud.mean()
_df3_mean = _df3.is_fraud.mean()
_df4_mean = _df4.is_fraud.mean()
_df5_mean = _df5.is_fraud.mean()
_df6_mean = _df6.is_fraud.mean()
_df7_mean = _df7.is_fraud.mean()
_df8_mean = _df8.is_fraud.mean()

# Inspection cell: shapes of the training sets and the test set.
(
    df_train1.shape, df_train2.shape, df_train3.shape, df_train4.shape,
    df_train5.shape, df_train6.shape, df_train7.shape, df_train8.shape,
    df_test.shape,
)
# Recorded output:
# ((734003, 22),
#  (420500, 22),
#  (84100, 22),
#  (42050, 22),
#  (21025, 22),
#  (14017, 22),
#  (10512, 22),
#  (8410, 22),
#  (314572, 22))

# Inspection cell: mean of `is_fraud` in each training set and in the test set.
(
    df_train1.is_fraud.mean(), df_train2.is_fraud.mean(),
    df_train3.is_fraud.mean(), df_train4.is_fraud.mean(),
    df_train5.is_fraud.mean(), df_train6.is_fraud.mean(),
    df_train7.is_fraud.mean(), df_train8.is_fraud.mean(),
    df_test.is_fraud.mean(),
)
# Recorded output:
# (0.005728859418830713,
#  0.01,
#  0.05,
#  0.1,
#  0.2,
#  0.29999286580580725,
#  0.4000190258751903,
#  0.5,
#  0.005725239372862174)

# Inspection cell: mean of `is_fraud` in each combined (train + test) frame.
(
    _df1_mean, _df2_mean, _df3_mean, _df4_mean,
    _df5_mean, _df6_mean, _df7_mean, _df8_mean,
)
# Recorded output:
# (0.005727773406766326,
#  0.00817062818336163,
#  0.015065015852630734,
#  0.01684136144152632,
#  0.017896465105468762,
#  0.018278152950950883,
#  0.01847522486495798,
#  0.018595463524283085)
# pyod_0503: same as the previous version; only the save location is changed to results2.
def pyod_0503(X, XX, y, yy, predictors, throw_rate):
    """Fit each detector, score it on the test split, and save a results table.

    Every estimator in ``predictors`` is fitted on ``(X, y)``, asked to
    predict ``XX``, and scored against ``yy`` with ``evaluate``. ``evaluate``
    is not defined in this part of the file; it must return a mapping with
    the keys ``'acc'``, ``'pre'``, ``'rec'``, ``'f1'`` and ``'auc'``. One row
    per estimator is collected into a DataFrame, which is written to
    ``../results2/<YYYYmmdd-HHMMSS>-pyod.csv`` and returned.

    Parameters
    ----------
    X : training features; must expose ``.columns``.
    XX : test features.
    y : training labels.
    yy : test labels.
    predictors : dict mapping a display name to an estimator offering
        ``fit(X, y)`` and ``predict(XX)``.
    throw_rate : value stored unchanged in the ``throw_rate`` column.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``predictors``. ``time`` holds the duration of
        ``fit`` only (prediction is not timed).
    """
    results_dir = '../results2'
    # Create the output directory up front so that a missing folder cannot
    # throw away the results after a long fitting run.
    os.makedirs(results_dir, exist_ok=True)

    # These describe the data split, not a model, so compute them once.
    n_train = len(y)
    n_test = len(yy)
    feature_names = list(X.columns)
    train_frate = np.array(y).mean()
    test_frate = np.array(yy).mean()

    rows = []
    for name, predictor in predictors.items():
        t1 = time.perf_counter()
        predictor.fit(X, y)
        t2 = time.perf_counter()
        yyhat = predictor.predict(XX)
        scores = evaluate(yy, yyhat)
        rows.append({
            'model': name,
            'time': t2 - t1,
            'acc': scores['acc'],
            'pre': scores['pre'],
            'rec': scores['rec'],
            'f1': scores['f1'],
            'auc': scores['auc'],
            'graph_based': False,
            'method': 'pyod',
            'throw_rate': throw_rate,
            'train_size': n_train,
            'train_cols': list(feature_names),  # fresh list per row
            'train_frate': train_frate,
            'test_size': n_test,
            'test_frate': test_frate,
            'hyper_params': None,
        })

    # Explicit column list keeps the layout stable even with no predictors.
    columns = [
        'model', 'time', 'acc', 'pre', 'rec', 'f1', 'auc', 'graph_based',
        'method', 'throw_rate', 'train_size', 'train_cols', 'train_frate',
        'test_size', 'test_frate', 'hyper_params',
    ]
    df_results = pd.DataFrame(rows, columns=columns)

    ymdhms = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'{results_dir}/{ymdhms}-pyod.csv', index=False)
    return df_results


def pyod_preprocess_0503(df_tr, df_tstn, _df_mean):
    """Build the argument tuple for ``pyod_0503`` from a train and a test frame.

    Only the ``amt`` column is used as a feature and ``is_fraud`` as the
    label. Every detector is created with ``contamination`` set to the mean
    of ``df_tr.is_fraud``. The detector classes are not defined in this part
    of the file (presumably provided by ``../functions_pyod2.py`` -- TODO
    confirm).

    Parameters
    ----------
    df_tr : training frame with ``amt`` and ``is_fraud`` columns.
    df_tstn : test frame with ``amt`` and ``is_fraud`` columns.
    _df_mean : value returned unchanged as ``throw_rate``.

    Returns
    -------
    tuple
        ``(X, XX, y, yy, predictors, throw_rate)``, in the positional order
        expected by ``pyod_0503``.
    """
    X = pd.DataFrame(df_tr['amt'])
    y = pd.DataFrame(df_tr['is_fraud'])
    XX = pd.DataFrame(df_tstn['amt'])
    yy = pd.DataFrame(df_tstn['is_fraud'])
    throw_rate = _df_mean
    fraud_ratio = df_tr.is_fraud.mean()
    # Commented-out entries are detectors that were disabled; the number of
    # '#' characters is kept as found (its meaning is not recorded here).
    # 'LODA' used to be listed twice; the duplicate key only built an extra
    # instance that was immediately discarded, so it is listed once now.
    predictors = {
        'ABOD': ABOD(contamination=fraud_ratio),
        # 'ALAD': ALAD(contamination=fraud_ratio),
        # 'AnoGAN': AnoGAN(contamination=fraud_ratio),
        # 'AutoEncoder':AutoEncoder(contamination=fraud_ratio),
        ## 'CBLOF': CBLOF(contamination=fraud_ratio,n_clusters=2),
        ## 'COF': COF(contamination=fraud_ratio),
        ## 'CD': CD(contamination=fraud_ratio),
        'COPOD': COPOD(contamination=fraud_ratio),
        # 'DeepSVDD': DeepSVDD(contamination=fraud_ratio),
        # 'DIF': DIF(contamination=fraud_ratio),
        'ECOD': ECOD(contamination=fraud_ratio),
        # 'FeatureBagging': FeatureBagging(contamination=fraud_ratio),
        'GMM': GMM(contamination=fraud_ratio),
        'HBOS': HBOS(contamination=fraud_ratio),
        'IForest': IForest(contamination=fraud_ratio),
        'INNE': INNE(contamination=fraud_ratio),
        'KDE': KDE(contamination=fraud_ratio),
        'KNN': KNN(contamination=fraud_ratio),
        #### 'KPCA': KPCA(contamination=fraud_ratio),
        # 'PyODKernelPCA': PyODKernelPCA(contamination=fraud_ratio),
        ## 'LMDD': LMDD(contamination=fraud_ratio),
        'LODA': LODA(contamination=fraud_ratio),
        'LOF': LOF(contamination=fraud_ratio),
        #### 'LOCI': LOCI(contamination=fraud_ratio),
        # 'LUNAR': LUNAR(contamination=fraud_ratio),
        # 'LSCP': LSCP(contamination=fraud_ratio),
        'MAD': MAD(contamination=fraud_ratio),
        'MCD': MCD(contamination=fraud_ratio),
        # 'MO_GAAL': MO_GAAL(contamination=fraud_ratio),
        'OCSVM': OCSVM(contamination=fraud_ratio),
        'PCA': PCA(contamination=fraud_ratio),
        ### 'QMCD': QMCD(contamination=fraud_ratio),
        #### 'RGraph': RGraph(contamination=fraud_ratio),
        'ROD': ROD(contamination=fraud_ratio),
        ## 'Sampling': Sampling(contamination=fraud_ratio),
        ## 'SOD': SOD(contamination=fraud_ratio),
        # 'SO_GAAL': SO_GAAL(contamination=fraud_ratio),
        #### 'SOS': SOS(contamination=fraud_ratio),
        # 'SUOD': SUOD(contamination=fraud_ratio),
        # 'VAE': VAE(contamination=fraud_ratio),
        # 'XGBOD': XGBOD(contamination=fraud_ratio),
    }
    return X, XX, y, yy, predictors, throw_rate


# Recorded note for this run: took more than 9 hours.
pyod_0503(*pyod_preprocess_0503(df_train1, df_test, _df1_mean))
# Run the same experiment on the remaining training sets, always against the
# shared test set. Each call also writes its own CSV (see pyod_0503).

# Recorded note for this run: took more than 4 hours.
pyod_0503(*pyod_preprocess_0503(df_train2, df_test, _df2_mean))

pyod_0503(*pyod_preprocess_0503(df_train3, df_test, _df3_mean))

pyod_0503(*pyod_preprocess_0503(df_train4, df_test, _df4_mean))

pyod_0503(*pyod_preprocess_0503(df_train5, df_test, _df5_mean))

pyod_0503(*pyod_preprocess_0503(df_train6, df_test, _df6_mean))

pyod_0503(*pyod_preprocess_0503(df_train7, df_test, _df7_mean))

pyod_0503(*pyod_preprocess_0503(df_train8, df_test, _df8_mean))