import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
ref: https://pyod.readthedocs.io/en/latest/pyod.models.html#all-models
1. Imports
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')
%run ../functions_pyod2.py
# Load the pickled fraudTrain object from the parent directory.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)
# Load the eight training splits and the single shared test split from CSV.
df_train1 = pd.read_csv('~/Dropbox/Data/df_train1.csv')
df_train2 = pd.read_csv('~/Dropbox/Data/df_train2.csv')
df_train3 = pd.read_csv('~/Dropbox/Data/df_train3.csv')
df_train4 = pd.read_csv('~/Dropbox/Data/df_train4.csv')
df_train5 = pd.read_csv('~/Dropbox/Data/df_train5.csv')
df_train6 = pd.read_csv('~/Dropbox/Data/df_train6.csv')
df_train7 = pd.read_csv('~/Dropbox/Data/df_train7.csv')
df_train8 = pd.read_csv('~/Dropbox/Data/df_train8.csv')
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')
# Stack each training split on top of the test split ...
_df1 = pd.concat([df_train1, df_test])
_df2 = pd.concat([df_train2, df_test])
_df3 = pd.concat([df_train3, df_test])
_df4 = pd.concat([df_train4, df_test])
_df5 = pd.concat([df_train5, df_test])
_df6 = pd.concat([df_train6, df_test])
_df7 = pd.concat([df_train7, df_test])
_df8 = pd.concat([df_train8, df_test])

# ... and record the mean of `is_fraud` over each combined frame.
_df1_mean = _df1.is_fraud.mean()
_df2_mean = _df2.is_fraud.mean()
_df3_mean = _df3.is_fraud.mean()
_df4_mean = _df4.is_fraud.mean()
_df5_mean = _df5.is_fraud.mean()
_df6_mean = _df6.is_fraud.mean()
_df7_mean = _df7.is_fraud.mean()
_df8_mean = _df8.is_fraud.mean()
# Display the shape of every training split and of the test split.
df_train1.shape, df_train2.shape, df_train3.shape, df_train4.shape,df_train5.shape,df_train6.shape,df_train7.shape,df_train8.shape,df_test.shape
((734003, 22),
(420500, 22),
(84100, 22),
(42050, 22),
(21025, 22),
(14017, 22),
(10512, 22),
(8410, 22),
(314572, 22))
# Display the mean of `is_fraud` for every training split and for the test split.
df_train1.is_fraud.mean(),df_train2.is_fraud.mean(),df_train3.is_fraud.mean(),df_train4.is_fraud.mean(),df_train5.is_fraud.mean(),df_train6.is_fraud.mean(),df_train7.is_fraud.mean(),df_train8.is_fraud.mean(),df_test.is_fraud.mean()
(0.005728859418830713,
0.01,
0.05,
0.1,
0.2,
0.29999286580580725,
0.4000190258751903,
0.5,
0.005725239372862174)
# Display the `is_fraud` mean of each combined (training split + test) frame.
_df1_mean, _df2_mean,_df3_mean,_df4_mean,_df5_mean,_df6_mean,_df7_mean,_df8_mean
(0.005727773406766326,
0.00817062818336163,
0.015065015852630734,
0.01684136144152632,
0.017896465105468762,
0.018278152950950883,
0.01847522486495798,
0.018595463524283085)
pyod_0503: 기존 함수와 동일하며, 결과 저장 위치만 results2로 변경
def pyod_0503(X, XX, y, yy, predictors, throw_rate):
    """Benchmark every detector in ``predictors`` and save the scores as CSV.

    Each predictor is fitted on ``(X, y)`` and used to predict ``XX``; the
    predictions are scored against ``yy`` with ``evaluate``.  One row per
    predictor is collected into a DataFrame, which is written to
    ``../results2/<YYYYmmdd-HHMMSS>-pyod.csv`` and also returned.

    Parameters
    ----------
    X, y
        Training features and labels.  ``X`` must expose ``.columns``.
    XX, yy
        Test features and labels.
    predictors : dict
        Maps a display name to an object providing ``fit(X, y)`` and
        ``predict(XX)``.
    throw_rate
        Value copied unchanged into the ``throw_rate`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns: model, time, acc, pre, rec, f1, auc, graph_based, method,
        throw_rate, train_size, train_cols, train_frate, test_size,
        test_frate, hyper_params.  ``time`` is the wall-clock duration of the
        ``fit`` call only, in seconds.

    Notes
    -----
    ``evaluate`` is not defined in this block; it must already be in scope
    (presumably provided by ``functions_pyod2.py`` -- TODO confirm) and must
    return a mapping with the keys ``'acc'``, ``'pre'``, ``'rec'``, ``'f1'``
    and ``'auc'``.
    """
    import os  # local import: only needed to prepare the output directory

    # Create the output directory up front so that a missing folder cannot
    # throw away the (potentially very long) fitting work at the final write.
    os.makedirs('../results2', exist_ok=True)

    model = []
    time_diff = []
    acc = []
    pre = []
    rec = []
    f1 = []
    auc = []

    for name, predictor in predictors.items():
        t1 = time.time()
        predictor.fit(X, y)
        t2 = time.time()
        yyhat = predictor.predict(XX)
        scores = evaluate(yy, yyhat)

        model.append(name)
        time_diff.append(t2 - t1)
        acc.append(scores['acc'])
        pre.append(scores['pre'])
        rec.append(scores['rec'])
        f1.append(scores['f1'])
        auc.append(scores['auc'])

    # Everything below is identical for every predictor, so it is built once
    # after the loop instead of being appended on each iteration.
    n_rows = len(model)
    columns = list(X.columns)
    df_results = pd.DataFrame(dict(
        model=model,
        time=time_diff,
        acc=acc,
        pre=pre,
        rec=rec,
        f1=f1,
        auc=auc,
        graph_based=[False] * n_rows,
        method=['pyod'] * n_rows,
        throw_rate=throw_rate,
        train_size=[len(y)] * n_rows,
        # One fresh list per row: each cell holds the full column-name list.
        train_cols=[list(columns) for _ in range(n_rows)],
        train_frate=np.array(y).mean(),
        test_size=[len(yy)] * n_rows,
        test_frate=np.array(yy).mean(),
        hyper_params=[None] * n_rows,
    ))

    ymdhms = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'../results2/{ymdhms}-pyod.csv', index=False)
    return df_results
def pyod_preprocess_0503(df_tr, df_tstn, _df_mean):
    """Build the argument tuple expected by ``pyod_0503``.

    Parameters
    ----------
    df_tr
        Training frame; must contain the columns ``amt`` and ``is_fraud``.
    df_tstn
        Test frame; must contain the columns ``amt`` and ``is_fraud``.
    _df_mean
        Value returned unchanged as ``throw_rate``.

    Returns
    -------
    tuple
        ``(X, XX, y, yy, predictors, throw_rate)`` where ``X``/``XX`` are
        single-column ``amt`` frames, ``y``/``yy`` are single-column
        ``is_fraud`` frames, and ``predictors`` maps a detector name to a
        detector instance whose ``contamination`` is the mean of
        ``df_tr.is_fraud``.

    Notes
    -----
    The detector classes (``ABOD``, ``COPOD``, ...) are not imported in this
    block; they must already be in scope (presumably provided by
    ``functions_pyod2.py`` -- TODO confirm).
    """
    X = pd.DataFrame(df_tr['amt'])
    y = pd.DataFrame(df_tr['is_fraud'])
    XX = pd.DataFrame(df_tstn['amt'])
    yy = pd.DataFrame(df_tstn['is_fraud'])
    throw_rate = _df_mean
    fraud_ratio = df_tr.is_fraud.mean()

    # Commented-out entries are detectors the original author disabled; the
    # varying number of '#' is kept as found (its meaning is not documented).
    # The original literal listed 'LODA' twice; the redundant second entry
    # was dropped because a repeated key only overwrites the first one.
    predictors = {
        'ABOD': ABOD(contamination=fraud_ratio),
        # 'ALAD': ALAD(contamination=fraud_ratio),
        # 'AnoGAN': AnoGAN(contamination=fraud_ratio),
        # 'AutoEncoder':AutoEncoder(contamination=fraud_ratio),
        ## 'CBLOF': CBLOF(contamination=fraud_ratio,n_clusters=2),
        ## 'COF': COF(contamination=fraud_ratio),
        ## 'CD': CD(contamination=fraud_ratio),
        'COPOD': COPOD(contamination=fraud_ratio),
        # 'DeepSVDD': DeepSVDD(contamination=fraud_ratio),
        # 'DIF': DIF(contamination=fraud_ratio),
        'ECOD': ECOD(contamination=fraud_ratio),
        # 'FeatureBagging': FeatureBagging(contamination=fraud_ratio),
        'GMM': GMM(contamination=fraud_ratio),
        'HBOS': HBOS(contamination=fraud_ratio),
        'IForest': IForest(contamination=fraud_ratio),
        'INNE': INNE(contamination=fraud_ratio),
        'KDE': KDE(contamination=fraud_ratio),
        'KNN': KNN(contamination=fraud_ratio),
        #### 'KPCA': KPCA(contamination=fraud_ratio),
        # 'PyODKernelPCA': PyODKernelPCA(contamination=fraud_ratio),
        ## 'LMDD': LMDD(contamination=fraud_ratio),
        'LODA': LODA(contamination=fraud_ratio),
        'LOF': LOF(contamination=fraud_ratio),
        #### 'LOCI': LOCI(contamination=fraud_ratio),
        # 'LUNAR': LUNAR(contamination=fraud_ratio),
        # 'LSCP': LSCP(contamination=fraud_ratio),
        'MAD': MAD(contamination=fraud_ratio),
        'MCD': MCD(contamination=fraud_ratio),
        # 'MO_GAAL': MO_GAAL(contamination=fraud_ratio),
        'OCSVM': OCSVM(contamination=fraud_ratio),
        'PCA': PCA(contamination=fraud_ratio),
        ### 'QMCD': QMCD(contamination=fraud_ratio),
        #### 'RGraph': RGraph(contamination=fraud_ratio),
        'ROD': ROD(contamination=fraud_ratio),
        ## 'Sampling': Sampling(contamination=fraud_ratio),
        ## 'SOD': SOD(contamination=fraud_ratio),
        # 'SO_GAAL': SO_GAAL(contamination=fraud_ratio),
        #### 'SOS': SOS(contamination=fraud_ratio),
        # 'SUOD': SUOD(contamination=fraud_ratio),
        # 'VAE': VAE(contamination=fraud_ratio),
        # 'XGBOD': XGBOD(contamination=fraud_ratio),
    }
    return X, XX, y, yy, predictors, throw_rate
# Run the benchmark once per training split, always against the same test
# split; each call fits every configured detector and writes its own CSV.
pyod_0503(*pyod_preprocess_0503(df_train1, df_test, _df1_mean))
# NOTE: the notebook records "more than 9 hours" after the run above.
pyod_0503(*pyod_preprocess_0503(df_train2, df_test, _df2_mean))
# NOTE: the notebook records "more than 4 hours" after the run above.
pyod_0503(*pyod_preprocess_0503(df_train3, df_test, _df3_mean))
pyod_0503(*pyod_preprocess_0503(df_train4, df_test, _df4_mean))
pyod_0503(*pyod_preprocess_0503(df_train5, df_test, _df5_mean))
pyod_0503(*pyod_preprocess_0503(df_train6, df_test, _df6_mean))
pyod_0503(*pyod_preprocess_0503(df_train7, df_test, _df7_mean))
pyod_0503(*pyod_preprocess_0503(df_train8, df_test, _df8_mean))