[Pyod] df_train, df_test(0.00573)

Author

김보람

Published

May 3, 2024

ref: https://pyod.readthedocs.io/en/latest/pyod.models.html#all-models

1. Imports

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')
%run ../functions_pyod2.py
# Load the pickled `fraudTrain` object.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('../fraudTrain.pkl', 'rb') as pkl_file:
    fraudTrain = pickle.load(pkl_file)

# Eight training splits (df_train1 .. df_train8) and one shared test split.
(df_train1, df_train2, df_train3, df_train4,
 df_train5, df_train6, df_train7, df_train8) = (
    pd.read_csv(f'~/Dropbox/Data/df_train{k}.csv') for k in range(1, 9)
)
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')

# Each training split stacked on top of the shared test split.
(_df1, _df2, _df3, _df4, _df5, _df6, _df7, _df8) = (
    pd.concat([train_split, df_test])
    for train_split in (df_train1, df_train2, df_train3, df_train4,
                        df_train5, df_train6, df_train7, df_train8)
)

# Mean of `is_fraud` over each stacked (train + test) frame.
(_df1_mean, _df2_mean, _df3_mean, _df4_mean,
 _df5_mean, _df6_mean, _df7_mean, _df8_mean) = (
    stacked.is_fraud.mean()
    for stacked in (_df1, _df2, _df3, _df4, _df5, _df6, _df7, _df8)
)
# (rows, columns) of every training split, followed by the test split.
tuple(
    frame.shape
    for frame in (df_train1, df_train2, df_train3, df_train4,
                  df_train5, df_train6, df_train7, df_train8, df_test)
)
((734003, 22),
 (420500, 22),
 (84100, 22),
 (42050, 22),
 (21025, 22),
 (14017, 22),
 (10512, 22),
 (8410, 22),
 (314572, 22))
# Mean of `is_fraud` in every training split, followed by the test split.
tuple(
    frame.is_fraud.mean()
    for frame in (df_train1, df_train2, df_train3, df_train4,
                  df_train5, df_train6, df_train7, df_train8, df_test)
)
(0.005728859418830713,
 0.01,
 0.05,
 0.1,
 0.2,
 0.29999286580580725,
 0.4000190258751903,
 0.5,
 0.005725239372862174)
# Mean of `is_fraud` over each stacked (training split + test split) frame.
(
    _df1_mean,
    _df2_mean,
    _df3_mean,
    _df4_mean,
    _df5_mean,
    _df6_mean,
    _df7_mean,
    _df8_mean,
)
(0.005727773406766326,
 0.00817062818336163,
 0.015065015852630734,
 0.01684136144152632,
 0.017896465105468762,
 0.018278152950950883,
 0.01847522486495798,
 0.018595463524283085)

pyod_0503: 기존 함수와 동일하며, 결과 저장 위치만 `../results2/`로 변경

def pyod_0503(X,XX,y,yy,predictors,throw_rate):
    """Fit every detector on the training data, score it on the test data, and save a summary.

    Parameters
    ----------
    X : training features; must expose ``.columns`` (its column names are
        recorded in the ``train_cols`` result column).
    XX : test features passed to ``predictor.predict``.
    y : training labels, passed to ``predictor.fit`` and used for the
        ``train_size`` / ``train_frate`` result columns.
    yy : test labels, compared against the predictions by ``evaluate``.
    predictors : dict mapping a model name to an object providing
        ``fit(X, y)`` and ``predict(XX)``.
    throw_rate : value stored unchanged in the ``throw_rate`` result column.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``predictors`` with the columns model, time
        (seconds spent in ``fit`` only), acc, pre, rec, f1, auc, graph_based,
        method, throw_rate, train_size, train_cols, train_frate, test_size,
        test_frate and hyper_params.

    Side effects
    ------------
    Writes the same table to ``../results2/<YYYYmmdd-HHMMSS>-pyod.csv``.
    """
    # Local import so the function does not depend on a file-level `import os`.
    import os

    # Create the output directory before the (potentially hours-long) fitting
    # so that a missing or unwritable directory surfaces immediately instead
    # of after all the work is done.
    out_dir = '../results2'
    os.makedirs(out_dir, exist_ok=True)

    # Values that are identical for every row; computed once, outside the loop.
    train_cols = list(X.columns)
    train_frate = np.array(y).mean()
    test_frate = np.array(yy).mean()

    names = []
    fit_seconds = []
    all_scores = []
    for name, predictor in predictors.items():
        # perf_counter is monotonic, so the measured duration is not affected
        # by system clock adjustments during a long fit.
        started = time.perf_counter()
        predictor.fit(X,y)
        elapsed = time.perf_counter() - started
        yyhat = predictor.predict(XX)
        # `evaluate` is not defined in this block; it is expected to return a
        # mapping with the keys 'acc', 'pre', 'rec', 'f1' and 'auc'.
        all_scores.append(evaluate(yy,yyhat))
        names.append(name)
        fit_seconds.append(elapsed)

    n_models = len(names)
    df_results = pd.DataFrame(dict(
        model = names,
        time = fit_seconds,
        acc = [s['acc'] for s in all_scores],
        pre = [s['pre'] for s in all_scores],
        rec = [s['rec'] for s in all_scores],
        f1 = [s['f1'] for s in all_scores],
        auc = [s['auc'] for s in all_scores],
        graph_based = [False] * n_models,
        method = ['pyod'] * n_models,
        throw_rate = throw_rate,
        train_size = [len(y)] * n_models,
        train_cols = [train_cols for _ in range(n_models)],
        train_frate = train_frate,
        test_size = [len(yy)] * n_models,
        test_frate = test_frate,
        hyper_params = [None] * n_models,
    ))
    ymdhms = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
    df_results.to_csv(f'{out_dir}/{ymdhms}-pyod.csv',index=False)
    return df_results
def pyod_preprocess_0503(df_tr, df_tstn, _df_mean):
    """Build the argument tuple for ``pyod_0503`` from a train and a test frame.

    Parameters
    ----------
    df_tr : training frame; must contain the columns ``amt`` and ``is_fraud``.
    df_tstn : test frame; must contain the columns ``amt`` and ``is_fraud``.
    _df_mean : value returned unchanged as ``throw_rate``.

    Returns
    -------
    tuple
        ``(X, XX, y, yy, predictors, throw_rate)`` where ``X`` / ``XX`` are
        one-column frames holding ``amt``, ``y`` / ``yy`` are one-column frames
        holding ``is_fraud``, and ``predictors`` maps a detector name to a
        detector instance whose ``contamination`` is the mean of
        ``df_tr.is_fraud``.
    """
    X = pd.DataFrame(df_tr['amt'])
    y = pd.DataFrame(df_tr['is_fraud'])
    XX = pd.DataFrame(df_tstn['amt'])
    yy = pd.DataFrame(df_tstn['is_fraud'])
    throw_rate = _df_mean
    fraud_ratio = df_tr.is_fraud.mean()

    # Detectors disabled (commented out) in the original notebook; the reason
    # is not recorded here: ALAD, AnoGAN, AutoEncoder, CBLOF (n_clusters=2),
    # COF, CD, DeepSVDD, DIF, FeatureBagging, KPCA, PyODKernelPCA, LMDD, LOCI,
    # LUNAR, LSCP, MO_GAAL, QMCD, RGraph, Sampling, SOD, SO_GAAL, SOS, SUOD,
    # VAE, XGBOD.
    #
    # 'LODA' used to appear twice in this literal; the second entry silently
    # replaced the first, so one detector was built and thrown away. It is
    # listed once now; the resulting keys and their order are unchanged.
    predictors = {
        'ABOD': ABOD(contamination=fraud_ratio),
        'COPOD': COPOD(contamination=fraud_ratio),
        'ECOD': ECOD(contamination=fraud_ratio),
        'GMM': GMM(contamination=fraud_ratio),
        'HBOS': HBOS(contamination=fraud_ratio),
        'IForest': IForest(contamination=fraud_ratio),
        'INNE': INNE(contamination=fraud_ratio),
        'KDE': KDE(contamination=fraud_ratio),
        'KNN': KNN(contamination=fraud_ratio),
        'LODA': LODA(contamination=fraud_ratio),
        'LOF': LOF(contamination=fraud_ratio),
        'MAD': MAD(contamination=fraud_ratio),
        'MCD': MCD(contamination=fraud_ratio),
        'OCSVM': OCSVM(contamination=fraud_ratio),
        'PCA': PCA(contamination=fraud_ratio),
        'ROD': ROD(contamination=fraud_ratio),
    }
    return X, XX, y, yy, predictors, throw_rate
# Benchmark with df_train1 as the training frame and df_test as the test
# frame; _df1_mean is recorded as throw_rate.
pyod_0503(*pyod_preprocess_0503(df_train1, df_test, _df1_mean))

→ 실행에 9시간 이상 소요

# Benchmark with df_train2 as the training frame and df_test as the test
# frame; _df2_mean is recorded as throw_rate.
pyod_0503(*pyod_preprocess_0503(df_train2, df_test, _df2_mean))

→ 실행에 4시간 이상 소요

# Same benchmark for the remaining training frames (df_train3 .. df_train8),
# each evaluated on df_test with its matching _dfN_mean recorded as throw_rate.
pyod_0503(*pyod_preprocess_0503(df_train3, df_test, _df3_mean))
pyod_0503(*pyod_preprocess_0503(df_train4, df_test, _df4_mean))
pyod_0503(*pyod_preprocess_0503(df_train5, df_test, _df5_mean))
pyod_0503(*pyod_preprocess_0503(df_train6, df_test, _df6_mean))
pyod_0503(*pyod_preprocess_0503(df_train7, df_test, _df7_mean))
pyod_0503(*pyod_preprocess_0503(df_train8, df_test, _df8_mean))