[GNN] 신용카드거래 사기탐지 – pyod 사용(교수님)

Author

신록예찬,김보람

Published

January 30, 2024

ref: https://pyod.readthedocs.io/en/latest/pyod.models.html#all-models

1. Imports

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
/tmp/ipykernel_3458664/761229760.py:1: DeprecationWarning: 
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
from pyod.models.abod import ABOD
#from pyod.models.alad import ALAD
#from pyod.models.anogan import AnoGAN
#from pyod.models.auto_encoder import AutoEncoder
from pyod.models.cblof import CBLOF
from pyod.models.cof import COF
from pyod.models.cd import CD
from pyod.models.copod import COPOD
#from pyod.models.deep_svdd import DeepSVDD
#from pyod.models.dif import DIF
from pyod.models.ecod import ECOD
#from pyod.models.feature_bagging import FeatureBagging
from pyod.models.gmm import GMM
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.kde import KDE
from pyod.models.knn import KNN
from pyod.models.kpca import KPCA
from pyod.models.kpca import PyODKernelPCA
from pyod.models.lmdd import LMDD
from pyod.models.loda import LODA
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
#from pyod.models.lunar import LUNAR
from pyod.models.lscp import LSCP
from pyod.models.mad import MAD
from pyod.models.mcd import MCD
#from pyod.models.mo_gaal import MO_GAAL
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.qmcd import QMCD
from pyod.models.rgraph import RGraph
from pyod.models.rod import ROD
from pyod.models.sampling import Sampling
from pyod.models.sod import SOD
#from pyod.models.so_gaal import SO_GAAL
from pyod.models.sos import SOS
#from pyod.models.suod import SUOD
#from pyod.models.vae import VAE
#from pyod.models.xgbod import XGBOD
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magic: execute functions.py and load its names into this session.
# NOTE(review): presumably this is where throw, split_dataframe and the
# pyod(...) helper used later come from - confirm against functions.py.
%run functions.py

2. Data

# One-time preprocessing that produced fraudTrain.pkl (kept for reference):
# read the raw CSV without its first column, parse trans_date_trans_time
# into datetimes, then pickle the resulting frame.
#
# fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
# fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
# with open('fraudTrain.pkl', 'wb') as file:
#     pickle.dump(fraudTrain, file)

# Load the cached frame. Unpickling can execute arbitrary code, so this file
# must come from a trusted source.
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)

# NOTE(review): throw and split_dataframe are not defined in the visible code
# (presumably they come from functions.py). The printed results report a test
# fraud rate of about 0.05, so 0.05 looks like the target test fraud rate and
# 0.3 the overall fraud rate after resampling - confirm against the helpers.
df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.05)

# Single-column frames: transaction amount as the only feature, is_fraud as
# the label, for the train (X, y) and test (XX, yy) splits.
X, y = (pd.DataFrame(df_tr[name]) for name in ('amt', 'is_fraud'))
XX, yy = (pd.DataFrame(df_tst[name]) for name in ('amt', 'is_fraud'))

3. Predictor 만들기

- is_fraud가 1인 거래의 비율(즉, 사기거래 비율)을 계산해서 fraud_ratio로 설정해야 함.

사기거래 비율은 처음에 throw에서 내가 설정한 값으로 봐야 하나?

# Share of fraudulent rows (is_fraud == 1) in the training split; handed to
# every detector below as its `contamination` argument.
fraud_ratio = df_tr.is_fraud.mean()

# Detectors to evaluate, keyed by display name (each name appears once; a
# repeated key in a dict literal silently overwrites the earlier entry).
# Disabled entries stay commented out; the number of leading '#' records why:
#   #     could not be constructed (e.g. tensorflow is not installed)
#   ##    constructs, but .fit raises an error
#   ###   fits, but produces nan at test time
#   ####  takes too long to run
predictors = {
    'ABOD': ABOD(contamination=fraud_ratio),
#    'ALAD': ALAD(contamination=fraud_ratio),
#    'AnoGAN': AnoGAN(contamination=fraud_ratio),
#    'AutoEncoder':AutoEncoder(contamination=fraud_ratio),
##    'CBLOF': CBLOF(contamination=fraud_ratio,n_clusters=2),
##    'COF': COF(contamination=fraud_ratio),
##    'CD': CD(contamination=fraud_ratio),
    'COPOD': COPOD(contamination=fraud_ratio),
#    'DeepSVDD': DeepSVDD(contamination=fraud_ratio),
#    'DIF': DIF(contamination=fraud_ratio),    
    'ECOD': ECOD(contamination=fraud_ratio),
#    'FeatureBagging': FeatureBagging(contamination=fraud_ratio),
    'GMM': GMM(contamination=fraud_ratio),
    'HBOS': HBOS(contamination=fraud_ratio),
    'IForest': IForest(contamination=fraud_ratio),
    'INNE': INNE(contamination=fraud_ratio),
    'KDE': KDE(contamination=fraud_ratio),
    'KNN': KNN(contamination=fraud_ratio),
####    'KPCA': KPCA(contamination=fraud_ratio),
#    'PyODKernelPCA': PyODKernelPCA(contamination=fraud_ratio),
##    'LMDD': LMDD(contamination=fraud_ratio),
    'LODA': LODA(contamination=fraud_ratio),
    'LOF': LOF(contamination=fraud_ratio),
####    'LOCI': LOCI(contamination=fraud_ratio),
#    'LUNAR': LUNAR(contamination=fraud_ratio),
#    'LSCP': LSCP(contamination=fraud_ratio),
    'MAD': MAD(contamination=fraud_ratio),
    'MCD': MCD(contamination=fraud_ratio),
#    'MO_GAAL': MO_GAAL(contamination=fraud_ratio),
    'OCSVM': OCSVM(contamination=fraud_ratio),
    'PCA': PCA(contamination=fraud_ratio),
###    'QMCD': QMCD(contamination=fraud_ratio),
####    'RGraph': RGraph(contamination=fraud_ratio),
    'ROD': ROD(contamination=fraud_ratio),
##    'Sampling': Sampling(contamination=fraud_ratio),
##   'SOD': SOD(contamination=fraud_ratio),
#    'SO_GAAL': SO_GAAL(contamination=fraud_ratio),
####    'SOS': SOS(contamination=fraud_ratio),
#    'SUOD': SUOD(contamination=fraud_ratio),
#    'VAE': VAE(contamination=fraud_ratio),
#    'XGBOD': XGBOD(contamination=fraud_ratio),  
}
  • 주석 처리한 것은 아래 이유로 사용하지 못한 모형
  • #은 tensorflow 등이 설치되어 있지 않아서 만들어지지 않는 것
  • ##은 만들어지기는 하는데 .fit 할 때 오류가 나는 것
  • ###은 fit은 되는데 test할 때 nan이 출력되는 것
  • ####은 시간이 오래 걸리는 것

4. 학습 & 결과저장

# Evaluate the detectors in `predictors` using the train (X, y) and test
# (XX, yy) frames; the rendered output lists one row per detector with its
# run time and acc/pre/rec/f1/auc scores.
# NOTE(review): `pyod` here is called as a function, so it is presumably a
# helper loaded by `%run functions.py` rather than the pyod package - confirm.
pyod(X,XX,y,yy,predictors)
model time acc pre rec f1 auc graph_based pyod train_size train_cols train_frate test_size test_frate hyper_params
0 ABOD 1.925603 0.950050 0.000000 0.000000 0.000000 0.500000 False True 14014 [amt] 0.407164 6006 0.04995 None
1 COPOD 0.085152 0.745921 0.134248 0.750000 0.227733 0.747853 False True 14014 [amt] 0.407164 6006 0.04995 None
2 ECOD 0.005345 0.720613 0.123497 0.753333 0.212207 0.736113 False True 14014 [amt] 0.407164 6006 0.04995 None
3 GMM 0.099428 0.711455 0.101723 0.610000 0.174369 0.663395 False True 14014 [amt] 0.407164 6006 0.04995 None
4 HBOS 1.349680 0.950050 0.000000 0.000000 0.000000 0.500000 False True 14014 [amt] 0.407164 6006 0.04995 None
5 IForest 0.181929 0.838495 0.201426 0.753333 0.317862 0.798153 False True 14014 [amt] 0.407164 6006 0.04995 None
6 INNE 0.561864 0.792374 0.154632 0.706667 0.253740 0.751774 False True 14014 [amt] 0.407164 6006 0.04995 None
7 KDE 4.642007 0.831002 0.193133 0.750000 0.307167 0.792631 False True 14014 [amt] 0.407164 6006 0.04995 None
8 KNN 0.018220 0.806860 0.168721 0.730000 0.274093 0.770450 False True 14014 [amt] 0.407164 6006 0.04995 None
9 LODA 0.157484 0.950050 0.000000 0.000000 0.000000 0.500000 False True 14014 [amt] 0.407164 6006 0.04995 None
10 LOF 0.048183 0.521479 0.052503 0.503333 0.095088 0.512883 False True 14014 [amt] 0.407164 6006 0.04995 None
11 MAD 0.002226 0.964369 0.695455 0.510000 0.588462 0.749129 False True 14014 [amt] 0.407164 6006 0.04995 None
12 MCD 0.008822 0.840493 0.204668 0.760000 0.322489 0.802362 False True 14014 [amt] 0.407164 6006 0.04995 None
13 OCSVM 9.001019 0.807193 0.165888 0.710000 0.268939 0.761151 False True 14014 [amt] 0.407164 6006 0.04995 None
14 PCA 0.004164 0.444555 0.022341 0.236667 0.040828 0.346076 False True 14014 [amt] 0.407164 6006 0.04995 None
15 ROD 5.310971 0.575924 0.027743 0.220000 0.049272 0.407319 False True 14014 [amt] 0.407164 6006 0.04995 None