[Pyod] 신용카드거래 사기탐지

Author

신록예찬,김보람

Published

January 31, 2024

ref: https://pyod.readthedocs.io/en/latest/pyod.models.html#all-models

1. Imports

import pandas as pd
import numpy as np
import sklearn
import pickle 
import time 
import datetime
from pyod.models.abod import ABOD
#from pyod.models.alad import ALAD
#from pyod.models.anogan import AnoGAN
#from pyod.models.auto_encoder import AutoEncoder
from pyod.models.cblof import CBLOF
from pyod.models.cof import COF
from pyod.models.cd import CD
from pyod.models.copod import COPOD
#from pyod.models.deep_svdd import DeepSVDD
#from pyod.models.dif import DIF
from pyod.models.ecod import ECOD
#from pyod.models.feature_bagging import FeatureBagging
from pyod.models.gmm import GMM
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.kde import KDE
from pyod.models.knn import KNN
from pyod.models.kpca import KPCA
from pyod.models.kpca import PyODKernelPCA
from pyod.models.lmdd import LMDD
from pyod.models.loda import LODA
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
#from pyod.models.lunar import LUNAR
from pyod.models.lscp import LSCP
from pyod.models.mad import MAD
from pyod.models.mcd import MCD
#from pyod.models.mo_gaal import MO_GAAL
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.qmcd import QMCD
from pyod.models.rgraph import RGraph
from pyod.models.rod import ROD
from pyod.models.sampling import Sampling
from pyod.models.sod import SOD
#from pyod.models.so_gaal import SO_GAAL
from pyod.models.sos import SOS
#from pyod.models.suod import SUOD
#from pyod.models.vae import VAE
#from pyod.models.xgbod import XGBOD
import warnings
warnings.filterwarnings('ignore')
%run functions.py

2. Data

# One-time preprocessing, kept for reference: read the raw CSV, drop its first
# column (`.iloc[:, 1:]`), convert `trans_date_trans_time` to datetimes, and
# cache the resulting DataFrame as `fraudTrain.pkl`.
# fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
# fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
# with open('fraudTrain.pkl', 'wb') as file:
#     pickle.dump(fraudTrain, file)

# Load the cached DataFrame from the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# file that was produced locally by the snippet above.
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)    
# `throw` and `split_dataframe` are not defined in this notebook; presumably
# they come from functions.py (loaded with %run) -- confirm.
# NOTE(review): the recorded results list throw_rate=0.3 and test_frate ~= 0.05,
# which suggests 0.3 is the "throw" rate and 0.05 the target fraud rate of the
# test split -- verify against the helper definitions.
df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.05) 
# Use only the transaction amount (`amt`) as the feature and `is_fraud` as the
# label. Each is wrapped in a DataFrame so it stays a single-column 2-D table.
X = pd.DataFrame(df_tr['amt'])
y = pd.DataFrame(df_tr['is_fraud'])
XX = pd.DataFrame(df_tst['amt'])
yy = pd.DataFrame(df_tst['is_fraud'])

3. Predictor 만들기

- `is_fraud`가 1인 거래의 비율(즉, 사기거래 비율)을 계산해서 `fraud_ratio`로 설정해야 함.

# Share of training rows labelled as fraud (mean of the `is_fraud` column).
# It is passed to every detector as its expected outlier proportion.
# NOTE(review): PyOD detectors are documented to accept contamination only in
# (0, 0.5]; confirm the training fraud rate stays inside that range.
fraud_ratio = df_tr.is_fraud.mean()

# Detectors to benchmark, keyed by the model name used in the results.
#
# Legend for the commented-out entries:
#   #     the detector cannot be constructed (e.g. tensorflow is not installed)
#   ##    the detector is constructed but raises an error in .fit
#   ###   .fit succeeds but testing outputs nan
#   ####  the detector takes too long to run
#
# Each key must appear only once: a repeated key in a dict literal silently
# replaces the earlier value, so the duplicate 'LODA' entry was removed.
predictors = {
    'ABOD': ABOD(contamination=fraud_ratio),
#    'ALAD': ALAD(contamination=fraud_ratio),
#    'AnoGAN': AnoGAN(contamination=fraud_ratio),
#    'AutoEncoder':AutoEncoder(contamination=fraud_ratio),
##    'CBLOF': CBLOF(contamination=fraud_ratio,n_clusters=2),
##    'COF': COF(contamination=fraud_ratio),
##    'CD': CD(contamination=fraud_ratio),
    'COPOD': COPOD(contamination=fraud_ratio),
#    'DeepSVDD': DeepSVDD(contamination=fraud_ratio),
#    'DIF': DIF(contamination=fraud_ratio),    
    'ECOD': ECOD(contamination=fraud_ratio),
#    'FeatureBagging': FeatureBagging(contamination=fraud_ratio),
    'GMM': GMM(contamination=fraud_ratio),
    'HBOS': HBOS(contamination=fraud_ratio),
    'IForest': IForest(contamination=fraud_ratio),
    'INNE': INNE(contamination=fraud_ratio),
    'KDE': KDE(contamination=fraud_ratio),
    'KNN': KNN(contamination=fraud_ratio),
####    'KPCA': KPCA(contamination=fraud_ratio),
#    'PyODKernelPCA': PyODKernelPCA(contamination=fraud_ratio),
##    'LMDD': LMDD(contamination=fraud_ratio),
    'LODA': LODA(contamination=fraud_ratio),
    'LOF': LOF(contamination=fraud_ratio),
####    'LOCI': LOCI(contamination=fraud_ratio),
#    'LUNAR': LUNAR(contamination=fraud_ratio),
#    'LSCP': LSCP(contamination=fraud_ratio),
    'MAD': MAD(contamination=fraud_ratio),
    'MCD': MCD(contamination=fraud_ratio),
#    'MO_GAAL': MO_GAAL(contamination=fraud_ratio),
    'OCSVM': OCSVM(contamination=fraud_ratio),
    'PCA': PCA(contamination=fraud_ratio),
###    'QMCD': QMCD(contamination=fraud_ratio),
####    'RGraph': RGraph(contamination=fraud_ratio),
    'ROD': ROD(contamination=fraud_ratio),
##    'Sampling': Sampling(contamination=fraud_ratio),
##   'SOD': SOD(contamination=fraud_ratio),
#    'SO_GAAL': SO_GAAL(contamination=fraud_ratio),
####    'SOS': SOS(contamination=fraud_ratio),
#    'SUOD': SUOD(contamination=fraud_ratio),
#    'VAE': VAE(contamination=fraud_ratio),
#    'XGBOD': XGBOD(contamination=fraud_ratio),  
}
  • 주석 처리된 모델은 predictor가 만들어지지 않는 것 (이유는 `#` 개수로 구분)
  • `#`은 tensorflow 등 필요한 패키지가 없어서 만들 수 없는 것
  • `##`은 만들어지기는 하는데 `.fit` 할 때 오류가 나는 것
  • `###`은 fit은 되는데 test할 때 nan이 출력되는 것
  • `####`은 시간이 오래 걸리는 것

4. 학습 & 결과저장

# Run the benchmark over every detector in `predictors`, using the training
# features/labels (X, y) and the test features/labels (XX, yy).
# NOTE(review): `pyod` here is a project helper (presumably from functions.py),
# not the PyOD package itself; `throw_rate` appears to be recorded in the
# results table as metadata -- confirm against the helper.
# Left as a bare expression so the notebook displays the returned value.
pyod(
    X,
    XX,
    y,
    yy,
    predictors,
    throw_rate=0.3,
)
model time acc pre rec f1 auc graph_based pyod throw_rate train_size train_cols train_frate test_size test_frate hyper_params
0 ABOD 0.786525 0.950050 0.000000 0.000000 0.000000 0.500000 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
1 COPOD 0.004162 0.739927 0.128386 0.726667 0.218218 0.733645 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
2 ECOD 0.004062 0.716450 0.114349 0.693333 0.196319 0.705499 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
3 GMM 0.084714 0.707959 0.104031 0.636667 0.178839 0.674187 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
4 HBOS 0.002090 0.949883 0.000000 0.000000 0.000000 0.499912 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
5 IForest 0.189287 0.817349 0.176805 0.726667 0.284410 0.774392 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
6 INNE 0.434896 0.657010 0.071984 0.493333 0.125637 0.579474 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
7 KDE 4.614658 0.835165 0.195230 0.736667 0.308659 0.788505 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
8 KNN 0.015593 0.808192 0.164038 0.693333 0.265306 0.753782 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
9 LODA 0.056379 0.949883 0.000000 0.000000 0.000000 0.499912 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
10 LOF 0.037894 0.528472 0.050745 0.476667 0.091725 0.503931 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
11 MAD 0.001531 0.960872 0.641921 0.490000 0.555766 0.737815 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
12 MCD 0.005907 0.834998 0.195057 0.736667 0.308444 0.788417 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
13 OCSVM 8.670979 0.806860 0.167697 0.723333 0.272271 0.767292 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
14 PCA 0.002473 0.461705 0.025558 0.263333 0.046594 0.367734 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None
15 ROD 2.395536 0.600566 0.034177 0.256667 0.060321 0.437657 False True 0.3 14014 [amt] 0.407164 6006 0.04995 None