[GNN] 신용카드거래 사기탐지 – pyod 사용(교수님)
ref: https://pyod.readthedocs.io/en/latest/pyod.models.html#all-models
1. Imports
import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
/tmp/ipykernel_3458664/761229760.py:1: DeprecationWarning:
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
import pandas as pd
from pyod.models.abod import ABOD
#from pyod.models.alad import ALAD
#from pyod.models.anogan import AnoGAN
#from pyod.models.auto_encoder import AutoEncoder
from pyod.models.cblof import CBLOF
from pyod.models.cof import COF
from pyod.models.cd import CD
from pyod.models.copod import COPOD
#from pyod.models.deep_svdd import DeepSVDD
#from pyod.models.dif import DIF
from pyod.models.ecod import ECOD
#from pyod.models.feature_bagging import FeatureBagging
from pyod.models.gmm import GMM
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.kde import KDE
from pyod.models.knn import KNN
from pyod.models.kpca import KPCA
from pyod.models.kpca import PyODKernelPCA
from pyod.models.lmdd import LMDD
from pyod.models.loda import LODA
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
#from pyod.models.lunar import LUNAR
from pyod.models.lscp import LSCP
from pyod.models.mad import MAD
from pyod.models.mcd import MCD
#from pyod.models.mo_gaal import MO_GAAL
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.qmcd import QMCD
from pyod.models.rgraph import RGraph
from pyod.models.rod import ROD
from pyod.models.sampling import Sampling
from pyod.models.sod import SOD
#from pyod.models.so_gaal import SO_GAAL
from pyod.models.sos import SOS
#from pyod.models.suod import SUOD
#from pyod.models.vae import VAE
#from pyod.models.xgbod import XGBOD
import warnings
warnings.filterwarnings('ignore' )
2. Data
# fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
# fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
# with open('fraudTrain.pkl', 'wb') as file:
# pickle.dump(fraudTrain, file)
# Load the cached training frame that the commented-out lines above wrote to disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a fraudTrain.pkl that you produced yourself.
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)

# `throw` and `split_dataframe` are helpers defined outside this section.
# NOTE(review): presumably 0.3 is the target fraud rate after down-sampling
# and 0.05 the fraud rate of the test split -- confirm against the helpers.
df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.05)

# One-column DataFrames: `amt` is the only feature, `is_fraud` is the label.
X, y = (pd.DataFrame(df_tr[column]) for column in ('amt', 'is_fraud'))
XX, yy = (pd.DataFrame(df_tst[column]) for column in ('amt', 'is_fraud'))
3. Predictor 만들기
-
is_fraud가 1인 거래의 비율(즉 사기거래 비율)을 계산해서 fraud_ratio로 설정해야 함.
사기거래 비율을 처음에 throw에서 설정한 값으로 봐야 하는지는 확인이 필요함. 아래 코드에서는 학습 데이터(df_tr)의 is_fraud 평균을 사용함.
# Share of training rows labelled as fraud; handed to every detector as its
# `contamination` argument.
fraud_ratio = df_tr.is_fraud.mean()

# Detector name -> detector instance. Insertion order is the order in which
# the detectors are listed here.
#
# Legend for the disabled entries (number of leading '#'):
#   #    : could not be constructed (e.g. tensorflow is not installed)
#   ##   : constructs, but .fit raises an error
#   ###  : fits, but produces nan at test time
#   #### : takes too long to run
#
# Fix: 'LODA' used to be listed twice; the second entry only replaced the
# first with an equivalent object, so it has been removed.
predictors = {
    'ABOD': ABOD(contamination=fraud_ratio),
    # 'ALAD': ALAD(contamination=fraud_ratio),
    # 'AnoGAN': AnoGAN(contamination=fraud_ratio),
    # 'AutoEncoder':AutoEncoder(contamination=fraud_ratio),
    ## 'CBLOF': CBLOF(contamination=fraud_ratio,n_clusters=2),
    ## 'COF': COF(contamination=fraud_ratio),
    ## 'CD': CD(contamination=fraud_ratio),
    'COPOD': COPOD(contamination=fraud_ratio),
    # 'DeepSVDD': DeepSVDD(contamination=fraud_ratio),
    # 'DIF': DIF(contamination=fraud_ratio),
    'ECOD': ECOD(contamination=fraud_ratio),
    # 'FeatureBagging': FeatureBagging(contamination=fraud_ratio),
    'GMM': GMM(contamination=fraud_ratio),
    'HBOS': HBOS(contamination=fraud_ratio),
    'IForest': IForest(contamination=fraud_ratio),
    'INNE': INNE(contamination=fraud_ratio),
    'KDE': KDE(contamination=fraud_ratio),
    'KNN': KNN(contamination=fraud_ratio),
    #### 'KPCA': KPCA(contamination=fraud_ratio),
    # 'PyODKernelPCA': PyODKernelPCA(contamination=fraud_ratio),
    ## 'LMDD': LMDD(contamination=fraud_ratio),
    'LODA': LODA(contamination=fraud_ratio),
    'LOF': LOF(contamination=fraud_ratio),
    #### 'LOCI': LOCI(contamination=fraud_ratio),
    # 'LUNAR': LUNAR(contamination=fraud_ratio),
    # 'LSCP': LSCP(contamination=fraud_ratio),
    'MAD': MAD(contamination=fraud_ratio),
    'MCD': MCD(contamination=fraud_ratio),
    # 'MO_GAAL': MO_GAAL(contamination=fraud_ratio),
    'OCSVM': OCSVM(contamination=fraud_ratio),
    'PCA': PCA(contamination=fraud_ratio),
    ### 'QMCD': QMCD(contamination=fraud_ratio),
    #### 'RGraph': RGraph(contamination=fraud_ratio),
    'ROD': ROD(contamination=fraud_ratio),
    ## 'Sampling': Sampling(contamination=fraud_ratio),
    ## 'SOD': SOD(contamination=fraud_ratio),
    # 'SO_GAAL': SO_GAAL(contamination=fraud_ratio),
    #### 'SOS': SOS(contamination=fraud_ratio),
    # 'SUOD': SUOD(contamination=fraud_ratio),
    # 'VAE': VAE(contamination=fraud_ratio),
    # 'XGBOD': XGBOD(contamination=fraud_ratio),
}
주석 처리한 모형은 만들어지지 않거나 여기서 사용하지 못한 것:
- `#` 은 tensorflow 등이 없어서 만들어지지 않는 것
- `##` 은 만들어지는데 `.fit` 할 때 오류가 나는 것
- `###` 은 fit은 되는데 test할 때 nan이 출력되는 것
- `####` 은 시간이 오래 걸리는 것
4. 학습 & 결과저장
# Call the `pyod` helper (defined outside this section) with the train
# features/labels (X, y), the test features/labels (XX, yy) and the detector
# dict built above.
# NOTE(review): presumably it fits each detector and returns the per-model
# results table shown below -- confirm against the helper's definition.
pyod(X,XX,y,yy,predictors)
0
ABOD
1.925603
0.950050
0.000000
0.000000
0.000000
0.500000
False
True
14014
[amt]
0.407164
6006
0.04995
None
1
COPOD
0.085152
0.745921
0.134248
0.750000
0.227733
0.747853
False
True
14014
[amt]
0.407164
6006
0.04995
None
2
ECOD
0.005345
0.720613
0.123497
0.753333
0.212207
0.736113
False
True
14014
[amt]
0.407164
6006
0.04995
None
3
GMM
0.099428
0.711455
0.101723
0.610000
0.174369
0.663395
False
True
14014
[amt]
0.407164
6006
0.04995
None
4
HBOS
1.349680
0.950050
0.000000
0.000000
0.000000
0.500000
False
True
14014
[amt]
0.407164
6006
0.04995
None
5
IForest
0.181929
0.838495
0.201426
0.753333
0.317862
0.798153
False
True
14014
[amt]
0.407164
6006
0.04995
None
6
INNE
0.561864
0.792374
0.154632
0.706667
0.253740
0.751774
False
True
14014
[amt]
0.407164
6006
0.04995
None
7
KDE
4.642007
0.831002
0.193133
0.750000
0.307167
0.792631
False
True
14014
[amt]
0.407164
6006
0.04995
None
8
KNN
0.018220
0.806860
0.168721
0.730000
0.274093
0.770450
False
True
14014
[amt]
0.407164
6006
0.04995
None
9
LODA
0.157484
0.950050
0.000000
0.000000
0.000000
0.500000
False
True
14014
[amt]
0.407164
6006
0.04995
None
10
LOF
0.048183
0.521479
0.052503
0.503333
0.095088
0.512883
False
True
14014
[amt]
0.407164
6006
0.04995
None
11
MAD
0.002226
0.964369
0.695455
0.510000
0.588462
0.749129
False
True
14014
[amt]
0.407164
6006
0.04995
None
12
MCD
0.008822
0.840493
0.204668
0.760000
0.322489
0.802362
False
True
14014
[amt]
0.407164
6006
0.04995
None
13
OCSVM
9.001019
0.807193
0.165888
0.710000
0.268939
0.761151
False
True
14014
[amt]
0.407164
6006
0.04995
None
14
PCA
0.004164
0.444555
0.022341
0.236667
0.040828
0.346076
False
True
14014
[amt]
0.407164
6006
0.04995
None
15
ROD
5.310971
0.575924
0.027743
0.220000
0.049272
0.407319
False
True
14014
[amt]
0.407164
6006
0.04995
None