import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
ref: https://pyod.readthedocs.io/en/latest/pyod.models.html#all-models
1. Imports
from pyod.models.abod import ABOD
#from pyod.models.alad import ALAD
#from pyod.models.anogan import AnoGAN
#from pyod.models.auto_encoder import AutoEncoder
from pyod.models.cblof import CBLOF
from pyod.models.cof import COF
from pyod.models.cd import CD
from pyod.models.copod import COPOD
#from pyod.models.deep_svdd import DeepSVDD
#from pyod.models.dif import DIF
from pyod.models.ecod import ECOD
#from pyod.models.feature_bagging import FeatureBagging
from pyod.models.gmm import GMM
from pyod.models.hbos import HBOS
from pyod.models.iforest import IForest
from pyod.models.inne import INNE
from pyod.models.kde import KDE
from pyod.models.knn import KNN
from pyod.models.kpca import KPCA
from pyod.models.kpca import PyODKernelPCA
from pyod.models.lmdd import LMDD
from pyod.models.loda import LODA
from pyod.models.lof import LOF
from pyod.models.loci import LOCI
#from pyod.models.lunar import LUNAR
from pyod.models.lscp import LSCP
from pyod.models.mad import MAD
from pyod.models.mcd import MCD
#from pyod.models.mo_gaal import MO_GAAL
from pyod.models.ocsvm import OCSVM
from pyod.models.pca import PCA
from pyod.models.qmcd import QMCD
from pyod.models.rgraph import RGraph
from pyod.models.rod import ROD
from pyod.models.sampling import Sampling
from pyod.models.sod import SOD
#from pyod.models.so_gaal import SO_GAAL
from pyod.models.sos import SOS
#from pyod.models.suod import SUOD
#from pyod.models.vae import VAE
#from pyod.models.xgbod import XGBOD
import warnings
warnings.filterwarnings('ignore')
%run functions.py
2. Data
# fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
# fraudTrain = fraudTrain.assign(trans_date_trans_time= list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
# with open('fraudTrain.pkl', 'wb') as file:
# pickle.dump(fraudTrain, file)
with open('fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)
df = throw(fraudTrain, 0.3)
df_tr, df_tst = split_dataframe(df, 0.05)
X = pd.DataFrame(df_tr['amt'])
y = pd.DataFrame(df_tr['is_fraud'])
XX = pd.DataFrame(df_tst['amt'])
yy = pd.DataFrame(df_tst['is_fraud'])
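throw and split_dataframe are defined in functions.py (loaded via %run above), so their implementations are not shown here. As a quick sanity check on the split, the quantities below are the same ones that later appear in the results table as train_size, train_frate, test_size, and test_frate:

```python
# Inspect the train/test split produced above; the printed numbers should
# match the train_size / train_frate / test_size / test_frate columns of
# the results table in section 4.
print(f"train: {len(df_tr)} rows, fraud rate {df_tr.is_fraud.mean():.6f}")
print(f"test : {len(df_tst)} rows, fraud rate {df_tst.is_fraud.mean():.6f}")
# train: 14014 rows, fraud rate 0.407164
# test :  6006 rows, fraud rate 0.049950
```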
3. Building the predictors
- fraud_ratio must be set to the proportion of rows where is_fraud is 1 (i.e., the fraud rate); a single-detector example is sketched after the notes below.
fraud_ratio = df_tr.is_fraud.mean()
predictors = {
'ABOD': ABOD(contamination=fraud_ratio),
# 'ALAD': ALAD(contamination=fraud_ratio),
# 'AnoGAN': AnoGAN(contamination=fraud_ratio),
# 'AutoEncoder':AutoEncoder(contamination=fraud_ratio),
## 'CBLOF': CBLOF(contamination=fraud_ratio,n_clusters=2),
## 'COF': COF(contamination=fraud_ratio),
## 'CD': CD(contamination=fraud_ratio),
'COPOD': COPOD(contamination=fraud_ratio),
# 'DeepSVDD': DeepSVDD(contamination=fraud_ratio),
# 'DIF': DIF(contamination=fraud_ratio),
'ECOD': ECOD(contamination=fraud_ratio),
# 'FeatureBagging': FeatureBagging(contamination=fraud_ratio),
'GMM': GMM(contamination=fraud_ratio),
'HBOS': HBOS(contamination=fraud_ratio),
'IForest': IForest(contamination=fraud_ratio),
'INNE': INNE(contamination=fraud_ratio),
'KDE': KDE(contamination=fraud_ratio),
'KNN': KNN(contamination=fraud_ratio),
#### 'KPCA': KPCA(contamination=fraud_ratio),
# 'PyODKernelPCA': PyODKernelPCA(contamination=fraud_ratio),
## 'LMDD': LMDD(contamination=fraud_ratio),
'LODA': LODA(contamination=fraud_ratio),
'LOF': LOF(contamination=fraud_ratio),
#### 'LOCI': LOCI(contamination=fraud_ratio),
# 'LUNAR': LUNAR(contamination=fraud_ratio),
# 'LSCP': LSCP(contamination=fraud_ratio),
'MAD': MAD(contamination=fraud_ratio),
'MCD': MCD(contamination=fraud_ratio),
# 'MO_GAAL': MO_GAAL(contamination=fraud_ratio),
'OCSVM': OCSVM(contamination=fraud_ratio),
'PCA': PCA(contamination=fraud_ratio),
### 'QMCD': QMCD(contamination=fraud_ratio),
#### 'RGraph': RGraph(contamination=fraud_ratio),
'ROD': ROD(contamination=fraud_ratio),
## 'Sampling': Sampling(contamination=fraud_ratio),
## 'SOD': SOD(contamination=fraud_ratio),
# 'SO_GAAL': SO_GAAL(contamination=fraud_ratio),
#### 'SOS': SOS(contamination=fraud_ratio),
# 'SUOD': SUOD(contamination=fraud_ratio),
# 'VAE': VAE(contamination=fraud_ratio),
# 'XGBOD': XGBOD(contamination=fraud_ratio),
}
- The commented-out entries are the models that could not be included:
  - # : a required dependency (e.g., tensorflow) is not installed.
  - ## : the model is created, but calling .fit raises an error.
  - ### : fit works, but the test output is NaN.
  - #### : fit works, but it takes too long to run.
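Before running everything through the pyod() helper, it can help to fit one detector by hand; IForest is picked here only as an example from the dictionary above. The sketch uses the documented PyOD interface (fit, labels_, decision_function, predict). Note that contamination is essentially a thresholding parameter: it controls how many points are labeled 1, not how the raw outlier scores are computed.

```python
from sklearn.metrics import roc_auc_score

# Fit one detector on the training amounts and score the test set.
clf = IForest(contamination=fraud_ratio)
clf.fit(X)                                 # unsupervised: y is not used for fitting

train_labels = clf.labels_                 # 0/1 labels on the training data
test_scores  = clf.decision_function(XX)   # raw outlier scores for the test set
test_labels  = clf.predict(XX)             # scores thresholded into 0/1 labels

print(roc_auc_score(yy.values.ravel(), test_scores))
```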
4. Training & saving the results
pyod(X,XX,y,yy,predictors,throw_rate=0.3)
| | model | time | acc | pre | rec | f1 | auc | graph_based | pyod | throw_rate | train_size | train_cols | train_frate | test_size | test_frate | hyper_params |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ABOD | 0.786525 | 0.950050 | 0.000000 | 0.000000 | 0.000000 | 0.500000 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
1 | COPOD | 0.004162 | 0.739927 | 0.128386 | 0.726667 | 0.218218 | 0.733645 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
2 | ECOD | 0.004062 | 0.716450 | 0.114349 | 0.693333 | 0.196319 | 0.705499 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
3 | GMM | 0.084714 | 0.707959 | 0.104031 | 0.636667 | 0.178839 | 0.674187 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
4 | HBOS | 0.002090 | 0.949883 | 0.000000 | 0.000000 | 0.000000 | 0.499912 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
5 | IForest | 0.189287 | 0.817349 | 0.176805 | 0.726667 | 0.284410 | 0.774392 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
6 | INNE | 0.434896 | 0.657010 | 0.071984 | 0.493333 | 0.125637 | 0.579474 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
7 | KDE | 4.614658 | 0.835165 | 0.195230 | 0.736667 | 0.308659 | 0.788505 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
8 | KNN | 0.015593 | 0.808192 | 0.164038 | 0.693333 | 0.265306 | 0.753782 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
9 | LODA | 0.056379 | 0.949883 | 0.000000 | 0.000000 | 0.000000 | 0.499912 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
10 | LOF | 0.037894 | 0.528472 | 0.050745 | 0.476667 | 0.091725 | 0.503931 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
11 | MAD | 0.001531 | 0.960872 | 0.641921 | 0.490000 | 0.555766 | 0.737815 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
12 | MCD | 0.005907 | 0.834998 | 0.195057 | 0.736667 | 0.308444 | 0.788417 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
13 | OCSVM | 8.670979 | 0.806860 | 0.167697 | 0.723333 | 0.272271 | 0.767292 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
14 | PCA | 0.002473 | 0.461705 | 0.025558 | 0.263333 | 0.046594 | 0.367734 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
15 | ROD | 2.395536 | 0.600566 | 0.034177 | 0.256667 | 0.060321 | 0.437657 | False | True | 0.3 | 14014 | [amt] | 0.407164 | 6006 | 0.04995 | None |
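pyod() itself lives in functions.py and is not reproduced in this post. For readers without that file, the following is a hypothetical re-implementation of what it is assumed to do (time each fit, predict on the test set, and collect the metric columns shown above); the actual helper may differ, for example in how auc is computed or how hyper_params is recorded.

```python
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)

def pyod_sketch(X, XX, y, yy, predictors, throw_rate):
    """Hypothetical stand-in for the pyod() helper defined in functions.py."""
    yy_true = yy.values.ravel()
    rows = []
    for name, clf in predictors.items():
        t0 = time.time()
        clf.fit(X)                       # unsupervised fit on the training amounts
        yyhat = clf.predict(XX)          # thresholded 0/1 predictions on the test set
        rows.append({
            'model': name,
            'time': time.time() - t0,
            'acc': accuracy_score(yy_true, yyhat),
            'pre': precision_score(yy_true, yyhat, zero_division=0),
            'rec': recall_score(yy_true, yyhat),
            'f1': f1_score(yy_true, yyhat),
            'auc': roc_auc_score(yy_true, yyhat),
            'graph_based': False,
            'pyod': True,
            'throw_rate': throw_rate,
            'train_size': len(X),
            'train_cols': list(X.columns),
            'train_frate': y.values.mean(),
            'test_size': len(XX),
            'test_frate': yy.values.mean(),
            'hyper_params': None,
        })
    return pd.DataFrame(rows)
```

The pattern in the table is consistent with this reading: detectors with accuracy around 0.95 but zero precision, recall, and f1 (ABOD, HBOS, LODA) are flagging essentially nothing as fraud, which on a test set with a 0.05 fraud rate yields majority-class accuracy and an auc near 0.5.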