imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # train_test_split
from sklearn import linear_model # LogisticRegression
from sklearn import ensemble # RF, GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
def down_sample_textbook(df):
    # balance the classes by downsampling the majority (non-fraud) class to the minority size
    df_majority = df[df.is_fraud==0].copy()
    df_minority = df[df.is_fraud==1].copy()
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled
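As a quick illustration of what the helper returns, here is a minimal sketch on a hypothetical toy frame (the `toy` frame is illustrative only, not part of the dataset):

```python
# Hypothetical toy frame: 6 legitimate rows, 2 fraud rows.
toy = pd.DataFrame({'amt': range(8), 'is_fraud': [0]*6 + [1]*2})
balanced = down_sample_textbook(toy)
balanced.is_fraud.value_counts()  # 2 fraud, 2 non-fraud: a 50:50 sample
```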
def compute_time_difference(group):
    # pairwise absolute time gap (in nanoseconds) between all transactions in a group
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
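For intuition, a small hedged example: on a two-row group the function returns all n² ordered pairs with the absolute gap in nanoseconds (`Timestamp.value`), so it is O(n²) per group:

```python
# Hypothetical two-transaction group, one minute apart; index labels 0 and 1.
g = pd.DataFrame({'trans_date_trans_time':
                  pd.to_datetime(['2019-01-01 00:00:00', '2019-01-01 00:01:00'])})
compute_time_difference(g)
# [[0, 0, 0], [0, 1, 60000000000], [1, 0, 60000000000], [1, 1, 0]]
```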
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)   # 1 input feature (amt) -> 16 hidden channels
        self.conv2 = GCNConv(16, 2)   # 16 hidden channels -> 2 classes

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
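A hedged smoke test of the shapes (the 3-node graph below is made up): one scalar feature per node goes in, and per-node log-probabilities over the two classes come out.

```python
# Hypothetical 3-node graph with a single undirected edge 0-1.
_dummy = torch_geometric.data.Data(
    x=torch.randn(3, 1),
    edge_index=torch.tensor([[0, 1], [1, 0]], dtype=torch.long),
)
GCN()(_dummy).shape  # torch.Size([3, 2])
```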
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
fraudTrain
| | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
Data preparation
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
df50 = down_sample_textbook(df02)
df50 = df50.reset_index()
df50.shape
(12012, 23)
tr/test
df50_tr, df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
df50_tr.shape, df50_test.shape
((9009, 23), (3003, 23))
N = len(df50)
train_mask = [i in df50_tr.index for i in range(N)]
test_mask = [i in df50_test.index for i in range(N)]
train_mask = np.array(train_mask)
test_mask = np.array(test_mask)
train_mask.sum(), test_mask.sum()
(9009, 3003)
train_mask.shape, test_mask.shape
((12012,), (12012,))
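The mask construction assumes `reset_index` gave `df50` a 0..N-1 index and that `train_test_split` preserves those labels; a quick sanity check under those assumptions:

```python
# Masks should partition the N nodes between train and test.
assert train_mask.sum() + test_mask.sum() == N
assert not (train_mask & test_mask).any()  # disjoint masks
```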
Setting up edge_index
# groups = df50.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus50.npy', edge_index_list_plus_nparr)
edge_index = np.load('edge_index_list_plus50.npy')
edge_index.shape
(200706, 3)
theta = edge_index[:,2].mean()
edge_index = np.load('edge_index_list_plus50.npy').astype(np.float64)
edge_index[:,2] = (np.exp(-edge_index[:,2]/theta) != 1) * np.exp(-edge_index[:,2]/theta)
edge_index = edge_index.tolist()
mean_ = np.array(edge_index)[:,2].mean()
mean_
0.5098736436405648
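Written out, the transform above turns each pairwise time gap into an edge weight, with θ the mean absolute gap computed before the reload; pairs with zero gap (where the exponential equals exactly 1, i.e. self-pairs) are zeroed out:

$$
w_{ij} =
\begin{cases}
\exp\left(-\dfrac{|t_i - t_j|}{\theta}\right), & |t_i - t_j| > 0 \\
0, & |t_i - t_j| = 0
\end{cases}
$$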
edge_index[:5]
[[1023.0, 1023.0, 0.0],
[1023.0, 1024.0, 0.9994677478343093],
[1023.0, 1028.0, 0.9902065900321946],
[1023.0, 1031.0, 0.97983815585674],
[1023.0, 1032.0, 0.97983815585674]]
selected_edges = [(int(row[0]), int(row[1])) for row in edge_index if row[2] > mean_]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 93730])
Setting up data (x, edge_index, y)
x = torch.tensor(df50['amt'], dtype=torch.float).reshape(-1,1)
y = torch.tensor(df50['is_fraud'], dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index=edge_index_selected, y=y, train_mask=train_mask, test_mask=test_mask)
data
Data(x=[12012, 1], edge_index=[2, 93730], y=[12012], train_mask=[12012], test_mask=[12012])
Summary
| Analysis | Train | Test | Model | Features | Notes |
|---|---|---|---|---|---|
| Analysis 1 | df50_tr | df50_test | GNN | amt | |
| Analysis 2 | df50_tr | df50_test | Logistic regression | amt | |
| Analysis 3 | df50_tr | df50_test | SVM | amt | |
| Analysis 4 | df50_tr | df50_test | Random forest | amt | |
| Analysis 5 | df50_tr | df50_test | Boosting | amt | |
| Analysis 6 | df50_tr | df50_test | Naive Bayes | amt | |
lst = [_results1, _results2, _results3, _results4, _results5, _results6]
pd.concat(lst)
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 1 | 0.889111 | 0.865884 | 0.923533 | 0.893780 |
| Analysis 2 | 0.849484 | 0.933279 | 0.756098 | 0.835397 |
| Analysis 3 | 0.850150 | 0.935510 | 0.755438 | 0.835886 |
| Analysis 4 | 0.847153 | 0.850331 | 0.846407 | 0.848365 |
| Analysis 5 | 0.880120 | 0.886957 | 0.874094 | 0.880478 |
| Analysis 6 | 0.857143 | 0.957143 | 0.750824 | 0.841522 |
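Each of the six analyses below ends with the same four-metric block; a hedged refactoring sketch (the `evaluate` name is hypothetical, not from the notebook) that would produce one row of the table above:

```python
# Hypothetical helper collapsing the repeated metric boilerplate.
def evaluate(yy, yyhat, label):
    metrics = [sklearn.metrics.accuracy_score,
               sklearn.metrics.precision_score,
               sklearn.metrics.recall_score,
               sklearn.metrics.f1_score]
    return pd.DataFrame({m.__name__: [m(yy, yyhat).round(6)] for m in metrics},
                        index=[label])
```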
Analysis 1 (GNN)
X = (data.x[data.train_mask]).numpy()
XX = (data.x[data.test_mask]).numpy()
y = (data.y[data.train_mask]).numpy()
yy = (data.y[data.test_mask]).numpy()
model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
model.eval()
GCN(
(conv1): GCNConv(1, 16)
(conv2): GCNConv(16, 2)
)
pred = model(data).argmax(dim=1)
yyhat = pred[data.test_mask]
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
_results1 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 1'])
_results1
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 1 | 0.889111 | 0.865884 | 0.923533 | 0.89378 |
Analysis 2 (logistic regression)
X = np.array(df50_tr.loc[:,['amt']])
XX = np.array(df50_test.loc[:,['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)
lrnr = sklearn.linear_model.LogisticRegression()
lrnr.fit(X, y)
LogisticRegression()
#thresh = y.mean()
#yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
yyhat = lrnr.predict(XX)
yyhat
array([0, 1, 0, ..., 0, 0, 1])
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
_results2 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 2'])
_results2
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 2 | 0.849484 | 0.933279 | 0.756098 | 0.835397 |
Analysis 3 (support vector machine)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)
lrnr = SVC(kernel='linear')
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
_results3 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 3'])
_results3
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 3 | 0.85015 | 0.93551 | 0.755438 | 0.835886 |
Analysis 4 (random forest)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)
lrnr = RandomForestClassifier()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
_results4 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 4'])
_results4
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 4 | 0.847153 | 0.850331 | 0.846407 | 0.848365 |
Analysis 5 (boosting)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)
lrnr = xgb.XGBClassifier()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
_results5 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 5'])
_results5
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 5 | 0.88012 | 0.886957 | 0.874094 | 0.880478 |
Analysis 6 (Naive Bayes)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)
lrnr = GaussianNB()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
_results6 = pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics}, index=['Analysis 6'])
_results6
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| Analysis 6 | 0.857143 | 0.957143 | 0.750824 | 0.841522 |