import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
imports
def down_sample_textbook(df):
    """Balance the two classes by down-sampling the non-fraud rows.

    Every row with ``is_fraud == 1`` is kept. From the rows with
    ``is_fraud == 0`` an equally sized sample is drawn without replacement,
    using the fixed seed ``random_state=42``.

    Args:
        df: DataFrame with an ``is_fraud`` column.

    Returns:
        DataFrame holding the fraud rows followed by the sampled non-fraud
        rows; the original index labels are kept.
    """
    fraud_rows = df[df.is_fraud == 1].copy()
    normal_rows = df[df.is_fraud == 0].copy()
    sampled_normal = sklearn.utils.resample(
        normal_rows,
        replace=False,
        n_samples=len(fraud_rows),
        random_state=42,
    )
    return pd.concat([fraud_rows, sampled_normal])
def compute_time_difference(group):
    """Return the absolute time gap between every ordered pair of rows.

    Args:
        group: DataFrame whose ``trans_date_trans_time`` column holds
            timestamp objects exposing ``.value`` (as pandas ``Timestamp``
            does).

    Returns:
        A list with one ``[label_i, label_j, gap]`` entry per ordered pair
        of rows ``(i, j)``, self-pairs included. The labels are the rows'
        index labels and ``gap`` is the absolute difference of the two
        ``.value`` attributes. An empty ``group`` yields an empty list.
    """
    # Read the labels and timestamp values once, instead of materialising
    # ``group.iloc[...]`` rows again and again inside the quadratic loop.
    labels = list(group.index)
    stamps = [ts.value for ts in group["trans_date_trans_time"]]
    return [
        [label_i, label_j, abs(stamp_i - stamp_j)]
        for label_i, stamp_i in zip(labels, stamps)
        for label_j, stamp_j in zip(labels, stamps)
    ]
def mask(df):
    """Split ``df`` into train and test parts and return boolean row masks.

    The split is made by ``sklearn.model_selection.train_test_split`` with
    its default sizes and ``random_state=42``.

    Args:
        df: DataFrame to split. Its index labels are assumed to be unique;
            with duplicated labels a row cannot be attributed to one side.

    Returns:
        ``(train_mask, test_mask)``: two boolean NumPy arrays of length
        ``len(df)``. Position ``k`` is True when row ``k`` of ``df`` landed
        in the respective part.
    """
    df_tr, df_test = sklearn.model_selection.train_test_split(df, random_state=42)
    # Test membership by index label rather than by the integers 0..N-1, so
    # the masks stay aligned with the rows even if the index was not reset.
    train_mask = np.asarray(df.index.isin(df_tr.index))
    test_mask = np.asarray(df.index.isin(df_test.index))
    return train_mask, test_mask
def edge_index_selected(edge_index):
    """Keep the edges whose exponential-kernel weight is above average.

    Each row of ``edge_index`` is read as ``(source, target, distance)``.
    Distances are turned into weights ``exp(-distance / theta)``, where
    ``theta`` is the mean distance. Weights equal to exactly 1 (zero
    distance) are replaced by 0. An edge is kept when its weight is
    strictly greater than the mean of all weights.

    Args:
        edge_index: array-like of shape ``(E, 3)``. The input is not
            modified; the computation runs on a float64 copy.

    Returns:
        ``torch.long`` tensor of shape ``(2, K)`` holding the source and
        target ids of the ``K`` kept edges (``(2, 0)`` when none is kept).
    """
    # np.array copies, so the caller's array is never written to.
    edges = np.array(edge_index, dtype=np.float64)
    distances = edges[:, 2]
    theta = distances.mean()
    weights = np.exp(-distances / theta)
    # A weight of exactly 1 means zero distance; such edges get weight 0.
    weights = np.where(weights != 1, weights, 0.0)
    keep = weights > weights.mean()
    # reshape keeps the result two-dimensional even when nothing is kept.
    selected = edges[keep, :2].astype(np.int64).reshape(-1, 2)
    return torch.tensor(selected, dtype=torch.long).t()
# Load the transactions, dropping the first CSV column, then parse the
# timestamp column with a single vectorised call instead of one
# pd.to_datetime call per row.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain
| | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
| 1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
| 2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
| 3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
| 4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
| 1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
| 1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
| 1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
| 1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
데이터정리
# Keep a 20% sample of the non-fraud rows together with every fraud row.
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape  # recorded output: (214520, 22)

# Balance the classes. reset_index() keeps the old labels as an extra
# "index" column, which is why the frame then has 23 columns.
df50 = down_sample_textbook(df02)
df50 = df50.reset_index()
df50.shape  # recorded output: (12012, 23)
tr/test
# Train/test masks. The split is computed once and then displayed
# (previously mask(df50) was evaluated twice).
train_mask, test_mask = mask(df50)
train_mask, test_mask
# recorded output:
# (array([False, True, True, ..., True, False, True]),
#  array([ True, False, False, ..., False, True, False]))

# edge_index setup
edge_index = np.load('edge_index_list_plus50.npy').astype(np.float64)
edge_index.shape  # recorded output: (200706, 3)
# NOTE: this rebinds the helper function's name to its result, so the
# helper cannot be called again afterwards without redefining it.
edge_index_selected = edge_index_selected(edge_index)

# data setup (x, edge_index, y)
x = torch.tensor(df50['amt'], dtype=torch.float).reshape(-1, 1)
y = torch.tensor(df50['is_fraud'], dtype=torch.int64)
data = torch_geometric.data.Data(
    x=x,
    edge_index=edge_index_selected,
    y=y,
    train_mask=train_mask,
    test_mask=test_mask,
)
data
# recorded output:
# Data(x=[12012, 1], edge_index=[2, 93730], y=[12012], train_mask=[12012], test_mask=[12012])
정리
| 구분 | Train | Test | 모형 | 설명변수 | 비고 |
|---|---|---|---|---|---|
| 분석1 | df50_tr | df50_test | Proposed | amt | |
| 분석2 | df50_tr | df50_test | 로지스틱 회귀 | amt | |
| 분석3 | df50_tr | df50_test | XGBoost | amt | |
| 분석4 | df50_tr | df50_test | LightGBM | amt | |
# Gather the per-analysis metric frames for the summary table.
# NOTE: _results1 .. _results4 are assigned further down in this file, so
# those analysis sections must have been run before this line.
lst = [_results1, _results2,_results3,_results4]
pd.concat(lst)
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| 분석1 | 0.902098 | 0.862478 | 0.959130 | 0.908240 |
| 분석2 | 0.849484 | 0.933279 | 0.756098 | 0.835397 |
| 분석3 | 0.880120 | 0.886957 | 0.874094 | 0.880478 |
| 분석4 | 0.885115 | 0.893817 | 0.876730 | 0.885191 |
분석 1(GCN)
torch.manual_seed(202250926)


class GCN1(torch.nn.Module):
    """Two-layer graph convolutional classifier over one node feature.

    ``conv1`` maps 1 input channel to 32 hidden channels and ``conv2`` maps
    those 32 channels to 2 output channels.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        """Return the log-softmax (over dim 1) of the second layer's output.

        Args:
            data: object exposing ``x`` (node features) and ``edge_index``.
        """
        hidden = F.relu(self.conv1(data.x, data.edge_index))
        # Dropout is active only while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)
# NumPy views of the train/test features and labels. Only yy is used by
# the evaluation below; X, XX and y are not read in this section.
X = data.x[data.train_mask].numpy()
XX = data.x[data.test_mask].numpy()
y = data.y[data.train_mask].numpy()
yy = data.y[data.test_mask].numpy()

model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Full-graph training for 400 steps; the loss only looks at training nodes.
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

# Predict every node, then keep the test nodes for scoring.
model.eval()
pred = model(data).argmax(dim=1)
yyhat = pred[data.test_mask]
pred

# NOTE: this rebinds `metrics`, which the file header imports as the
# sklearn.metrics module; the fully qualified names below are unaffected.
metrics = [
    sklearn.metrics.accuracy_score,
    sklearn.metrics.precision_score,
    sklearn.metrics.recall_score,
    sklearn.metrics.f1_score,
]
# Built-in round() works whether a metric returns a NumPy scalar or a plain
# Python float; the .round(6) method exists only on NumPy scalars.
_results1 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석1'],
)
_results1
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| 분석1 | 0.902098 | 0.862478 | 0.95913 | 0.90824 |
# Unpack the two parameters of the first GCN layer. In the recorded output
# the first displays as a length-32 vector and the second as a 32x1 matrix
# (presumably bias and weight, as the names suggest).
b1,W1 = list(model.conv1.parameters())
b1,W1
(Parameter containing:
tensor([ 3.3281e-01, -5.2539e-01, 1.1277e+00, 2.8239e-01, -1.2373e-01,
9.6607e-01, -1.7652e-02, 8.3960e-01, 3.3252e-01, -4.6080e-01,
8.6407e-01, 1.3294e+00, 3.2962e-01, -1.3215e-06, 2.1968e+00,
-1.8843e-01, 7.6852e-10, -5.0779e-01, -7.7613e-09, -7.8956e-04,
-4.6320e-01, -6.8497e-02, 3.4533e-02, -4.4006e-01, -3.8074e-01,
-4.5996e-01, 4.4989e-13, -3.9142e-08, 4.3476e-01, 1.5457e+00,
-3.1396e-01, 4.3208e-01], requires_grad=True),
Parameter containing:
tensor([[ 1.6974e-01],
[ 1.4139e-01],
[-6.2153e-03],
[ 1.3422e-01],
[-8.4340e-02],
[-5.0439e-03],
[-5.0210e-02],
[-4.3200e-03],
[ 2.5801e-01],
[ 2.1217e-01],
[-4.4886e-03],
[-7.1453e-03],
[ 1.8556e-01],
[-3.8936e-02],
[-1.2772e-02],
[-3.0888e-03],
[-1.9216e-05],
[ 4.9480e-02],
[-6.8162e-02],
[-8.9236e-02],
[ 1.3753e-01],
[-6.8890e-02],
[-6.3743e-02],
[ 2.1896e-01],
[ 2.9288e-03],
[ 4.8694e-03],
[-4.5865e-02],
[-5.2090e-02],
[ 1.2880e-01],
[-8.3668e-03],
[ 9.4231e-03],
[ 1.3015e-01]], requires_grad=True))
# Unpack the two parameters of the second GCN layer. In the recorded output
# the first displays as a length-2 vector and the second as a 2x32 matrix
# (presumably bias and weight, as the names suggest).
b2,W2 = list(model.conv2.parameters())
b2,W2
(Parameter containing:
tensor([ 0.8103, -0.8103], requires_grad=True),
Parameter containing:
tensor([[-3.1862e-02, 1.2264e-01, 2.5667e-01, -2.9070e-01, 1.9933e-02,
2.8287e-01, -8.1168e-03, 1.7061e-01, 3.1362e-02, -1.3049e-01,
2.4063e-02, 1.5218e-01, 1.7274e-01, 5.8768e-06, 2.9730e-01,
3.7174e-02, -1.1418e-09, -2.0023e-01, -1.3184e-02, 5.8934e-02,
2.0550e-01, -5.7726e-02, 2.8572e-01, -1.4259e-01, -2.8006e-01,
-6.7342e-02, -3.5160e-02, -7.0944e-02, -5.3993e-02, 1.6846e-01,
5.3713e-02, -1.7743e-01],
[-2.9673e-02, 1.2650e-01, -1.8649e-01, -2.8863e-01, 2.7340e-01,
-2.5555e-02, -2.4640e-02, -9.0935e-02, 3.2923e-02, -1.2820e-01,
-2.4358e-01, -2.7795e-01, 1.7514e-01, -4.4449e-06, -3.2877e-01,
5.9096e-02, 1.0013e-09, -1.9043e-01, 1.0161e-02, -4.9584e-02,
2.0870e-01, -6.1236e-03, 2.2979e-01, -1.4060e-01, 1.1504e-01,
2.0197e-01, 4.5961e-02, -1.0499e-01, -5.0285e-02, -3.6542e-01,
1.1680e-01, -1.7159e-01]], requires_grad=True))
분석2(로지스틱 회귀)
# Imported explicitly: `import sklearn` alone does not guarantee that the
# linear_model submodule is loaded.
import sklearn.linear_model

# NOTE: this seeds torch's RNG only.
torch.manual_seed(202250926)

# NOTE(review): df50_tr / df50_test are not defined in the visible part of
# this file; presumably they are the train/test parts of df50 — confirm.
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = sklearn.linear_model.LogisticRegression()
lrnr.fit(X, y)
#thresh = y.mean()
#yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
yyhat = lrnr.predict(XX)
yyhat

metrics = [
    sklearn.metrics.accuracy_score,
    sklearn.metrics.precision_score,
    sklearn.metrics.recall_score,
    sklearn.metrics.f1_score,
]
# Built-in round() works whether a metric returns a NumPy scalar or a plain
# Python float; the .round(6) method exists only on NumPy scalars.
_results2 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석2'],
)
_results2
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| 분석2 | 0.849484 | 0.933279 | 0.756098 | 0.835397 |
분석3(XGBoost)
import xgboost as xgb

# NOTE: this seeds torch's RNG only.
torch.manual_seed(202250926)

# NOTE(review): df50_tr / df50_test are not defined in the visible part of
# this file; presumably they are the train/test parts of df50 — confirm.
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = xgb.XGBClassifier()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)

metrics = [
    sklearn.metrics.accuracy_score,
    sklearn.metrics.precision_score,
    sklearn.metrics.recall_score,
    sklearn.metrics.f1_score,
]
# Built-in round() works whether a metric returns a NumPy scalar or a plain
# Python float; the .round(6) method exists only on NumPy scalars.
_results3 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석3'],
)
_results3
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| 분석3 | 0.88012 | 0.886957 | 0.874094 | 0.880478 |
분석4(Light GBM)
import lightgbm as lgb

# NOTE: this seeds torch's RNG only.
torch.manual_seed(202250926)

# NOTE(review): df50_tr / df50_test are not defined in the visible part of
# this file; presumably they are the train/test parts of df50 — confirm.
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = lgb.LGBMClassifier()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)

metrics = [
    sklearn.metrics.accuracy_score,
    sklearn.metrics.precision_score,
    sklearn.metrics.recall_score,
    sklearn.metrics.f1_score,
]
# Built-in round() works whether a metric returns a NumPy scalar or a plain
# Python float; the .round(6) method exists only on NumPy scalars.
_results4 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat), 6)] for m in metrics},
    index=['분석4'],
)
_results4
| | accuracy_score | precision_score | recall_score | f1_score |
|---|---|---|---|---|
| 분석4 | 0.885115 | 0.893817 | 0.87673 | 0.885191 |
분석5(One class SVM)
from sklearn.svm import OneClassSVM

# NOTE: this seeds torch's RNG only.
torch.manual_seed(202250926)

# NOTE(review): df50_tr / df50_test are not defined in the visible part of
# this file; presumably they are the train/test parts of df50 — confirm.
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

# Fit the one-class model on the non-fraud training rows only.
ocsvm = OneClassSVM(gamma='auto')
ocsvm.fit(X[y == 0])
yyhat_ocsvm = ocsvm.predict(XX)
# predict() yields +1 for inliers and -1 for outliers. Map inliers to 0
# (non-fraud) and everything else to 1 (fraud). np.where needs both branch
# values; calling it with only one raises ValueError.
yyhat_ocsvm_binary = np.where(yyhat_ocsvm == 1, 0, 1)

metrics_ocsvm = [
    sklearn.metrics.accuracy_score,
    sklearn.metrics.precision_score,
    sklearn.metrics.recall_score,
    sklearn.metrics.f1_score,
]
# Built-in round() works whether a metric returns a NumPy scalar or a plain
# Python float; the .round(6) method exists only on NumPy scalars.
_results5 = pd.DataFrame(
    {m.__name__: [round(m(yy, yyhat_ocsvm_binary), 6)] for m in metrics_ocsvm},
    index=['분석5'],
)
_results5