import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
'ignore') warnings.filterwarnings(
imports
%run function_proposed_gcn.py
with open('fraudTrain.pkl', 'rb') as file:
= pickle.load(file) fraudTrain
= fraudTrain[::10]
df
= df.reset_index(drop=True)
df
df.is_fraud.mean()
# df_train, df_test = sklearn.model_selection.train_test_split(df)
0.005807854431707643
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
def down_sample_textbook(df):
= df[df.is_fraud==0].copy()
df_majority = df[df.is_fraud==1].copy()
df_minority = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
df_maj_dowsampled = pd.concat([df_minority, df_maj_dowsampled])
df_downsampled return df_downsampled
def compute_time_difference(group):
= len(group)
n = []
result for i in range(n):
for j in range(n):
= abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
time_difference
result.append([group.iloc[i].name, group.iloc[j].name, time_difference])return result
def mask(df):
= sklearn.model_selection.train_test_split(df, random_state=42)
df_tr,df_test = len(df)
N = [i in df_tr.index for i in range(N)]
train_mask = [i in df_test.index for i in range(N)]
test_mask = np.array(train_mask)
train_mask = np.array(test_mask)
test_mask return train_mask, test_mask
def edge_index_selected(edge_index):
= edge_index[:,2].mean()
theta 2] = (np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta))
edge_index[:,= edge_index.tolist()
edge_index = np.array(edge_index)[:,2].mean()
mean_ = [(int(row[0]), int(row[1])) for row in edge_index if row[2] > mean_]
selected_edges = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected return edge_index_selected
df.shape
(104858, 22)
= mask(df)
train_mask, test_mask = sklearn.model_selection.train_test_split(df, random_state=42) df_tr,df_test
# groups = df.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus_005.npy', edge_index_list_plus_nparr)
= np.load('edge_index_list_plus_005.npy').astype(np.float64)
edge_index edge_index.shape
(15990358, 3)
2].mean() edge_index[:,
1.1661268505973428e+16
2]) plt.hist(edge_index[:,
(array([3295084., 2862306., 2479784., 2167346., 1785280., 1358708.,
1001646., 663712., 286718., 89774.]),
array([0.0000000e+00, 3.7535640e+15, 7.5071280e+15, 1.1260692e+16,
1.5014256e+16, 1.8767820e+16, 2.2521384e+16, 2.6274948e+16,
3.0028512e+16, 3.3782076e+16, 3.7535640e+16]),
<BarContainer object of 10 artists>)
2.5e16 > 1.16e16
True
= 2.5e16 theta
2] = (np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta)) edge_index[:,
= edge_index.tolist() edge_index
2].mean() np.array(edge_index)[:,
0.6541518297734155
2]) plt.hist(np.array(edge_index)[:,
(array([ 104858., 0., 367350., 1557012., 2109832., 2399724.,
2431610., 2400986., 2332404., 2286582.]),
array([0. , 0.09999976, 0.19999952, 0.29999928, 0.39999904,
0.4999988 , 0.59999856, 0.69999832, 0.79999808, 0.89999784,
0.9999976 ]),
<BarContainer object of 10 artists>)
= [(int(row[0]), int(row[1])) for row in edge_index if row[2] > 0.9]
selected_edges = torch.tensor(selected_edges, dtype=torch.long).t() edge_index_selected
= torch.tensor(df['amt'], dtype=torch.float).reshape(-1,1)
x = torch.tensor(df['is_fraud'],dtype=torch.int64)
y = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y, train_mask = train_mask, test_mask = test_mask)
data data
Data(x=[104858, 1], edge_index=[2, 2286540], y=[104858], train_mask=[104858], test_mask=[104858])
edge_index_selected.shape
torch.Size([2, 2286540])
202250926)
torch.manual_seed(
class GCN1(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv1 = GCNConv(1, 32)
self.conv2 = GCNConv(32,2)
def forward(self, data):
= data.x, data.edge_index
x, edge_index
= self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, training=self.training)
x = self.conv2(x, edge_index)
x
return F.log_softmax(x, dim=1)
= (data.x[data.train_mask]).numpy()
X = (data.x[data.test_mask]).numpy()
XX = (data.y[data.train_mask]).numpy()
y = (data.y[data.test_mask]).numpy()
yy
= GCN1()
model = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
optimizer
model.train()for epoch in range(400):
optimizer.zero_grad()= model(data)
out = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
loss
loss.backward()
optimizer.step()eval()
model.
= model(data).argmax(dim=1) # argmax말고
pred = pred[data.test_mask]
yyhat
pred
= [sklearn.metrics.accuracy_score,
metrics
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]
= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석1'])
_results1 _results1
accuracy_score | precision_score | recall_score | f1_score | |
---|---|---|---|---|
분석1 | 0.994202 | 0.0 | 0.0 | 0.0 |
sum() yyhat.
tensor(0)
= roc_curve(yy, yyhat)
gnn_fpr, gnn_tpr, gnn_thresholds = auc(gnn_fpr, gnn_tpr)
gnn_roc_auc_lr gnn_roc_auc_lr
0.5
= 0.5e16 theta
= np.load('edge_index_list_plus_005.npy').astype(np.float64)
edge_index 2] = (np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta)) edge_index[:,
= edge_index.tolist() edge_index
2].mean() np.array(edge_index)[:,
0.24202248967016632
2]) plt.hist(np.array(edge_index)[:,
(array([7297668., 2256502., 1485530., 1114218., 886792., 727340.,
651226., 577800., 518864., 474418.]),
array([0. , 0.0999988, 0.1999976, 0.2999964, 0.3999952, 0.499994 ,
0.5999928, 0.6999916, 0.7999904, 0.8999892, 0.999988 ]),
<BarContainer object of 10 artists>)
= [(int(row[0]), int(row[1])) for row in edge_index if row[2] > 0.8]
selected_edges = torch.tensor(selected_edges, dtype=torch.long).t() edge_index_selected
= torch.tensor(df['amt'], dtype=torch.float).reshape(-1,1)
x = torch.tensor(df['is_fraud'],dtype=torch.int64)
y = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y, train_mask = train_mask, test_mask = test_mask)
data data
Data(x=[104858, 1], edge_index=[2, 993216], y=[104858], train_mask=[104858], test_mask=[104858])
edge_index_selected.shape
torch.Size([2, 993216])
202250926)
torch.manual_seed(
class GCN1(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv1 = GCNConv(1, 32)
self.conv2 = GCNConv(32,2)
def forward(self, data):
= data.x, data.edge_index
x, edge_index
= self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, training=self.training)
x = self.conv2(x, edge_index)
x
return F.log_softmax(x, dim=1)
= (data.x[data.train_mask]).numpy()
X = (data.x[data.test_mask]).numpy()
XX = (data.y[data.train_mask]).numpy()
y = (data.y[data.test_mask]).numpy()
yy
= GCN1()
model = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
optimizer
model.train()for epoch in range(400):
optimizer.zero_grad()= model(data)
out = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
loss
loss.backward()
optimizer.step()eval()
model.
= model(data).argmax(dim=1) # argmax말고
pred = pred[data.test_mask]
yyhat
pred
= [sklearn.metrics.accuracy_score,
metrics
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]
= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석1'])
_results1 _results1
accuracy_score | precision_score | recall_score | f1_score | |
---|---|---|---|---|
분석1 | 0.994202 | 0.0 | 0.0 | 0.0 |
= roc_curve(yy, yyhat)
gnn_fpr, gnn_tpr, gnn_thresholds = auc(gnn_fpr, gnn_tpr)
gnn_roc_auc_lr gnn_roc_auc_lr
0.5
= [(int(row[0]), int(row[1])) for row in edge_index if row[2] > 0.95]
selected_edges = torch.tensor(selected_edges, dtype=torch.long).t() edge_index_selected
= torch.tensor(df['amt'], dtype=torch.float).reshape(-1,1)
x = torch.tensor(df['is_fraud'],dtype=torch.int64)
y = torch_geometric.data.Data(x=x, edge_index = edge_index_selected, y=y, train_mask = train_mask, test_mask = test_mask)
data data
Data(x=[104858, 1], edge_index=[2, 238640], y=[104858], train_mask=[104858], test_mask=[104858])
202250926)
torch.manual_seed(
class GCN1(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv1 = GCNConv(1, 32)
self.conv2 = GCNConv(32,2)
def forward(self, data):
= data.x, data.edge_index
x, edge_index
= self.conv1(x, edge_index)
x = F.relu(x)
x = F.dropout(x, training=self.training)
x = self.conv2(x, edge_index)
x
return F.log_softmax(x, dim=1)
= (data.x[data.train_mask]).numpy()
X = (data.x[data.test_mask]).numpy()
XX = (data.y[data.train_mask]).numpy()
y = (data.y[data.test_mask]).numpy()
yy
= GCN1()
model = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
optimizer
model.train()for epoch in range(400):
optimizer.zero_grad()= model(data)
out = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
loss
loss.backward()
optimizer.step()eval()
model.
= model(data).argmax(dim=1) # argmax말고
pred = pred[data.test_mask]
yyhat
pred
= [sklearn.metrics.accuracy_score,
metrics
sklearn.metrics.precision_score,
sklearn.metrics.recall_score,
sklearn.metrics.f1_score]
= pd.DataFrame({m.__name__:[m(yy,yyhat).round(6)] for m in metrics},index=['분석1'])
_results1 _results1
accuracy_score | precision_score | recall_score | f1_score | |
---|---|---|---|---|
분석1 | 0.994202 | 0.0 | 0.0 | 0.0 |
= roc_curve(yy, yyhat)
gnn_fpr, gnn_tpr, gnn_thresholds = auc(gnn_fpr, gnn_tpr)
gnn_roc_auc_lr gnn_roc_auc_lr
0.5
1] model(data)[:,
tensor([-8.5130, -7.5498, -7.2450, ..., -6.0895, -7.1428, -6.7693],
grad_fn=<SelectBackward0>)
0].mean() model(data)[data.test_mask][:,
tensor(-0.0059, grad_fn=<MeanBackward0>)
1].mean() model(data)[data.test_mask][:,
tensor(-7.0153, grad_fn=<MeanBackward0>)
model(data)[data.test_mask]
tensor([[-5.7895e-04, -7.4546e+00],
[-5.3886e-03, -5.2262e+00],
[-1.2064e-03, -6.7207e+00],
...,
[-2.4151e-03, -6.0272e+00],
[-4.8757e-04, -7.6263e+00],
[-1.1491e-03, -6.7693e+00]], grad_fn=<IndexBackward0>)
0.005 train에는 df50으로 학습하고 test는