[FRAUD] df02: accuracy 0.9707, but the F1 score is ruined
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # for the train/test split function
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# embedding
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# gnn
import torch
import torch_geometric
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
-
fraudTrain
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=list(map(lambda x: pd.to_datetime(x), fraudTrain.trans_date_trans_time)))
fraudTrain
        | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud
0       | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0
1       | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0
2       | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0
3       | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0
4       | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0
...     | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ...
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0
1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0
1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0
1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0
1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0
1048575 rows × 22 columns
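A side note on the timestamp parsing above: mapping pd.to_datetime over the column element by element works, but it is slow on a million rows. A minimal vectorized sketch that should give the same datetimes (assuming the default pandas parser handles this date format, which it does for ISO-like strings):
# Vectorized alternative to the list(map(lambda ...)) call above:
# pd.to_datetime parses the whole column at once into datetime64[ns].
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))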
-
df02
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02 = df02.reset_index()
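df02 keeps every fraud row but only a 20% sample of the non-fraud rows, so it is still heavily imbalanced (roughly 3% fraud). A quick check, as a sketch:
# Class balance of the downsampled frame; the exact counts depend on the sample.
df02.is_fraud.value_counts()
df02.is_fraud.mean()   # fraction of fraud transactions, on the order of 0.03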
-
df_toy
df_toy = df02[:5].copy()
df_toy.cc_num = pd.Series([1, 1, 1, 2, 2])
df_toy
  | index | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud
0 | 669418 | 2019-10-12 18:21:00 | 1 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 32.3836 | -94.8653 | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0
1 | 32567 | 2019-01-20 13:06:00 | 1 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 39.5370 | -83.4550 | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0
2 | 156587 | 2019-03-24 18:09:00 | 1 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 41.5224 | -71.9934 | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0
3 | 1020243 | 2020-02-25 15:12:00 | 2 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 39.6171 | -102.4776 | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0
4 | 116272 | 2019-03-06 23:19:00 | 2 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 41.9488 | -86.4913 | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0
5 rows × 23 columns
-
Computing time_difference on df_toy
Customer 1
df_toy.iloc[0].trans_date_trans_time.value - df_toy.iloc[1].trans_date_trans_time.value
df_toy.iloc[0].trans_date_trans_time.value - df_toy.iloc[2].trans_date_trans_time.value
df_toy.iloc[1].trans_date_trans_time.value - df_toy.iloc[2].trans_date_trans_time.value
Customer 2
df_toy.iloc[3].trans_date_trans_time.value - df_toy.iloc[4].trans_date_trans_time.value
Customers 1 and 2
def compute_time_difference(group):
    # For every ordered pair (i, j) of transactions with the same cc_num,
    # record [index_i, index_j, |t_i - t_j|], with t in nanoseconds.
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result

groups = df_toy.groupby('cc_num')
edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
edge_index_list_plus_nparr
array([[ 0, 0, 0],
[ 0, 1, 22914900000000000],
[ 0, 2, 17453520000000000],
[ 1, 0, 22914900000000000],
[ 1, 1, 0],
[ 1, 2, 5461380000000000],
[ 2, 0, 17453520000000000],
[ 2, 1, 5461380000000000],
[ 2, 2, 0],
[ 3, 3, 0],
[ 3, 4, 30729180000000000],
[ 4, 3, 30729180000000000],
[ 4, 4, 0]])
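The double loop in compute_time_difference calls .iloc O(n²) times per cc_num group, which is why the full df02 run below is precomputed once and saved to disk. As a sketch, an equivalent vectorized version using numpy broadcasting (same [i, j, |Δt|] triples; compute_time_difference_fast is a hypothetical helper, not part of the original code):
def compute_time_difference_fast(group):
    # Timestamps as integer nanoseconds and the original row labels of this group
    t = pd.to_datetime(group.trans_date_trans_time).astype('int64').to_numpy()
    idx = group.index.to_numpy()
    # |t_i - t_j| for every ordered pair (i, j) via broadcasting
    diff = np.abs(t[:, None] - t[None, :])
    i, j = np.meshgrid(idx, idx, indexing='ij')
    return np.column_stack([i.ravel(), j.ravel(), diff.ravel()])

# e.g. np.vstack([compute_time_difference_fast(g) for _, g in df_toy.groupby('cc_num')])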
-
Computing time_difference on df02
# t1 = time.time()
# groups = df02.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus02.npy', edge_index_list_plus_nparr)
# t2 = time.time()
# t2-t1
groups = df02.groupby("cc_num")
edge_index_list_plus02 = np.load('edge_index_list_plus02.npy')  # the int64 array saved above
theta = edge_index_list_plus02[:,2].mean()
edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1) * np.exp(-edge_index_list_plus02[:,2]/theta)
edge_index_list_plus02
array([[  2881,   2881,      0],
       [  2881,   3061,      0],
       [  2881,   4867,      0],
       ...,
       [212771, 212765,      0],
       [212771, 212769,      0],
       [212771, 212771,      0]])
Because the array is still int64, the exponential weights (all below 1) are truncated to 0 when written back. Reload it as float64 and apply the transform again:
edge_index_list_plus02 = np.load('edge_index_list_plus02.npy').astype(np.float64)
theta = edge_index_list_plus02[:,2].mean()
edge_index_list_plus02[:,2] = (np.exp(-edge_index_list_plus02[:,2]/theta) != 1) * np.exp(-edge_index_list_plus02[:,2]/theta)
edge_index_list_plus02
array([[2.88100000e+03, 2.88100000e+03, 0.00000000e+00],
[2.88100000e+03, 3.06100000e+03, 1.96061280e-01],
[2.88100000e+03, 4.86700000e+03, 8.12918172e-01],
...,
[2.12771000e+05, 2.12765000e+05, 9.97708695e-01],
[2.12771000e+05, 2.12769000e+05, 9.99923197e-01],
[2.12771000e+05, 2.12771000e+05, 0.00000000e+00]])
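The third column is now an edge weight w = exp(-|Δt|/θ), with θ the mean time difference; the (… != 1) factor just zeroes the self-pairs, where Δt = 0 and exp(0) = 1. A small numeric check on the toy differences from above (a sketch; the toy θ is used only for illustration):
toy_dt = edge_index_list_plus_nparr[:, 2].astype(np.float64)     # toy |Δt| values
toy_theta = toy_dt.mean()
toy_w = (np.exp(-toy_dt/toy_theta) != 1) * np.exp(-toy_dt/toy_theta)
toy_w   # self-pairs map to 0; smaller |Δt| gives a weight closer to 1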
edge_index_list_plus02.shape
edge_index_list_updated = edge_index_list_plus02.tolist()
np.array(edge_index_list_updated)[:,2].mean()
mm = np.array(edge_index_list_updated)[:,2].mean()  # mean edge weight, used as the selection threshold
selected_edges = [(int(row[0]), int(row[1])) for row in edge_index_list_updated if row[2] > mm]
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 29970380])
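Only the pairs whose weight exceeds the mean weight mm are kept as graph edges. A quick sanity check of the retention rate, as a sketch:
kept = edge_index_selected.shape[1]        # 29,970,380 selected edges (from the shape above)
total = len(edge_index_list_updated)       # all candidate (i, j) pairs, including self-pairs
kept / total                               # fraction of candidate pairs kept as edges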
Train/test split
df02_tr, df02_test = sklearn.model_selection.train_test_split(df02, random_state=42)
df02_tr.shape, df02_test.shape
((160890, 23), (53630, 23))
N = len(df02)
train_mask = [i in df02_tr.index for i in range(N)]
test_mask = [i in df02_test.index for i in range(N)]
train_mask = np.array(train_mask)
test_mask = np.array(test_mask)
train_mask.shape, test_mask.shape
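Because df02 was reset_index-ed, its row labels are exactly 0..N-1, so the two list comprehensions can be replaced by a vectorized equivalent. A sketch:
# Same boolean masks built with np.isin instead of a Python loop.
train_mask = np.isin(np.arange(N), df02_tr.index)
test_mask = np.isin(np.arange(N), df02_test.index)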
data
x = torch.tensor(df02['amt'], dtype=torch.float).reshape(-1, 1)
y = torch.tensor(df02['is_fraud'], dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index=edge_index_selected, y=y, train_mask=train_mask, test_mask=test_mask)
Data(x=[214520, 1], edge_index=[2, 29970380], y=[214520], train_mask=[214520], test_mask=[214520])
torch.manual_seed(202250926)

class GCN(torch.nn.Module):
    # Two-layer GCN: one input feature (amt) -> 32 hidden units -> 2 classes
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
X = (data.x[data.train_mask]).numpy()
XX = (data.x[data.test_mask]).numpy()
y = (data.y[data.train_mask]).numpy()
yy = (data.y[data.test_mask]).numpy()
model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
model.eval()
pred = model(data).argmax(dim=1)
yyhat = pred[data.test_mask]
metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
_results2 = pd.DataFrame({m.__name__: [m(yy, yyhat).round(6)] for m in metrics}, index=['Analysis 2'])
_results2
            accuracy_score  precision_score  recall_score  f1_score
Analysis 2        0.973112          0.72695      0.130573  0.221382
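This table is the whole story behind the title: with only about 3% of df02 being fraud, a model that labels almost everything as non-fraud already reaches ~0.97 accuracy, while recall on the fraud class stays around 0.13 here and F1 collapses. A confusion matrix makes the imbalance explicit (a sketch):
cm = sklearn.metrics.confusion_matrix(yy, yyhat)
cm                        # rows = true class (0, 1), columns = predicted class
cm[1, 1] / cm[1].sum()    # recall on the fraud class, matching recall_score above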
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[data.test_mask] == data.y[data.test_mask]).sum()
acc = int(correct) / int(np.array(data.test_mask).sum())
print(f'Accuracy: {acc:.4f}')
predicted_labels = pred[data.test_mask]
true_labels = data.y[data.test_mask]
precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
Precision: 0.4854
Recall: 0.5000
F1 Score: 0.4926
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
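The UndefinedMetricWarning means that in this run no test sample was predicted as fraud, so precision on the fraud class is undefined and silently set to 0, which is also why the macro recall lands exactly at 0.5. The behavior can be made explicit with the zero_division parameter (a sketch):
# Report 0 for the undefined per-class precision without emitting the warning.
precision_score(true_labels, predicted_labels, average='macro', zero_division=0)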