imports
import pandas as pd
import numpy as np
import sklearn
import pickle
import time
import datetime
import warnings
import matplotlib.pyplot as plt   # used for the histograms below
import torch                      # used when building the graph tensors
import torch_geometric            # used for torch_geometric.data.Data
warnings.filterwarnings('ignore')
%run ../function_proposed_gcn.py
with open('../fraudTrain.pkl', 'rb') as file:
    fraudTrain = pickle.load(file)
fraudTrain.is_fraud.mean()
df50 = throw(fraudTrain, 0.5)  # helper from function_proposed_gcn.py: undersample to a 50% fraud rate
df_tr, df_tst = sklearn.model_selection.train_test_split(df50)
dfn = fraudTrain[::10]  # every 10th row of the full data, so this split keeps the natural (low) fraud rate
dfn = dfn.reset_index(drop=True)
df_trn, df_tstn = sklearn.model_selection.train_test_split(dfn)
df_tr.shape, df_tstn.shape
((9009, 22), (26215, 22))
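throw is loaded above via %run from function_proposed_gcn.py, so its body is not shown in this notebook. Judging by the fraud rates printed below (about 0.50 in df_tr versus well under 1% in the raw data), it appears to undersample the non-fraud class down to the requested ratio. A minimal sketch under that assumption (the name throw_sketch, the seed argument, and the sampling details are hypothetical):

# Hypothetical sketch only -- the real throw() lives in function_proposed_gcn.py.
# Keep every fraud row and sample just enough non-fraud rows to reach the target fraud ratio.
def throw_sketch(df, fraud_rate, seed=42):
    fraud = df[df.is_fraud == 1]
    normal = df[df.is_fraud == 0]
    # choose n_normal so that len(fraud) / (len(fraud) + n_normal) == fraud_rate
    n_normal = int(len(fraud) * (1 - fraud_rate) / fraud_rate)
    sampled = normal.sample(n=n_normal, random_state=seed)
    return pd.concat([fraud, sampled]).sample(frac=1, random_state=seed).reset_index(drop=True)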
df2, mask = concat(df_tr, df_tstn)  # helper from function_proposed_gcn.py: mask = (train_mask, test_mask)
df2['index'] = df2.index
df = df2.reset_index()
df.is_fraud.mean(), df_tr.is_fraud.mean(), df_tstn.is_fraud.mean()
(0.1329775153304565, 0.5034965034965035, 0.005645622735075339)
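concat also comes from function_proposed_gcn.py. Given how its output is used later (mask[0] as train_mask and mask[1] as test_mask on the combined frame), a plausible sketch is the following; the exact index handling of the real helper may differ:

# Hypothetical sketch only -- the real concat() lives in function_proposed_gcn.py.
# Stack the train and test frames and return boolean masks marking where each row came from.
def concat_sketch(df_train, df_test):
    combined = pd.concat([df_train, df_test]).reset_index(drop=True)
    train_mask = np.concatenate([np.full(len(df_train), True), np.full(len(df_test), False)])
    test_mask = ~train_mask
    return combined, (train_mask, test_mask)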
groups = df.groupby('cc_num')
# edge_index = np.array([item for sublist in (compute_time_difference(group) for _, group in groups) for item in sublist])
# edge_index = edge_index.astype(np.float64)
# np.save('edge_index_0215.npy', edge_index)
edge_index = np.load('edge_index_0215.npy')  # precomputed (node_i, node_j, |Δt|) triples for same-card pairs
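The commented-out cell shows how edge_index_0215.npy was produced: compute_time_difference (defined in function_proposed_gcn.py) runs on each cc_num group and the results are flattened into rows of (node_i, node_j, |Δt|). A hypothetical sketch of such a helper, assuming the node id is the 'index' column created above and the time source is the dataset's unix_time column (both assumptions about the real helper):

# Hypothetical sketch only -- the real compute_time_difference() lives in function_proposed_gcn.py.
# For every ordered pair of transactions on the same card, emit (node_i, node_j, |t_i - t_j|).
def compute_time_difference_sketch(group):
    ids = group['index'].to_numpy()
    times = group['unix_time'].to_numpy()   # assumption: the Kaggle fraudTrain unix_time column
    edges = []
    for a in range(len(ids)):
        for b in range(len(ids)):
            if a != b:
                edges.append((ids[a], ids[b], abs(int(times[a]) - int(times[b]))))
    return edges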
plt.hist(edge_index[:,2])
(array([396764., 293276., 254602., 221676., 185488., 142054., 107284.,
72978., 32646., 10480.]),
array([ 0., 3754074., 7508148., 11262222., 15016296., 18770370.,
22524444., 26278518., 30032592., 33786666., 37540740.]),
<BarContainer object of 10 artists>)
These are time differences, so smaller values mean the two transactions happened closer together in time.
theta = edge_index[:,2].mean()
plt.hist(-edge_index[:,2]/1e7)
(array([ 10480., 32646., 72978., 107284., 142054., 185488., 221676.,
254602., 293276., 396764.]),
array([-3.754074 , -3.3786666, -3.0032592, -2.6278518, -2.2524444,
-1.877037 , -1.5016296, -1.1262222, -0.7508148, -0.3754074,
-0. ]),
<BarContainer object of 10 artists>)
plt.hist(-edge_index[:,2]/theta)
(array([ 10480., 32646., 72978., 107284., 142054., 185488., 221676.,
254602., 293276., 396764.]),
array([-3.28599018, -2.95739116, -2.62879214, -2.30019313, -1.97159411,
-1.64299509, -1.31439607, -0.98579705, -0.65719804, -0.32859902,
-0. ]),
<BarContainer object of 10 artists>)
After negating, larger values correspond to transactions that are closer together in time.
np.percentile(-edge_index[:,2]/theta, 75), np.percentile(-edge_index[:,2]/theta, 80), np.percentile(-edge_index[:,2]/theta, 85), np.percentile(-edge_index[:,2]/theta, 90)
(-0.36472984220131965,
-0.27277732494466095,
-0.18372121876074105,
-0.09886661549285204)
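To make the target of these transformations explicit: with θ being the mean absolute time difference computed above, the cells below assign each same-card pair of transactions the time-decay weight w_ij = exp(-|t_i - t_j| / θ), which lies in (0, 1]. Pairs close in time get a weight near 1 and distant pairs decay toward 0; the (… != 1) factor used below simply zeroes the weights that are exactly 1, i.e. pairs with zero time difference.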
pd.DataFrame(-edge_index[:,2]/1e7).describe()
count    1.717248e+06
mean    -1.142448e+00
std      8.596337e-01
min     -3.754074e+00
25%     -1.739953e+00
50%     -9.940050e-01
75%     -4.166850e-01
max     -0.000000e+00
pd.DataFrame(-edge_index[:,2]/theta).describe()
count    1.717248e+06
mean    -1.000000e+00
std      7.524487e-01
min     -3.285990e+00
25%     -1.523004e+00
50%     -8.700656e-01
75%     -3.647298e-01
max     -0.000000e+00
plt.hist((np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta)).tolist())  # exp(-|Δt|/θ); the != 1 factor zeroes pairs with Δt == 0
(array([151332., 266952., 239110., 201528., 174204., 157072., 141056.,
125410., 117738., 142846.]),
array([0. , 0.09999947, 0.19999895, 0.29999842, 0.3999979 ,
0.49999737, 0.59999685, 0.69999632, 0.7999958 , 0.89999527,
0.99999475]),
<BarContainer object of 10 artists>)
pd.DataFrame((np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta)).tolist()).describe()
count    1.717248e+06
mean     4.457620e-01
std      2.803088e-01
min      0.000000e+00
25%      2.042338e-01
50%      3.998302e-01
75%      6.679522e-01
max      9.999947e-01
edge_index[:,2] = (np.exp(-edge_index[:,2]/theta) != 1)*(np.exp(-edge_index[:,2]/theta)).tolist()
edge_index = torch.tensor([(int(row[0]), int(row[1])) for row in edge_index if row[2] > 0.8], dtype=torch.long).t()  # keep only edges with weight > 0.8
x = torch.tensor(df['amt'].values, dtype=torch.float).reshape(-1, 1)
y = torch.tensor(df['is_fraud'].values, dtype=torch.int64)
data = torch_geometric.data.Data(x=x, edge_index=edge_index, y=y, train_mask=mask[0], test_mask=mask[1])
Data(x=[35224, 1], edge_index=[2, 260578], y=[35224], train_mask=[35224], test_mask=[35224])
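GCN1 is also defined in function_proposed_gcn.py, so its architecture is not shown in this notebook. For orientation, a minimal torch_geometric model of the same flavor could look like the sketch below; the layer sizes, dropout, and two-layer structure are assumptions, not the actual GCN1:

# Hypothetical sketch only -- the real GCN1 lives in function_proposed_gcn.py.
# One input feature (amt), one hidden GCNConv layer, two output classes.
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN1Sketch(torch.nn.Module):
    def __init__(self, hidden_channels=16):
        super().__init__()
        self.conv1 = GCNConv(1, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 2)

    def forward(self, x, edge_index):
        h = F.relu(self.conv1(x, edge_index))
        h = F.dropout(h, p=0.5, training=self.training)
        return self.conv2(h, edge_index)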
model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
yy = (data.y[data.test_mask]).numpy()
yyhat, yyhat_ = train_and_evaluate_model(data, model, optimizer)
yyhat_ = yyhat_.detach().numpy()
eval = evaluation(yy, yyhat, yyhat_)
{'acc': 0.8473393095555979,
'pre': 0.018009004502251125,
'rec': 0.4864864864864865,
'f1': 0.03473227206946455,
'auc': 0.7499672102762216}
?!?!? Recall and AUC look reasonable, but precision (0.018) and F1 (0.035) are far lower than expected.
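For reference, the metric names in the dictionary above line up with the standard sklearn metrics; a hedged sketch of what evaluation presumably wraps, assuming yyhat holds hard labels and yyhat_ the positive-class scores:

# Hypothetical sketch only -- the real evaluation() lives in function_proposed_gcn.py.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluation_sketch(y_true, y_pred, y_score):
    return {
        'acc': accuracy_score(y_true, y_pred),
        'pre': precision_score(y_true, y_pred),
        'rec': recall_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred),
        'auc': roc_auc_score(y_true, y_score),
    }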