import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
imports
def down_sample_textbook(df):
    """Return a class-balanced copy of *df* by down-sampling the majority class.

    Rows with ``is_fraud == 0`` are treated as the majority class and rows
    with ``is_fraud == 1`` as the minority class. The majority rows are
    sampled without replacement (fixed ``random_state=42``) down to the
    number of minority rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose an ``is_fraud`` column.

    Returns
    -------
    pandas.DataFrame
        All minority rows followed by the sampled majority rows; the
        original index labels are kept (no reset is done here).
    """
    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    # NOTE(review): with replace=False this presumably fails if there are
    # fewer majority rows than minority rows -- confirm against sklearn docs.
    df_maj_downsampled = sklearn.utils.resample(
        df_majority,
        n_samples=len(df_minority),
        replace=False,
        random_state=42,
    )
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled
def compute_time_difference(group):
    """Return pairwise absolute time gaps between all rows of *group*.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to compare; each row must have a ``trans_date_trans_time``
        entry exposing an integer ``.value`` (for pandas Timestamps this is
        the epoch time in nanoseconds).

    Returns
    -------
    list[list]
        One ``[label_i, label_j, abs(value_i - value_j)]`` entry for every
        ordered pair of rows (self-pairs included), in row-major order.
        The labels are the rows' index labels. An empty group yields ``[]``.
    """
    # Slice each row once; the pair loop below would otherwise re-slice the
    # frame on every one of the n*n iterations.
    rows = [group.iloc[k] for k in range(len(group))]
    labels = [row.name for row in rows]
    stamps = [row.trans_date_trans_time.value for row in rows]

    n = len(rows)
    return [
        [labels[i], labels[j], abs(stamps[i] - stamps[j])]
        for i in range(n)
        for j in range(n)
    ]
def mask(df):
    """Build boolean train/test masks from a fixed-seed train/test split.

    The frame is split with ``sklearn.model_selection.train_test_split``
    (``random_state=42``, default split sizes). Position ``i`` of each mask
    is True when the integer ``i`` appears among the index labels of the
    corresponding split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. Because positions ``0..len(df)-1`` are matched
        against index *labels*, the masks are only meaningful when the index
        is ``0..len(df)-1`` (the script calls ``reset_index()`` beforehand).

    Returns
    -------
    tuple[numpy.ndarray, numpy.ndarray]
        ``(train_mask, test_mask)``, each of length ``len(df)``.
    """
    df_tr, df_test = sklearn.model_selection.train_test_split(df, random_state=42)
    positions = range(len(df))
    train_mask = np.array([i in df_tr.index for i in positions])
    test_mask = np.array([i in df_test.index for i in positions])
    return train_mask, test_mask
def edge_index_selected(edge_index):
    """Keep the edges whose exponential-kernel weight is above the mean weight.

    Each row of *edge_index* is ``(source, target, distance)``. The distance
    is mapped to ``w = exp(-distance / theta)`` with ``theta`` the mean
    distance; weights equal to exactly 1 (e.g. distance 0) are set to 0.
    Rows with ``w`` strictly greater than the mean of all weights are kept.

    Unlike a direct in-place rewrite of column 2, this works on a float copy,
    so the caller's array is left untouched.

    Parameters
    ----------
    edge_index : array-like of shape (E, 3)
        Edge list with the distance in the third column.

    Returns
    -------
    torch.Tensor
        ``torch.long`` tensor of shape ``(2, K)`` holding the kept
        ``(source, target)`` pairs column-wise; ``K`` may be 0.
    """
    edges = np.array(edge_index, dtype=np.float64)  # private float copy
    distance = edges[:, 2]
    theta = distance.mean()

    weight = np.exp(-distance / theta)
    # A weight of exactly 1 carries no information here; drop it to 0 so it
    # can never pass the threshold below.
    weight = np.where(weight != 1, weight, 0.0)

    keep = weight > weight.mean()
    # astype truncates toward zero, matching int() on each endpoint.
    pairs = edges[keep, :2].astype(np.int64)
    return torch.tensor(pairs, dtype=torch.long).t()
# Load the raw transaction table from the remote CSV.
file_url = 'https://media.githubusercontent.com/media/musthave-ML10/data_source/main/fraud.csv'
fraudTrain = pd.read_csv(file_url)
# fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]

# Parse the transaction timestamps in one vectorized call instead of
# converting the values one by one.
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain
trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:18 | 2703186189652095 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:44 | 630423337322 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:51 | 38859492057661 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:16 | 3534093764340240 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:06 | 375534208663984 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1852389 | 2020-12-31 23:59:07 | 30560609640617 | fraud_Reilly and Sons | health_fitness | 43.77 | Michael | Olson | M | 558 Michael Estates | Luray | ... | 40.4931 | -91.8912 | 519 | Town planner | 1966-02-13 | 9b1f753c79894c9f4b71f04581835ada | 1388534347 | 39.946837 | -91.333331 | 0 |
1852390 | 2020-12-31 23:59:09 | 3556613125071656 | fraud_Hoppe-Parisian | kids_pets | 111.84 | Jose | Vasquez | M | 572 Davis Mountains | Lake Jackson | ... | 29.0393 | -95.4401 | 28739 | Futures trader | 1999-12-27 | 2090647dac2c89a1d86c514c427f5b91 | 1388534349 | 29.661049 | -96.186633 | 0 |
1852391 | 2020-12-31 23:59:15 | 6011724471098086 | fraud_Rau-Robel | kids_pets | 86.88 | Ann | Lawson | F | 144 Evans Islands Apt. 683 | Burbank | ... | 46.1966 | -118.9017 | 3684 | Musician | 1981-11-29 | 6c5b7c8add471975aa0fec023b2e8408 | 1388534355 | 46.658340 | -119.715054 | 0 |
1852392 | 2020-12-31 23:59:24 | 4079773899158 | fraud_Breitenberg LLC | travel | 7.99 | Eric | Preston | M | 7020 Doyle Stream Apt. 951 | Mesa | ... | 44.6255 | -116.4493 | 129 | Cartographer | 1965-12-15 | 14392d723bb7737606b2700ac791b7aa | 1388534364 | 44.470525 | -117.080888 | 0 |
1852393 | 2020-12-31 23:59:34 | 4170689372027579 | fraud_Dare-Marvin | entertainment | 38.13 | Samuel | Frey | M | 830 Myers Plaza Apt. 384 | Edmond | ... | 35.6665 | -97.4798 | 116001 | Media buyer | 1993-05-10 | 1765bb45b3aa3224b4cdcb6e7a96cee3 | 1388534374 | 36.210097 | -97.036372 | 0 |
1852394 rows × 22 columns
street/state/zip: 고객 거주지 정보
lat/long: 고객 주소에 대한 위도 및 경도
city_pop: 고객의 zipcode에 속하는 인구 수
job: 직업
dob: 생년월일
trans_num: 거래번호
unix_time: 거래 시간(유닉스 타임 스탬프 형식)
데이터정리
# Keep a 20% random sample of the non-fraud rows (fixed seed) plus every
# fraud row, then stack the two parts.
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(378200, 22)
# Share of fraud rows in df02; divide by the actual row count rather than a
# hard-coded length so the ratio stays right if the sample changes.
(df02['is_fraud'] == 1).sum() / len(df02)
0.025518244315177154
# Balance the two classes, then renumber the rows 0..N-1. reset_index()
# keeps the old labels as an extra "index" column, and mask() relies on the
# fresh 0..N-1 labels.
df50 = down_sample_textbook(df02)
df50 = df50.reset_index()
df50.shape
(19302, 23)
tr/test
# Preview the (train_mask, test_mask) pair produced for df50.
mask(df50)
(array([False, True, True, ..., True, False, True]),
array([ True, False, False, ..., False, True, False]))
# Boolean node masks for the graph model, plus the matching row split for
# the tabular baselines. Both use train_test_split with random_state=42.
train_mask, test_mask = mask(df50)
df50_tr, df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
edge_index 설정
# groups = df50.groupby('cc_num')
# edge_index_list_plus = [compute_time_difference(group) for _, group in groups]
# edge_index_list_plus_flat = [item for sublist in edge_index_list_plus for item in sublist]
# edge_index_list_plus_nparr = np.array(edge_index_list_plus_flat)
# np.save('edge_index_list_plus50.npy', edge_index_list_plus_nparr)
# Load the precomputed edge list and cast it to float64.
# NOTE(review): presumably rows are (source, target, time difference), as
# produced by the commented-out compute_time_difference pipeline -- confirm.
edge_index = np.load('edge_index_list_plus50_2.npy').astype(np.float64)
edge_index.shape
(429162, 3)
# Filter the edge list down to the selected edges.
# NOTE: this rebinds the name `edge_index_selected` from the function to its
# result, so the function cannot be called again afterwards in this session.
edge_index_selected = edge_index_selected(edge_index)
data설정(x, edge_index, y)
# Node features: the transaction amount as a single float column.
# Going through .to_numpy() hands torch a plain array, independent of the
# Series index labels.
x = torch.tensor(df50['amt'].to_numpy(), dtype=torch.float).reshape(-1, 1)
# Node labels: the is_fraud flag as int64 class ids.
y = torch.tensor(df50['is_fraud'].to_numpy(), dtype=torch.int64)
data = torch_geometric.data.Data(
    x=x,
    edge_index=edge_index_selected,
    y=y,
    train_mask=train_mask,
    test_mask=test_mask,
)
data
Data(x=[19302, 1], edge_index=[2, 195056], y=[19302], train_mask=[19302], test_mask=[19302])
분석 1(GCN)
# Fix torch's RNG seed before the model is built and trained.
torch.manual_seed(202250926)
class GCN1(torch.nn.Module):
    """Two-layer graph convolutional classifier (1 -> 32 -> 2)."""

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 32)
        self.conv2 = GCNConv(32, 2)

    def forward(self, data):
        """Return per-node log-probabilities over the two output classes.

        ``data`` must provide ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
# Train/test views of the node features and labels as NumPy arrays.
X = (data.x[data.train_mask]).numpy()
XX = (data.x[data.test_mask]).numpy()
y = (data.y[data.train_mask]).numpy()
yy = (data.y[data.test_mask]).numpy()

model = GCN1()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Full-batch training: every epoch runs the whole graph through the model,
# but the loss is computed on the training nodes only.
model.train()
for epoch in range(400):
    optimizer.zero_grad()
    out = model(data)
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
model.eval()

# Hard class predictions via argmax (the original note suggests trying an
# alternative to argmax).
pred = model(data).argmax(dim=1)
yyhat = pred[data.test_mask]

pred

metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
# Builtin round() works whether the metric returns a NumPy scalar or a
# plain Python float.
_results1 = pd.DataFrame({m.__name__: [round(m(yy, yyhat), 6)] for m in metrics}, index=['분석1'])
_results1
accuracy_score | precision_score | recall_score | f1_score | |
---|---|---|---|---|
분석1 | 0.860133 | 0.810655 | 0.937943 | 0.869666 |
분석2(로지스틱 회귀)
# Import the estimator explicitly: `import sklearn` alone does not guarantee
# that the `sklearn.linear_model` submodule has been loaded.
from sklearn.linear_model import LogisticRegression

# NOTE(review): this seeds torch only; presumably it has no effect on the
# scikit-learn estimator below -- kept so the session RNG state is unchanged.
torch.manual_seed(202250926)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = LogisticRegression()
lrnr.fit(X, y)
#thresh = y.mean()
#yyhat = (lrnr.predict_proba(XX)> thresh)[:,-1]
yyhat = lrnr.predict(XX)

yyhat

metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
# Builtin round() works whether the metric returns a NumPy scalar or a
# plain Python float.
_results2 = pd.DataFrame({m.__name__: [round(m(yy, yyhat), 6)] for m in metrics}, index=['분석2'])
_results2
accuracy_score | precision_score | recall_score | f1_score | |
---|---|---|---|---|
분석2 | 0.857853 | 0.951078 | 0.75302 | 0.840539 |
분석3(XGBoost)
import xgboost as xgb

# NOTE(review): this seeds torch only; presumably it has no effect on the
# XGBoost model below -- kept so the session RNG state is unchanged.
torch.manual_seed(202250926)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = xgb.XGBClassifier()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)

metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
# Builtin round() works whether the metric returns a NumPy scalar or a
# plain Python float.
_results3 = pd.DataFrame({m.__name__: [round(m(yy, yyhat), 6)] for m in metrics}, index=['분석3'])
_results3
accuracy_score | precision_score | recall_score | f1_score | |
---|---|---|---|---|
분석3 | 0.888313 | 0.896508 | 0.876718 | 0.886502 |
분석4(Light GBM)
import lightgbm as lgb

# NOTE(review): this seeds torch only; presumably it has no effect on the
# LightGBM model below -- kept so the session RNG state is unchanged.
torch.manual_seed(202250926)
X = np.array(df50_tr.loc[:, ['amt']])
XX = np.array(df50_test.loc[:, ['amt']])
y = np.array(df50_tr.is_fraud)
yy = np.array(df50_test.is_fraud)

lrnr = lgb.LGBMClassifier()
lrnr.fit(X, y)
yyhat = lrnr.predict(XX)

metrics = [sklearn.metrics.accuracy_score,
           sklearn.metrics.precision_score,
           sklearn.metrics.recall_score,
           sklearn.metrics.f1_score]
# Builtin round() works whether the metric returns a NumPy scalar or a
# plain Python float.
_results4 = pd.DataFrame({m.__name__: [round(m(yy, yyhat), 6)] for m in metrics}, index=['분석4'])
_results4
accuracy_score | precision_score | recall_score | f1_score | |
---|---|---|---|---|
분석4 | 0.887484 | 0.894312 | 0.877551 | 0.885852 |