[FRAUD] 데이터정리 시도(1, tr로만 96퍼 accuracy)

Author

김보람

Published

August 10, 2023

imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import torch

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics 

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
def build_graph_bipartite(df_input, graph_type=None):
    """Build a customer-merchant graph from a transaction table.

    Every distinct value of ``cc_num`` and ``merchant`` becomes one integer
    node. All rows sharing the same (cc_num, merchant) pair are collapsed
    into a single edge.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. It is copied, never modified.
        graph_type: Graph class or instance handed to NetworkX as
            ``create_using``. ``None`` (default) creates a fresh
            ``nx.Graph`` on every call.

    Returns:
        The graph; each edge carries ``label`` (1 if the summed ``is_fraud``
        of the pair is positive, else 0) and ``weight`` (summed ``amt``).
    """
    # A default of nx.Graph() in the signature would be one shared instance:
    # NetworkX clears and reuses an instance passed as create_using, so a
    # second call would wipe the graph returned by the first one.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    # dict.fromkeys de-duplicates like set() but keeps first-seen order, so
    # the node ids are reproducible from run to run.
    node_labels = dict.fromkeys(df["cc_num"].tolist() + df["merchant"].tolist())
    mapping = {label: node_id for node_id, label in enumerate(node_labels)}

    df["from"] = df["cc_num"].map(mapping)   # edge start: the card
    df["to"] = df["merchant"].map(mapping)   # edge end: the merchant

    edges = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    # An edge is fraudulent as soon as one of its transactions is.
    edges["is_fraud"] = (edges["is_fraud"] > 0).astype(int)

    pairs = list(zip(edges["from"].tolist(), edges["to"].tolist()))
    G = nx.from_edgelist(pairs, create_using=graph_type)

    # Column-wise zip replaces two row-by-row iterrows() passes.
    nx.set_edge_attributes(G, dict(zip(pairs, edges["is_fraud"].tolist())), "label")
    nx.set_edge_attributes(G, dict(zip(pairs, edges["amt"].tolist())), "weight")

    return G


def build_graph_tripartite(df_input, graph_type=None):
    """Build a card / transaction / merchant graph from a transaction table.

    Every row (identified by its index label) becomes a transaction node that
    is linked to the node of its ``cc_num`` and to the node of its
    ``merchant``.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. It is copied, never modified.
        graph_type: Graph class or instance handed to NetworkX as
            ``create_using``. ``None`` (default) creates a fresh
            ``nx.Graph`` on every call.

    Returns:
        The graph; both edges of a transaction carry ``label`` (the row's
        ``is_fraud``) and ``weight`` (the row's ``amt``).
    """
    # A default of nx.Graph() in the signature would be one shared instance:
    # NetworkX clears and reuses an instance passed as create_using, so a
    # second call would wipe the graph returned by the first one.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    # NOTE(review): index labels, card numbers and merchant names share one
    # id space, so an index label equal to a cc_num value would be merged
    # into a single node - confirm this cannot happen for the data used.
    node_labels = dict.fromkeys(
        df.index.tolist() + df["cc_num"].tolist() + df["merchant"].tolist()
    )
    mapping = {label: node_id for node_id, label in enumerate(node_labels)}

    df["in_node"] = df["cc_num"].map(mapping)
    df["out_node"] = df["merchant"].map(mapping)
    txn_nodes = [mapping[idx] for idx in df.index]

    # Built once and reused, instead of six separate iterrows() passes.
    card_edges = list(zip(df["in_node"].tolist(), txn_nodes))
    merchant_edges = list(zip(df["out_node"].tolist(), txn_nodes))
    all_edges = card_edges + merchant_edges

    G = nx.from_edgelist(all_edges, create_using=graph_type)

    fraud_flags = df["is_fraud"].tolist()
    amounts = df["amt"].tolist()
    nx.set_edge_attributes(G, dict(zip(all_edges, fraud_flags + fraud_flags)), "label")
    nx.set_edge_attributes(G, dict(zip(all_edges, amounts + amounts)), "weight")

    return G
    
    
def down_sample_textbook(df):
    """Balance the classes by under-sampling the non-fraud rows.

    Args:
        df: DataFrame with an ``is_fraud`` column holding 0 / 1.

    Returns:
        All fraud rows followed by an equally sized sample of non-fraud
        rows, drawn without replacement with a fixed seed (42).
    """
    fraud_rows = df[df.is_fraud == 1].copy()
    normal_rows = df[df.is_fraud == 0].copy()
    sampled_normal = sklearn.utils.resample(
        normal_rows,
        replace=False,
        n_samples=len(fraud_rows),
        random_state=42,
    )
    return pd.concat([fraud_rows, sampled_normal])

def embedding(Graph):
    """Turn every edge of ``Graph`` into a feature vector and a label.

    Args:
        Graph: NetworkX graph whose edges carry a ``label`` attribute and a
            ``weight`` attribute (the latter is passed to Node2Vec as
            ``weight_key``).

    Returns:
        ``(X, y)``: ``X`` is a list with one embedding per edge, in the
        order of ``Graph.edges``; ``y`` is an array of the ``label``
        attribute values.
    """
    edge_list = list(Graph.edges)

    # Work on a copy that holds all edges, then re-add any node that has no
    # edge so the copy has the same node set as the input graph.
    walk_graph = Graph.edge_subgraph(list(edge_list)).copy()
    missing_nodes = set(Graph.nodes) - set(walk_graph.nodes)
    walk_graph.add_nodes_from(list(missing_nodes))

    node_vectors = Node2Vec(walk_graph, weight_key='weight').fit(window=10).wv
    embedded = AverageEmbedder(node_vectors)

    # The embedder is looked up by the string form of both end points.
    X = [embedded[str(edge[0]), str(edge[1])] for edge in edge_list]

    edge_labels = nx.get_edge_attributes(Graph, "label")
    y = np.array(list(edge_labels.values()))
    return X, y

def anal(df):
    """Fit a random forest on edge embeddings and report test metrics.

    The transactions are turned into a bipartite graph, every edge is
    embedded, and the edges are split into a training and a held-out part.

    Args:
        df: Transaction DataFrame accepted by ``build_graph_bipartite``.

    Returns:
        One-row DataFrame with the columns ``acc``, ``pre``, ``rec`` and
        ``f1`` computed on the held-out edges.
    """
    graph = build_graph_bipartite(df)
    features, labels = embedding(graph)

    # embedding() returns only (X, y). The split is done here so the scores
    # are not computed on the edges the model was fitted on.
    X, XX, y, yy = sklearn.model_selection.train_test_split(
        features, labels, random_state=42
    )

    # Fully qualified: the file imports the ``ensemble`` module, not the
    # RandomForestClassifier name itself.
    lrnr = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)

    return pd.DataFrame({
        'acc': [sklearn.metrics.accuracy_score(yy, yyhat)],
        'pre': [sklearn.metrics.precision_score(yy, yyhat)],
        'rec': [sklearn.metrics.recall_score(yy, yyhat)],
        'f1': [sklearn.metrics.f1_score(yy, yyhat)],
    })

def our_sampling1(df):
    """Keep every transaction of the customers that have at least one fraud.

    Args:
        df: DataFrame with ``cc_num`` and ``is_fraud`` columns.

    Returns:
        The rows of ``df`` whose ``cc_num`` appears on some row with
        ``is_fraud == 1``, in their original order.
    """
    fraud_customers = set(df.loc[df["is_fraud"] == 1, "cc_num"].tolist())
    belongs_to_fraud_customer = df["cc_num"].isin(fraud_customers)
    return df[belongs_to_fraud_customer]

- 모든엣지를 고려하는 방법

N = 10
# Every ordered node pair (i, j), self pairs included, as a 2 x N*N tensor:
# row 0 holds i, row 1 holds j.
edge_index = torch.cartesian_prod(torch.arange(N), torch.arange(N)).T
# edge_attr = the weights of the graph
edge_index
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
         7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9,
         9, 9, 9, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3,
         4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7,
         8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1,
         2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
         6, 7, 8, 9]])
# Load the transactions; .iloc[:, 1:] discards the first column of the CSV.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:,1:]
# Parse the whole column in one vectorised call instead of calling
# pd.to_datetime once per row through map().
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

- 시간 차이 계산하려면?

# Subtracting two timestamps of the column gives the elapsed time between
# the rows labelled 0 and 101.
diff = fraudTrain.trans_date_trans_time[101]-fraudTrain.trans_date_trans_time[0]
diff
Timedelta('0 days 01:17:00')
diff.total_seconds()
4620.0

- 적당한 theta값을 정하자.

theta = 86400*1.2
theta
103680.0
# 86400 is the number of seconds in a day, so theta is 1.2 days in seconds.
theta = 86400*1.2
# Exponential decay of the time gap: equals 1 for a zero gap and shrinks
# towards 0 as the gap grows relative to theta.
np.exp(-diff.total_seconds()/theta)
0.9564180361647693

시도

# Vectorised conversion of the whole column (one call instead of one
# pd.to_datetime call per row).
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

N = len(fraudTrain)
N
1048575
  • df02를 이용해서 해보자.
# df02: a reproducible 20% sample of the non-fraud rows, followed by every
# fraud row.
_df1 = fraudTrain.loc[fraudTrain["is_fraud"] == 0].sample(random_state=42, frac=0.20)
_df2 = fraudTrain.loc[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
214520*214520
46018830400
# N = len(df02)
# edge_index = torch.tensor([[i,j] for i in range(N) for j in range(N)]).T
  • df50
# df50: df02 under-sampled so that fraud and non-fraud rows are equally many.
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
12012*12012
144288144

고려할 것(230810)

  • df50 의 shape이 12000개 이므로 9000개의 T, 3000개의 F를 train mask로 만들자.

  • 고객정보가 동일하면 edge를 1로, 아니면 0으로 놓고 1에대한 weight를 만들자.

  • g(V,E,W)에서의 weight

# reset_index() relabels the rows 0..len-1 and keeps the previous labels in
# a new "index" column.
df50 = df50.reset_index()
# Split with the default train/test proportions and a fixed seed.
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
# Fraud share of each part, to check that both stay close to 50%.
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)
(0.49828, 0.50516)

고려할 것(230810)2

  • 현재 df50의 fraud 비율은 5:5 인데, 다른 비율을 가진 데이터로도 해보자

  • GNN으로 돌려본 것과 다른 방법들과 비교를 해보자

  • undersampling한 다른 데이터들과 비교해 볼 수 있을 듯(boost, logis, …)

  • 9000/3000 데이터를 통해 합성 데이터를 만드는데, 12000개를 그대로 만드는 방법, 고객별로(cc_num) 합성 데이터를 만드는 방법, 똑같은 cc_num로 특이한 데이터가 있다면 normal데이터와 특이 데이터를 생각해서 돌리는 방법 등을 고려하자.

df50_tr.shape, df50_test.shape
((9009, 23), (3003, 23))
N = len(df50_tr)
# Relabel the rows 0..N-1 (the previous labels are kept as a column).
df50_tr = df50_tr.reset_index()

# Signed time difference in seconds for every ordered row pair (i, j),
# computed with NumPy broadcasting instead of an N*N Python loop over
# pandas scalars. Entry [i, j] is time[i] - time[j].
_times = df50_tr['trans_date_trans_time'].to_numpy()
_diff_seconds = (_times[:, None] - _times[None, :]) / np.timedelta64(1, 's')

# Same layout as before: one row [i, j, time_difference] per pair, with i
# varying slowest.
_node_ids = np.arange(N)
edge_index = np.column_stack([
    np.repeat(_node_ids, N),
    np.tile(_node_ids, N),
    _diff_seconds.ravel(),
])
edge_index.shape
(81162081, 3)
# Only the size of the time gap matters, not its sign.
edge_index[:,2] = np.abs(edge_index[:,2])
# Use the mean absolute gap over all pairs as the decay scale.
theta = edge_index[:,2].mean()
theta
12230796.273867842
# Exponential time-decay weight, evaluated once instead of twice.
_decay = np.exp(-edge_index[:,2]/theta)
# A weight of exactly 1 means a zero time gap (this includes every i == j
# pair); those pairs get weight 0.
edge_index[:,2] = np.where(_decay != 1, _decay, 0.0)
edge_index
array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 1.90157975e-01],
       [0.00000000e+00, 2.00000000e+00, 9.79646259e-02],
       ...,
       [9.00800000e+03, 9.00600000e+03, 6.60662164e-01],
       [9.00800000e+03, 9.00700000e+03, 1.49150646e-01],
       [9.00800000e+03, 9.00800000e+03, 0.00000000e+00]])
eee = edge_index[:,:]
eee[:,1]
array([0.000e+00, 1.000e+00, 2.000e+00, ..., 9.006e+03, 9.007e+03,
       9.008e+03])
edge_index_list_updated = edge_index.tolist()
edge_index_list_updated[:5]
[[0.0, 0.0, 0.0],
 [0.0, 1.0, 0.19015797528259762],
 [0.0, 2.0, 0.09796462590589798],
 [0.0, 3.0, 0.1424157407389685],
 [0.0, 4.0, 0.11107338192969567]]
df50_tr
level_0 index trans_date_trans_time cc_num merchant category amt first last gender ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 476 51331 2019-01-31 00:44:00 3.543590e+15 fraud_Medhurst PLC shopping_net 921.24 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 c8928ba53be26fdd997b26f7130c757e 1327970678 40.064488 -78.210499 1
1 3671 625691 2019-09-23 00:09:00 2.610530e+15 fraud_Torphy-Goyette shopping_pos 698.28 Tanya Dickerson F ... 36.2416 -86.6117 22191 Prison officer 1994-07-27 90453290b765904ed1c3426882a6788b 1348358993 35.884288 -87.513318 1
2 6641 896244 2019-12-25 21:30:00 6.011330e+15 fraud_Monahan-Morar personal_care 220.56 Lauren Butler F ... 36.0557 -96.0602 413574 Teacher, special educational needs 1971-09-01 4072a3effcf51cf7cf88f69d00642cd9 1356471044 35.789798 -95.859736 0
3 4288 717690 2019-11-02 22:22:00 6.011380e+15 fraud_Daugherty, Pouros and Beahan shopping_pos 905.43 Martin Duarte M ... 44.6001 -84.2931 864 General practice doctor 1942-05-04 f2fa1b25eef2f43fa5c09e3e1bfe7f77 1351894926 44.652759 -84.500359 1
4 4770 815813 2019-12-08 02:50:00 4.430880e+15 fraud_Hudson-Ratke grocery_pos 307.98 Alicia Morales F ... 39.3199 -106.6596 61 Public relations account executive 1939-11-04 f06eff8da349e36e623cff026de8e970 1354935056 38.389399 -106.111026 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9004 11964 177703 2019-04-02 21:48:00 3.572980e+15 fraud_Ziemann-Waters health_fitness 63.89 William Lopez M ... 41.1832 -96.9882 614 Associate Professor 1967-06-20 5b19aad28d65a6b0a912fa7b9d1896de 1333403300 42.067169 -96.876892 0
9005 5191 921796 2019-12-30 23:29:00 6.762920e+11 fraud_Wiza, Schaden and Stark misc_pos 51.41 Lisa Fitzpatrick F ... 41.2336 -75.2389 104 Financial trader 1927-08-25 b2a9e44026fc57e54b4e45ade6017668 1356910178 40.502189 -74.814956 1
9006 5390 950365 2020-01-16 03:15:00 4.807550e+12 fraud_Murray-Smitham grocery_pos 357.62 Kimberly Castro F ... 40.2158 -83.9579 133 Professor Emeritus 1954-01-29 4bfa37c329f327074e7220ea6e5d8f8d 1358306148 40.620284 -84.274495 1
9007 860 88685 2019-02-22 02:19:00 5.738600e+11 fraud_McDermott-Weimann grocery_pos 304.75 Cristian Jones M ... 42.0765 -87.7246 27020 Trade mark attorney 1986-07-23 a1c3025ddb615ab2ef890bf82fc3d66a 1329877195 42.722479 -88.362364 1
9008 7270 753787 2019-11-18 10:58:00 6.042293e+10 fraud_Terry, Johns and Bins misc_pos 1.64 Jeffrey Powers M ... 33.6028 -81.9748 46944 Secondary school teacher 1942-04-02 ee10d61782bde2b5cabc2ad649e977cc 1353236287 34.243599 -82.971344 0

9009 rows × 24 columns

  • cc_num로 그룹별로 묶자.
df50_tr[df50_tr['cc_num']==3.543590e+15]
level_0 index trans_date_trans_time cc_num merchant category amt first last gender ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 476 51331 2019-01-31 00:44:00 3.543590e+15 fraud_Medhurst PLC shopping_net 921.24 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 c8928ba53be26fdd997b26f7130c757e 1327970678 40.064488 -78.210499 1
344 462 50905 2019-01-30 16:53:00 3.543590e+15 fraud_Lesch Ltd shopping_pos 881.11 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 9f7b7675c4decefd03cce56df045ed1c 1327942400 39.591484 -79.575246 1
1377 6607 814736 2019-12-07 22:17:00 3.543590e+15 fraud_Botsford and Sons home 10.41 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 aa9b533e84970309a4ad60a914a8cd77 1354918668 41.287791 -79.980592 0
1447 485 51816 2019-01-31 12:38:00 3.543590e+15 fraud_Ruecker-Mayert kids_pets 21.93 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 cec656f154e0978b0f26702c29ddeeca 1328013517 39.946187 -78.078864 1
1639 11176 12947 2019-01-08 11:08:00 3.543590e+15 fraud_Stroman, Hudson and Erdman gas_transport 76.03 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 c10451edc4e21b865d049312acf18ecd 1326020892 39.503960 -78.471680 0
2046 8124 627045 2019-09-23 12:53:00 3.543590e+15 fraud_Botsford Ltd shopping_pos 3.20 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 003d591d208f7ee52277b5cc4fa4a37f 1348404838 40.066686 -79.326630 0
2093 477 51367 2019-01-31 01:36:00 3.543590e+15 fraud_Watsica, Haag and Considine shopping_pos 1090.67 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 b42bc0820a78de54845c5138b9c39dd5 1327973774 40.923284 -78.882504 1
2415 491 52402 2019-01-31 22:17:00 3.543590e+15 fraud_Metz, Russel and Metz kids_pets 22.35 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 51f9352216e99bbe9e8b03b082305971 1328048275 39.979547 -78.851379 1
2625 463 51047 2019-01-30 19:35:00 3.543590e+15 fraud_Ruecker-Mayert kids_pets 22.95 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 8e804422b761537e3a49a237afd1ea9a 1327952100 40.051981 -79.021769 1
2769 478 51374 2019-01-31 01:42:00 3.543590e+15 fraud_Schmidt and Sons shopping_net 1043.59 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 bbe4e9e431cba66e6531199ffaf79657 1327974178 40.192896 -79.366393 1
3192 505 52522 2019-01-31 23:57:00 3.543590e+15 fraud_Kutch, Steuber and Gerhold food_dining 116.45 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 fcf46ca0264437bbb938c29eca2c92ad 1328054256 40.288401 -78.286914 1
3670 11714 1010269 2020-02-20 06:02:00 3.543590e+15 fraud_Huels-Hahn gas_transport 51.80 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 b72c0124f4c5662db13e1bea2f04784b 1361340164 39.672719 -79.642589 0
3945 6087 243892 2019-05-02 13:38:00 3.543590e+15 fraud_Cruickshank-Mills entertainment 5.72 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 990da059d387e5fa7481d76ff5c29199 1335965925 40.577553 -79.315460 0
5017 484 51431 2019-01-31 03:28:00 3.543590e+15 fraud_Cremin, Hamill and Reichel misc_pos 741.98 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 41312d7d5fc76be3782b5e9cef04726f 1327980509 41.290570 -79.682069 1
5505 8148 181398 2019-04-04 23:32:00 3.543590e+15 fraud_Feil, Hilpert and Koss food_dining 89.23 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 e304fd4ebc897fce190925dadcd2b524 1333582347 39.736380 -79.481667 0
5729 11116 329202 2019-06-06 03:26:00 3.543590e+15 fraud_Connelly, Reichert and Fritsch gas_transport 69.36 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 bc0832ac8bac6d26548ab6ab553d5d5e 1338953171 40.780469 -79.668417 0
7605 481 51392 2019-01-31 02:16:00 3.543590e+15 fraud_Huels-Hahn gas_transport 12.41 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 5f4379c2fc20457f0f99a126cadda1af 1327976216 39.884234 -79.374966 1
7800 8609 55920 2019-02-03 06:51:00 3.543590e+15 fraud_Corwin-Gorczany misc_net 6.70 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 90f33381b6b6644c6d03c8cdb51d05dc 1328251865 40.064532 -78.920283 0
8100 10488 509733 2019-08-09 11:47:00 3.543590e+15 fraud_Kutch and Sons grocery_pos 108.74 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 40a620cc7c5ba396b1fe112f5361e4a9 1344512838 40.057443 -78.569798 0
8313 504 52514 2019-01-31 23:52:00 3.543590e+15 fraud_Douglas, Schneider and Turner shopping_pos 1129.56 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 ec208107f178422e0953560343d0cf8b 1328053975 40.840340 -78.027854 1

20 rows × 24 columns

df50_grouped=df50_tr.groupby(by='cc_num')

# Pairwise time difference in seconds, kept only for pairs of rows with the
# same cc_num; pairs of different customers get 0. Done with NumPy
# broadcasting instead of an N*N Python loop over pandas scalars.
_cc = df50_tr['cc_num'].to_numpy()
_times = df50_tr['trans_date_trans_time'].to_numpy()
_same_customer = _cc[:, None] == _cc[None, :]
_diff_seconds = np.where(
    _same_customer,
    (_times[:, None] - _times[None, :]) / np.timedelta64(1, 's'),
    0.0,
)

# One row [i, j, time_difference] per ordered pair, with i varying slowest.
_node_ids = np.arange(N)
edge_index = np.column_stack([
    np.repeat(_node_ids, N),
    np.tile(_node_ids, N),
    _diff_seconds.ravel(),
])
edge_index.shape
(81162081, 3)
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00],
       [0.000e+00, 2.000e+00, 0.000e+00],
       ...,
       [9.008e+03, 9.006e+03, 0.000e+00],
       [9.008e+03, 9.007e+03, 0.000e+00],
       [9.008e+03, 9.008e+03, 0.000e+00]])
# Only the size of the time gap matters, not its sign.
edge_index[:,2] = np.abs(edge_index[:,2])
# Mean absolute gap over all pairs (pairs set to 0 are included in the mean).
theta = edge_index[:,2].mean()
theta
10988.585252761077
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00],
       [0.000e+00, 2.000e+00, 0.000e+00],
       ...,
       [9.008e+03, 9.006e+03, 0.000e+00],
       [9.008e+03, 9.007e+03, 0.000e+00],
       [9.008e+03, 9.008e+03, 0.000e+00]])
# Exponential time-decay weight, evaluated once instead of twice.
_decay = np.exp(-edge_index[:,2]/theta)
# A weight of exactly 1 comes from a gap of 0, i.e. from pairs of different
# customers, from i == j and from identical timestamps; all of them get
# weight 0.
edge_index[:,2] = np.where(_decay != 1, _decay, 0.0)
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00],
       [0.000e+00, 2.000e+00, 0.000e+00],
       ...,
       [9.008e+03, 9.006e+03, 0.000e+00],
       [9.008e+03, 9.007e+03, 0.000e+00],
       [9.008e+03, 9.008e+03, 0.000e+00]])
edge_index_list_updated = edge_index.tolist()
# Mean weight, read straight from the array rather than converting the
# Python list back into an array first.
edge_index[:,2].mean()
8.344409093328692e-05
# Mean weight over all pairs, used as the edge selection threshold. Taken
# from the array directly; rebuilding it from the Python list is not needed.
mm = edge_index[:,2].mean()

edge_index_list_updated의 세 번째 열이 가중치 w이다.

# Keep only the pairs whose weight is above the mean weight mm. The mask is
# applied to the array, so Python only iterates over the kept rows instead
# of all N*N rows.
_keep = edge_index[:, 2] > mm
selected_edges = [(int(i), int(j)) for i, j in edge_index[_keep, :2]]
# Transpose to the (2, num_edges) layout.
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 28472])
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00],
       [0.000e+00, 2.000e+00, 0.000e+00],
       ...,
       [9.008e+03, 9.006e+03, 0.000e+00],
       [9.008e+03, 9.007e+03, 0.000e+00],
       [9.008e+03, 9.008e+03, 0.000e+00]])

np.save('edge_index.npy', edge_index)

  • edge_index 돌아가는 게 너무 오래걸려서 이렇게 저장해놓으면 빠르게 실행할 수 있다.
#import numpy as np

#data = np.array([1, 2, 3, 4, 5])
# Write the pair array to disk so the slow construction does not have to be
# repeated.
np.save('edge_index.npy', edge_index)

# Read it back from the same file.
loaded_data = np.load('edge_index.npy')
  • .npy로 끝나는 파일은 위처럼 np.save로 저장한다. 아니면 pickle이나 torch의 저장 방법을 쓸 수도 있다.
x = df50_tr['amt']
x
0       921.24
1       698.28
2       220.56
3       905.43
4       307.98
         ...  
9004     63.89
9005     51.41
9006    357.62
9007    304.75
9008      1.64
Name: amt, Length: 9009, dtype: float64
# Go through NumPy so the values are taken positionally and in one block,
# rather than handing the pandas Series itself to torch.
a = torch.tensor(x.to_numpy(), dtype=torch.float)
# One feature per node: shape (num_nodes, 1).
a = a.reshape(-1,1)
a
tensor([[921.2400],
        [698.2800],
        [220.5600],
        ...,
        [357.6200],
        [304.7500],
        [  1.6400]])
y = df50_tr['is_fraud']
# Go through NumPy so the labels are taken positionally and in one block,
# rather than handing the pandas Series itself to torch.
b = torch.tensor(y.to_numpy(), dtype=torch.int64)
b
tensor([1, 1, 0,  ..., 1, 1, 0])
import torch_geometric
# Bundle node features (amt), the selected edges and node labels (is_fraud)
# into one graph object.
data = torch_geometric.data.Data(x=a, edge_index = edge_index_selected, y=b)
data
Data(x=[9009, 1], edge_index=[2, 28472], y=[9009])

- pyg lesson6

gconv = torch_geometric.nn.GCNConv(1,4)
gconv
GCNConv(1, 4)
gconv(data.x, data.edge_index)
tensor([[ 4.0225e+02,  2.5312e+02, -2.9747e+02, -1.6831e+02],
        [ 3.7246e+02,  2.3437e+02, -2.7543e+02, -1.5584e+02],
        [ 1.5695e+02,  9.8760e+01, -1.1606e+02, -6.5670e+01],
        ...,
        [ 2.5448e+02,  1.6013e+02, -1.8818e+02, -1.0648e+02],
        [ 5.4738e+02,  3.4444e+02, -4.0478e+02, -2.2903e+02],
        [ 1.1670e+00,  7.3434e-01, -8.6299e-01, -4.8830e-01]],
       grad_fn=<AddBackward0>)
list(gconv.parameters())
[Parameter containing:
 tensor([0., 0., 0., 0.], requires_grad=True),
 Parameter containing:
 tensor([[ 0.7116],
         [ 0.4478],
         [-0.5262],
         [-0.2977]], requires_grad=True)]
_,W = list(gconv.parameters())
W
Parameter containing:
tensor([[-0.6724],
        [ 0.7172],
        [-0.3185],
        [ 0.5363]], requires_grad=True)

- pyg lesson5

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: Number of input features per node (default 1).
        hidden_channels: Width of the hidden layer (default 16).
        num_classes: Number of output classes (default 2).

    The defaults reproduce the previously hard-coded 1 -> 16 -> 2 layout, so
    ``GCN()`` builds the same model as before.
    """

    def __init__(self, in_channels=1, hidden_channels=16, num_classes=2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-node log-probabilities over the classes.

        Args:
            data: Object exposing ``x`` (node features) and ``edge_index``.

        Returns:
            Tensor of log-softmax scores taken over dimension 1.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is applied only while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
model = GCN()
model
GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 2)
)
out
tensor([[-1.8963e+02,  0.0000e+00],
        [-1.5192e+02,  0.0000e+00],
        [-5.3630e+01,  0.0000e+00],
        ...,
        [-3.0590e+02,  0.0000e+00],
        [-3.0298e+02,  0.0000e+00],
        [-1.3924e+00, -2.8567e-01]], grad_fn=<LogSoftmaxBackward0>)
data.y
tensor([1, 1, 0,  ..., 1, 1, 0])
# Train for 200 epochs on all nodes of the graph (no train mask is applied).
for epoch in range(200):
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()
    out = model(data)
    # The model outputs log-probabilities, hence the negative log-likelihood.
    loss = F.nll_loss(out, data.y)
    loss.backward()
    optimizer.step()
model.eval()
pred = model(data).argmax(dim=1)
# NOTE: accuracy should be measured on a test split; here it is computed on
# the same nodes the model was trained on.
correct = (pred == data.y).sum()
# Divide by the actual number of labels instead of a hard-coded 9009.
acc = int(correct) / data.y.numel()
print(f'Accuracy: {acc:.4f}')
Accuracy: 0.9633
# Recall: share of the true fraud nodes that the model predicts as fraud.
fraud_mask = (data.y == 1)
model.eval()
pred = model(data).argmax(dim=1)
correct = (pred[fraud_mask] == data.y[fraud_mask]).sum() # should be measured on a test split; here it uses the training nodes
# NOTE: the name "acc" is reused here, but the value is the recall.
acc = int(correct) / int(fraud_mask.sum())
print(f'recall: {acc:.4f}')
recall: 0.9619
  • 위의 recall은 test가 없어서 train으로만 했던 거..!