[FRAUD] 그래프자료로 데이터정리

Author

김보람

Published

August 10, 2023

imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import torch

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics 

# embedding 
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
def build_graph_bipartite(df_input, graph_type=None):
    """Build a bipartite card-merchant graph from a transaction table.

    Every distinct ``cc_num`` and ``merchant`` value becomes one node,
    relabelled to a consecutive integer id.  All transactions between the
    same card and the same merchant are collapsed into a single edge with:

    * ``label``  -- 1 if at least one collapsed transaction is fraudulent,
      otherwise 0;
    * ``weight`` -- the sum of ``amt`` over the collapsed transactions.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns ``cc_num``, ``merchant``, ``amt`` and
        ``is_fraud``.  The frame is copied and never modified.
    graph_type : networkx graph class or instance, optional
        Forwarded to ``create_using``.  When omitted a fresh ``nx.Graph`` is
        created on every call.  (The former default ``nx.Graph()`` was
        evaluated once at definition time; networkx clears and reuses a
        graph *instance* passed as ``create_using``, so a second call
        silently overwrote the graph returned by the first.)

    Returns
    -------
    networkx graph
        The graph with ``label`` and ``weight`` edge attributes set.
    """
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()
    # Cards and merchants share one id space: node_id is the position of the
    # value in the set of all card numbers and merchant names.
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()
                                                          + df["merchant"].values.tolist()))}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source (card)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target (merchant)

    # One row per (card, merchant) pair.
    df = (df[["from", "to", "amt", "is_fraud"]]
          .groupby(["from", "to"])
          .agg({"is_fraud": "sum", "amt": "sum"})
          .reset_index())
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # Column-wise zip instead of DataFrame.iterrows(): no per-row Series
    # construction and no int -> float upcast of the node ids.
    edges = list(zip(df["from"].tolist(), df["to"].tolist()))
    nx.set_edge_attributes(G, dict(zip(edges, df["is_fraud"].tolist())), "label")   # fraud flag per edge
    nx.set_edge_attributes(G, dict(zip(edges, df["amt"].tolist())), "weight")       # total amount per edge

    return G


def build_graph_tripartite(df_input, graph_type=None):
    """Build a tripartite card-transaction-merchant graph.

    Each row of the table becomes a transaction node (identified by its
    index label) connected to its card node and to its merchant node.  Both
    edges of a transaction carry the same attributes:

    * ``label``  -- the row's ``is_fraud`` value;
    * ``weight`` -- the row's ``amt`` value.

    Index labels, card numbers and merchant names are mapped through one
    shared dictionary, so an index label equal to a card number or merchant
    name would be merged into a single node.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns ``cc_num``, ``merchant``, ``amt`` and
        ``is_fraud``.  The frame is copied and never modified.
    graph_type : networkx graph class or instance, optional
        Forwarded to ``create_using``.  When omitted a fresh ``nx.Graph`` is
        created on every call.  (The former default ``nx.Graph()`` was a
        single shared instance that networkx clears and reuses, so repeated
        calls returned the same, overwritten, graph object.)

    Returns
    -------
    networkx graph
        The graph with ``label`` and ``weight`` edge attributes set.
    """
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist()
                                                          + df["cc_num"].values.tolist()
                                                          + df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # Gather everything in one pass instead of six DataFrame.iterrows() loops.
    txn_nodes = [mapping[idx] for idx in df.index]
    card_edges = list(zip(df["in_node"].tolist(), txn_nodes))
    merchant_edges = list(zip(df["out_node"].tolist(), txn_nodes))
    labels = df["is_fraud"].tolist()
    amounts = df["amt"].tolist()

    # Card edges first, then merchant edges (keeps the original insertion order).
    G = nx.from_edgelist(card_edges + merchant_edges, create_using=graph_type)

    for edges in (card_edges, merchant_edges):
        nx.set_edge_attributes(G, dict(zip(edges, labels)), "label")
        nx.set_edge_attributes(G, dict(zip(edges, amounts)), "weight")

    return G
    
    
def down_sample_textbook(df):
    """Return a class-balanced frame by undersampling the non-fraud rows.

    All rows with ``is_fraud == 1`` are kept; from the rows with
    ``is_fraud == 0`` exactly as many are drawn without replacement
    (fixed ``random_state=42``).  The fraud rows come first in the result.
    """
    fraud_rows = df[df.is_fraud == 1].copy()
    normal_rows = df[df.is_fraud == 0].copy()
    sampled_normal = sklearn.utils.resample(
        normal_rows,
        replace=False,
        n_samples=len(fraud_rows),
        random_state=42,
    )
    return pd.concat([fraud_rows, sampled_normal])

def embedding(Graph):
    """Turn a labelled graph into edge features ``X`` and edge labels ``y``.

    Node vectors are learned with Node2Vec (edge attribute ``weight`` as the
    walk weight, ``window=10``); each edge's feature is produced by
    ``AverageEmbedder`` from its two endpoint vectors.  ``y`` holds the
    ``label`` attribute of the edges.

    Returns
    -------
    X : list
        One embedding per edge, in the order of ``Graph.edges``.
    y : numpy.ndarray
        The ``label`` edge attribute values.
    """
    edge_list = list(Graph.edges)

    # Graph -> X (feature)
    # Working copy: the subgraph induced by all edges, plus any node that has
    # no edge and would otherwise be missing from it.
    walk_graph = Graph.edge_subgraph(edge_list).copy()
    isolated = set(Graph.nodes) - set(walk_graph.nodes)
    walk_graph.add_nodes_from(list(isolated))

    node_vectors = Node2Vec(walk_graph, weight_key='weight').fit(window=10).wv
    edge_embedder = AverageEmbedder(node_vectors)
    # The embedder is indexed with the string form of the node ids.
    X = [edge_embedder[str(edge[0]), str(edge[1])] for edge in edge_list]

    # Graph -> y (label)
    label_by_edge = nx.get_edge_attributes(Graph, "label")
    y = np.array(list(label_by_edge.values()))
    return X, y

def anal(df, *, test_size=0.25, random_state=42):
    """Fit a random forest on edge embeddings and report test metrics.

    The bipartite graph of ``df`` is built, its edges are embedded, and the
    embedded edges are split into a training and a test part.  A
    100-tree ``RandomForestClassifier`` is fitted on the training part and
    scored on the test part.

    Two defects of the previous version are fixed here:

    * ``embedding`` returns ``(X, y)``, but four values were unpacked from
      it (``ValueError``); the train/test split is now done explicitly.
    * ``RandomForestClassifier`` was referenced by its bare name although
      only the ``sklearn.ensemble`` module is imported (``NameError``).

    NOTE(review): the node embeddings are learned on the full graph before
    the split, so the test edges are visible to Node2Vec -- confirm this is
    the intended evaluation protocol.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction table accepted by ``build_graph_bipartite``.
    test_size : float, optional
        Fraction of edges held out for evaluation.
    random_state : int, optional
        Seed for both the split and the random forest.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``acc``, ``pre``, ``rec`` and ``f1``.
    """
    Graph = build_graph_bipartite(df)
    features, labels = embedding(Graph)
    X, XX, y, yy = sklearn.model_selection.train_test_split(
        features, labels, test_size=test_size, random_state=random_state
    )
    lrnr = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=random_state)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)
    return pd.DataFrame({
        'acc': [sklearn.metrics.accuracy_score(yy, yyhat)],
        'pre': [sklearn.metrics.precision_score(yy, yyhat)],
        'rec': [sklearn.metrics.recall_score(yy, yyhat)],
        'f1': [sklearn.metrics.f1_score(yy, yyhat)],
    })

def our_sampling1(df):
    """Keep every transaction of each card that has at least one fraud.

    Cards (``cc_num``) appearing in a row with ``is_fraud == 1`` are
    collected, and all rows belonging to those cards are returned -- the
    fraudulent ones as well as the legitimate ones.
    """
    fraud_cards = set(df.loc[df["is_fraud"] == 1, "cc_num"].tolist())
    belongs_to_fraud_card = df["cc_num"].isin(fraud_cards)
    return df[belongs_to_fraud_card]

- 모든엣지를 고려

# N = 10 
# edge_index = torch.tensor([[i,j] for i in range(N) for j in range(N)]).T
# # edge_attr = 그래프의 웨이트 
# edge_index
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
         2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
         4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
         7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9,
         9, 9, 9, 9],
        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3,
         4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7,
         8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1,
         2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
         6, 7, 8, 9]])
# Load the transaction table; the first CSV column is dropped by position.
# NOTE(review): presumably that column is a saved row index -- confirm.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
# Parse the timestamps with one vectorized call; calling pd.to_datetime on
# each element through map/lambda is far slower on ~1M rows and yields the
# same datetime column.
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

# diff = fraudTrain.trans_date_trans_time[101]-fraudTrain.trans_date_trans_time[0]
# diff
Timedelta('0 days 01:17:00')
# diff.total_seconds()
4620.0
# theta = 86400*1.2
# theta
103680.0
# theta = 86400*1.2
# np.exp(-diff.total_seconds()/theta)
0.9564180361647693
# !git add .
# !git commit -m. 
# !git push 
# !quarto publish --no-browser --no-prompt

해보자

# Make sure the timestamp column is datetime-typed.  A single vectorized
# pd.to_datetime call replaces the per-element map/lambda and leaves an
# already-parsed column unchanged.
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

N = len(fraudTrain)
N
1048575

- 시도1

# Attempt 1: list every ordered pair (i, j) of transactions together with the
# signed time gap time[i] - time[j] in seconds, then transpose to a 3 x N*N
# tensor (row 0: i, row 1: j, row 2: gap).
# NOTE: N is len(fraudTrain) here, so this is N*N Python-level iterations,
# each doing two pandas scalar lookups; with N = 1,048,575 that is about
# 1.1e12 rows and is not feasible to run or to hold in memory.
edge_index_list = []
for i in range(N):
    for j in range(N):
        time_difference = (fraudTrain['trans_date_trans_time'][i] - fraudTrain['trans_date_trans_time'][j]).total_seconds()
        edge_index_list.append([i, j, time_difference])
edge_index = torch.tensor(edge_index_list).T

- 시도2

# Attempt 2: only the connectivity of the complete graph (all N*N ordered
# pairs, self-pairs included), without the time gaps.  Still N*N entries
# built in a Python list comprehension.
N = len(fraudTrain)
edge_index = torch.tensor([[i,j] for i in range(N) for j in range(N)]).T
# edge_attr = the graph's edge weights
  • 너~무 오래걸린다.

- 시도3

  • df02을 이용해서 해보자.
# df02: a 20 % random sample (seed 42) of the legitimate transactions,
# followed by every fraudulent transaction.
_df1 = fraudTrain.loc[fraudTrain["is_fraud"] == 0].sample(random_state=42, frac=0.20)
_df2 = fraudTrain.loc[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
214520*214520
46018830400
# N = len(df02)
# edge_index = torch.tensor([[i,j] for i in range(N) for j in range(N)]).T

- 시도4: df50

df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
12012*12012
144288144
# Replace the index with 0..len-1; the old labels are kept as a new column
# named "index" (which is why the frames below have 23 columns, not 22).
df50 = df50.reset_index()
# Seeded train/test split of the rows; no test_size is given, so the
# library default split is used.
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
# Sanity check: fraud rate in each part.
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)
(0.49828, 0.50516)
df50_tr.shape, df50_test.shape
((9009, 23), (3003, 23))
N = len(df50_tr)
# Connectivity of the complete graph on N nodes (all ordered pairs, self-pairs
# included), as a 2 x N*N int64 tensor: row 0 is i, row 1 is j, with j varying
# fastest.  Built with tensor ops instead of an N*N Python list of pairs.
_nodes = torch.arange(N)
edge_index = torch.stack([_nodes.repeat_interleave(N), _nodes.repeat(N)])
edge_index
tensor([[   0,    0,    0,  ..., 9008, 9008, 9008],
        [   0,    1,    2,  ..., 9006, 9007, 9008]])
# Re-number the training rows 0..N-1 so that row positions can serve as node
# ids.  The previous index is kept as a column ("level_0", because a column
# named "index" already exists).
df50_tr = df50_tr.reset_index()

# Signed time gap in seconds for every ordered pair (i, j): time[i] - time[j].
# Computed by NumPy broadcasting instead of an N*N Python loop with pandas
# scalar lookups; rows are in the same order as the nested loop (i outer,
# j inner).  The intermediate Python list `edge_index_list` is no longer built.
_t = df50_tr['trans_date_trans_time'].to_numpy()[:N]
_gap = (_t[:, None] - _t[None, :]) / np.timedelta64(1, 's')
edge_index = np.column_stack([
    np.repeat(np.arange(N), N),   # i
    np.tile(np.arange(N), N),     # j
    _gap.ravel(),                 # time[i] - time[j] in seconds
])
del _gap  # free the N x N temporary
edge_index.shape
(81162081, 3)
# Only the size of the time gap matters, not its sign (done in place).
edge_index[:,2] = np.abs(edge_index[:,2])
# theta: mean absolute gap over all ordered pairs, self-pairs included;
# used below as the scale of the exponential decay.
theta = edge_index[:,2].mean()
theta
12230796.273867842
# Turn gaps into weights exp(-gap / theta), evaluated once and in place.
edge_index[:, 2] = np.exp(-edge_index[:, 2] / theta)
# A weight of exactly 1 (i.e. a zero gap, such as a self-pair) is reset to 0.
edge_index[edge_index[:, 2] == 1, 2] = 0.0
edge_index
array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.00000000e+00, 1.90157975e-01],
       [0.00000000e+00, 2.00000000e+00, 9.79646259e-02],
       ...,
       [9.00800000e+03, 9.00600000e+03, 6.60662164e-01],
       [9.00800000e+03, 9.00700000e+03, 1.49150646e-01],
       [9.00800000e+03, 9.00800000e+03, 0.00000000e+00]])
eee = edge_index[:,:]
eee[:,1]
array([0.000e+00, 1.000e+00, 2.000e+00, ..., 9.006e+03, 9.007e+03,
       9.008e+03])
edge_index_list_updated = edge_index.tolist()
edge_index_list_updated[:5]
[[0.0, 0.0, 0.0],
 [0.0, 1.0, 0.19015797528259762],
 [0.0, 2.0, 0.09796462590589798],
 [0.0, 3.0, 0.1424157407389685],
 [0.0, 4.0, 0.11107338192969567]]
df50_tr
level_0 index trans_date_trans_time cc_num merchant category amt first last gender ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 476 51331 2019-01-31 00:44:00 3.543590e+15 fraud_Medhurst PLC shopping_net 921.24 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 c8928ba53be26fdd997b26f7130c757e 1327970678 40.064488 -78.210499 1
1 3671 625691 2019-09-23 00:09:00 2.610530e+15 fraud_Torphy-Goyette shopping_pos 698.28 Tanya Dickerson F ... 36.2416 -86.6117 22191 Prison officer 1994-07-27 90453290b765904ed1c3426882a6788b 1348358993 35.884288 -87.513318 1
2 6641 896244 2019-12-25 21:30:00 6.011330e+15 fraud_Monahan-Morar personal_care 220.56 Lauren Butler F ... 36.0557 -96.0602 413574 Teacher, special educational needs 1971-09-01 4072a3effcf51cf7cf88f69d00642cd9 1356471044 35.789798 -95.859736 0
3 4288 717690 2019-11-02 22:22:00 6.011380e+15 fraud_Daugherty, Pouros and Beahan shopping_pos 905.43 Martin Duarte M ... 44.6001 -84.2931 864 General practice doctor 1942-05-04 f2fa1b25eef2f43fa5c09e3e1bfe7f77 1351894926 44.652759 -84.500359 1
4 4770 815813 2019-12-08 02:50:00 4.430880e+15 fraud_Hudson-Ratke grocery_pos 307.98 Alicia Morales F ... 39.3199 -106.6596 61 Public relations account executive 1939-11-04 f06eff8da349e36e623cff026de8e970 1354935056 38.389399 -106.111026 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9004 11964 177703 2019-04-02 21:48:00 3.572980e+15 fraud_Ziemann-Waters health_fitness 63.89 William Lopez M ... 41.1832 -96.9882 614 Associate Professor 1967-06-20 5b19aad28d65a6b0a912fa7b9d1896de 1333403300 42.067169 -96.876892 0
9005 5191 921796 2019-12-30 23:29:00 6.762920e+11 fraud_Wiza, Schaden and Stark misc_pos 51.41 Lisa Fitzpatrick F ... 41.2336 -75.2389 104 Financial trader 1927-08-25 b2a9e44026fc57e54b4e45ade6017668 1356910178 40.502189 -74.814956 1
9006 5390 950365 2020-01-16 03:15:00 4.807550e+12 fraud_Murray-Smitham grocery_pos 357.62 Kimberly Castro F ... 40.2158 -83.9579 133 Professor Emeritus 1954-01-29 4bfa37c329f327074e7220ea6e5d8f8d 1358306148 40.620284 -84.274495 1
9007 860 88685 2019-02-22 02:19:00 5.738600e+11 fraud_McDermott-Weimann grocery_pos 304.75 Cristian Jones M ... 42.0765 -87.7246 27020 Trade mark attorney 1986-07-23 a1c3025ddb615ab2ef890bf82fc3d66a 1329877195 42.722479 -88.362364 1
9008 7270 753787 2019-11-18 10:58:00 6.042293e+10 fraud_Terry, Johns and Bins misc_pos 1.64 Jeffrey Powers M ... 33.6028 -81.9748 46944 Secondary school teacher 1942-04-02 ee10d61782bde2b5cabc2ad649e977cc 1353236287 34.243599 -82.971344 0

9009 rows × 24 columns

  • cc_num로 그룹별로 묶자.
df50_tr[df50_tr['cc_num']==3.543590e+15]
level_0 index trans_date_trans_time cc_num merchant category amt first last gender ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 476 51331 2019-01-31 00:44:00 3.543590e+15 fraud_Medhurst PLC shopping_net 921.24 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 c8928ba53be26fdd997b26f7130c757e 1327970678 40.064488 -78.210499 1
344 462 50905 2019-01-30 16:53:00 3.543590e+15 fraud_Lesch Ltd shopping_pos 881.11 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 9f7b7675c4decefd03cce56df045ed1c 1327942400 39.591484 -79.575246 1
1377 6607 814736 2019-12-07 22:17:00 3.543590e+15 fraud_Botsford and Sons home 10.41 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 aa9b533e84970309a4ad60a914a8cd77 1354918668 41.287791 -79.980592 0
1447 485 51816 2019-01-31 12:38:00 3.543590e+15 fraud_Ruecker-Mayert kids_pets 21.93 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 cec656f154e0978b0f26702c29ddeeca 1328013517 39.946187 -78.078864 1
1639 11176 12947 2019-01-08 11:08:00 3.543590e+15 fraud_Stroman, Hudson and Erdman gas_transport 76.03 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 c10451edc4e21b865d049312acf18ecd 1326020892 39.503960 -78.471680 0
2046 8124 627045 2019-09-23 12:53:00 3.543590e+15 fraud_Botsford Ltd shopping_pos 3.20 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 003d591d208f7ee52277b5cc4fa4a37f 1348404838 40.066686 -79.326630 0
2093 477 51367 2019-01-31 01:36:00 3.543590e+15 fraud_Watsica, Haag and Considine shopping_pos 1090.67 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 b42bc0820a78de54845c5138b9c39dd5 1327973774 40.923284 -78.882504 1
2415 491 52402 2019-01-31 22:17:00 3.543590e+15 fraud_Metz, Russel and Metz kids_pets 22.35 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 51f9352216e99bbe9e8b03b082305971 1328048275 39.979547 -78.851379 1
2625 463 51047 2019-01-30 19:35:00 3.543590e+15 fraud_Ruecker-Mayert kids_pets 22.95 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 8e804422b761537e3a49a237afd1ea9a 1327952100 40.051981 -79.021769 1
2769 478 51374 2019-01-31 01:42:00 3.543590e+15 fraud_Schmidt and Sons shopping_net 1043.59 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 bbe4e9e431cba66e6531199ffaf79657 1327974178 40.192896 -79.366393 1
3192 505 52522 2019-01-31 23:57:00 3.543590e+15 fraud_Kutch, Steuber and Gerhold food_dining 116.45 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 fcf46ca0264437bbb938c29eca2c92ad 1328054256 40.288401 -78.286914 1
3670 11714 1010269 2020-02-20 06:02:00 3.543590e+15 fraud_Huels-Hahn gas_transport 51.80 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 b72c0124f4c5662db13e1bea2f04784b 1361340164 39.672719 -79.642589 0
3945 6087 243892 2019-05-02 13:38:00 3.543590e+15 fraud_Cruickshank-Mills entertainment 5.72 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 990da059d387e5fa7481d76ff5c29199 1335965925 40.577553 -79.315460 0
5017 484 51431 2019-01-31 03:28:00 3.543590e+15 fraud_Cremin, Hamill and Reichel misc_pos 741.98 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 41312d7d5fc76be3782b5e9cef04726f 1327980509 41.290570 -79.682069 1
5505 8148 181398 2019-04-04 23:32:00 3.543590e+15 fraud_Feil, Hilpert and Koss food_dining 89.23 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 e304fd4ebc897fce190925dadcd2b524 1333582347 39.736380 -79.481667 0
5729 11116 329202 2019-06-06 03:26:00 3.543590e+15 fraud_Connelly, Reichert and Fritsch gas_transport 69.36 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 bc0832ac8bac6d26548ab6ab553d5d5e 1338953171 40.780469 -79.668417 0
7605 481 51392 2019-01-31 02:16:00 3.543590e+15 fraud_Huels-Hahn gas_transport 12.41 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 5f4379c2fc20457f0f99a126cadda1af 1327976216 39.884234 -79.374966 1
7800 8609 55920 2019-02-03 06:51:00 3.543590e+15 fraud_Corwin-Gorczany misc_net 6.70 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 90f33381b6b6644c6d03c8cdb51d05dc 1328251865 40.064532 -78.920283 0
8100 10488 509733 2019-08-09 11:47:00 3.543590e+15 fraud_Kutch and Sons grocery_pos 108.74 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 40a620cc7c5ba396b1fe112f5361e4a9 1344512838 40.057443 -78.569798 0
8313 504 52514 2019-01-31 23:52:00 3.543590e+15 fraud_Douglas, Schneider and Turner shopping_pos 1129.56 Margaret Lam F ... 40.4603 -79.0097 922 Early years teacher 1972-10-04 ec208107f178422e0953560343d0cf8b 1328053975 40.840340 -78.027854 1

20 rows × 24 columns

df50_grouped = df50_tr.groupby(by='cc_num')

# For every ordered pair (i, j):
#   * different cc_num -> time gap is set to 0 (the pair is treated as unrelated);
#   * same cc_num      -> signed gap time[i] - time[j] in seconds.
# Vectorized with NumPy broadcasting instead of an N*N Python loop; rows are in
# the same order as the nested loop (i outer, j inner).  The intermediate
# Python list `edge_index_list` is no longer built.
_cc = df50_tr['cc_num'].to_numpy()[:N]
_t = df50_tr['trans_date_trans_time'].to_numpy()[:N]
_same_card = _cc[:, None] == _cc[None, :]
_gap = np.where(_same_card, (_t[:, None] - _t[None, :]) / np.timedelta64(1, 's'), 0.0)
edge_index = np.column_stack([
    np.repeat(np.arange(N), N),   # i
    np.tile(np.arange(N), N),     # j
    _gap.ravel(),                 # gap in seconds, 0 for different cards
])
del _same_card, _gap  # free the N x N temporaries
edge_index.shape
(81162081, 3)
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00],
       [0.000e+00, 2.000e+00, 0.000e+00],
       ...,
       [9.008e+03, 9.006e+03, 0.000e+00],
       [9.008e+03, 9.007e+03, 0.000e+00],
       [9.008e+03, 9.008e+03, 0.000e+00]])
# Only the size of the time gap matters, not its sign (done in place).
edge_index[:,2] = np.abs(edge_index[:,2])
# theta: mean absolute gap over ALL ordered pairs.  Pairs of different cards
# were given a gap of 0 above, so they are part of this mean as zeros.
theta = edge_index[:,2].mean()
theta
10988.585252761077
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00],
       [0.000e+00, 2.000e+00, 0.000e+00],
       ...,
       [9.008e+03, 9.006e+03, 0.000e+00],
       [9.008e+03, 9.007e+03, 0.000e+00],
       [9.008e+03, 9.008e+03, 0.000e+00]])
# Turn gaps into weights exp(-gap / theta), evaluated once and in place.
edge_index[:, 2] = np.exp(-edge_index[:, 2] / theta)
# A weight of exactly 1 comes from a zero gap (self-pairs and the pairs that
# were zeroed for having different cards); those are reset to 0.
edge_index[edge_index[:, 2] == 1, 2] = 0.0
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
       [0.000e+00, 1.000e+00, 0.000e+00],
       [0.000e+00, 2.000e+00, 0.000e+00],
       ...,
       [9.008e+03, 9.006e+03, 0.000e+00],
       [9.008e+03, 9.007e+03, 0.000e+00],
       [9.008e+03, 9.008e+03, 0.000e+00]])
edge_index_list_updated = edge_index.tolist()
np.array(edge_index_list_updated)[:,2].mean()
8.344409093328692e-05
# Mean edge weight, used as the selection threshold.  Read straight from the
# array: `edge_index_list_updated` is just `edge_index.tolist()`, so converting
# that 81M-row nested list back into an array only costs time and memory.
mm = edge_index[:, 2].mean()

edge_index_list_updated의 세 번째 열이 엣지 가중치 w이다.

# Keep only the pairs whose weight is above the mean weight `mm`.
# A boolean mask on the NumPy array replaces the Python-level scan over the
# nested list; `selected_edges` is now an (E, 2) int64 array instead of a list
# of tuples, holding the same (i, j) pairs in the same order.
_keep = edge_index[:, 2] > mm
selected_edges = edge_index[_keep, :2].astype(np.int64)
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 28472])
x = df50_tr['amt']
x
0       921.24
1       698.28
2       220.56
3       905.43
4       307.98
         ...  
9004     63.89
9005     51.41
9006    357.62
9007    304.75
9008      1.64
Name: amt, Length: 9009, dtype: float64
# Node feature matrix: the transaction amounts as one float32 column
# (one row per transaction).
a = torch.tensor(x.to_numpy(), dtype=torch.float).reshape(-1, 1)
a
tensor([[921.2400],
        [698.2800],
        [220.5600],
        ...,
        [357.6200],
        [304.7500],
        [  1.6400]])
# Node labels: the fraud flag of each training transaction as int64.
y = df50_tr['is_fraud']
b = torch.tensor(y.to_numpy(), dtype=torch.int64)
b
tensor([1, 1, 0,  ..., 1, 1, 0])
import torch_geometric
# Bundle the graph for PyG: `a` as node features (one amount per
# transaction), the selected (i, j) pairs as connectivity, and `b` as the
# per-node fraud labels.
data = torch_geometric.data.Data(x=a, edge_index = edge_index_selected, y=b)
data
Data(x=[9009, 1], edge_index=[2, 28472], y=[9009])
  • 예시처럼 edge_index를 만들려고 했는데.. 흠..

- pyg lesson6

gconv = torch_geometric.nn.GCNConv(1,4)
gconv
GCNConv(1, 4)
gconv(data.x, data.edge_index)
tensor([[-3.8013e+02,  4.0544e+02, -1.8002e+02,  3.0315e+02],
        [-3.5198e+02,  3.7541e+02, -1.6669e+02,  2.8070e+02],
        [-1.4832e+02,  1.5819e+02, -7.0240e+01,  1.1828e+02],
        ...,
        [-2.4048e+02,  2.5649e+02, -1.1389e+02,  1.9178e+02],
        [-5.1728e+02,  5.5172e+02, -2.4497e+02,  4.1253e+02],
        [-1.1028e+00,  1.1762e+00, -5.2228e-01,  8.7949e-01]],
       grad_fn=<AddBackward0>)
list(gconv.parameters())
[Parameter containing:
 tensor([0., 0., 0., 0.], requires_grad=True),
 Parameter containing:
 tensor([[-0.6724],
         [ 0.7172],
         [-0.3185],
         [ 0.5363]], requires_grad=True)]
# The layer exposes two parameter tensors (printed above): a length-4 vector
# and a 4 x 1 matrix.  The first is discarded and the matrix is kept as W.
# NOTE(review): presumably these are the bias and the linear weight -- confirm.
_,W = list(gconv.parameters())
W
Parameter containing:
tensor([[-0.6724],
        [ 0.7172],
        [-0.3185],
        [ 0.5363]], requires_grad=True)

- pyg lesson5

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Parameters
    ----------
    in_channels : int, optional
        Number of features per node, i.e. ``data.x.shape[1]``.  The first
        layer used to be ``GCNConv(9009, 16)``, which is the number of
        *nodes*; with ``data.x`` of shape ``[9009, 1]`` that failed with
        "mat1 and mat2 shapes cannot be multiplied (9009x1 and 9009x16)".
        The default of 1 matches the single amount feature.
    hidden_channels : int, optional
        Width of the hidden layer.
    num_classes : int, optional
        Number of output classes.
    """

    def __init__(self, in_channels=1, hidden_channels=16, num_classes=2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-node log-probabilities of shape [num_nodes, num_classes]."""
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is active only while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
model = GCN()
model
GCN(
  (conv1): GCNConv(9009, 16)
  (conv2): GCNConv(16, 2)
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
GCN(
  (conv1): GCNConv(9009, 16)
  (conv2): GCNConv(16, 2)
)
# Full-batch training for 200 epochs.  No train mask is defined on `data`,
# so the loss is taken over every node.
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    # The loss must compare the model's log-probabilities with the labels;
    # it was previously computed on the raw input features `data.x`, which
    # ignored the model output entirely.
    loss = F.nll_loss(out, data.y)
    loss.backward()
    optimizer.step()
RuntimeError: mat1 and mat2 shapes cannot be multiplied (9009x1 and 9009x16)
  • df50_test
df50_test 
NameError: name 'df50_test' is not defined
data
Data(x=[9009, 1], edge_index=[2, 28472], y=[9009])

x=[9009, 1] : 9009개의 거래(노드)가 있다. 노드 특징은 거래금액(amt) 하나뿐이다. <- 거래 자체에 대한 특징을 더 추가해도 좋을 듯.

edge_index=[2, 28472]: 거래끼리 연결된 엣지의 수는 28472개. 일단 고객 id(cc_num)가 같지 않으면 가중치를 0으로 두고, 가중치가 평균보다 큰 엣지만 남겼다.

data.train_mask
AttributeError: 'GlobalStorage' object has no attribute 'train_mask'