import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import torch
# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
# embedding
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# imports
def build_graph_bipartite(df_input, graph_type=None):
    """Build a card/merchant graph with one edge per (cc_num, merchant) pair.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. It is copied, never modified.
        graph_type: Graph class or instance forwarded to networkx as
            ``create_using``. ``None`` (the default) creates a fresh
            ``nx.Graph`` on every call. The previous default was a single
            ``nx.Graph()`` instance built at definition time, so every call
            without an explicit ``graph_type`` reused (and, per the networkx
            ``create_using`` contract, cleared) the same graph object.

    Returns:
        The graph. Every edge carries ``label`` (1 if any transaction of the
        pair is fraudulent, else 0) and ``weight`` (summed ``amt`` of the pair).
    """
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    # One integer node id per distinct card number / merchant name.
    mapping = {
        x: node_id
        for node_id, x in enumerate(
            set(df["cc_num"].values.tolist() + df["merchant"].values.tolist())
        )
    }
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])  # edge source
    df["to"] = df["merchant"].apply(lambda x: mapping[x])  # edge target

    # Collapse repeated transactions between the same card and merchant.
    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    df["is_fraud"] = (df["is_fraud"] > 0).astype(int)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # Collect both edge attributes in a single pass over the aggregated rows.
    labels = {}
    weights = {}
    for src, dst, is_fraud, amt in df[["from", "to", "is_fraud", "amt"]].itertuples(
        index=False, name=None
    ):
        edge = (int(src), int(dst))
        labels[edge] = is_fraud
        weights[edge] = amt
    nx.set_edge_attributes(G, labels, "label")  # fraud flag of each edge
    nx.set_edge_attributes(G, weights, "weight")  # transaction amount of each edge
    return G
def build_graph_tripartite(df_input, graph_type=None):
    """Build a graph linking each transaction node to its card and its merchant.

    Every row of ``df_input`` becomes a transaction node (keyed by the row's
    index value) with two edges: card -> transaction and merchant ->
    transaction. Both edges carry the row's ``is_fraud`` as ``label`` and its
    ``amt`` as ``weight``.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. It is copied, never modified.
        graph_type: Graph class or instance forwarded to networkx as
            ``create_using``. ``None`` (the default) creates a fresh
            ``nx.Graph`` per call instead of sharing one instance created at
            definition time across all calls.

    Returns:
        The populated graph.
    """
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()

    # One integer node id per distinct index value, card number and merchant.
    # NOTE(review): an index value equal to a cc_num/merchant value would share
    # a node id, because all three sources are merged into one set.
    mapping = {
        x: node_id
        for node_id, x in enumerate(
            set(
                df.index.values.tolist()
                + df["cc_num"].values.tolist()
                + df["merchant"].values.tolist()
            )
        )
    }
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # Single pass over the rows; card edges are kept ahead of merchant edges so
    # the edge insertion order matches the original two-list construction.
    in_rows = []
    out_rows = []
    for idx, in_node, out_node, is_fraud, amt in zip(
        df.index, df["in_node"], df["out_node"], df["is_fraud"], df["amt"]
    ):
        txn_node = mapping[idx]
        in_rows.append(((in_node, txn_node), is_fraud, amt))
        out_rows.append(((out_node, txn_node), is_fraud, amt))
    rows = in_rows + out_rows

    G = nx.from_edgelist([edge for edge, _, _ in rows], create_using=graph_type)
    nx.set_edge_attributes(G, {edge: is_fraud for edge, is_fraud, _ in rows}, "label")
    nx.set_edge_attributes(G, {edge: amt for edge, _, amt in rows}, "weight")
    return G
def down_sample_textbook(df):
    """Return a class-balanced frame by down-sampling the non-fraud rows.

    Rows with ``is_fraud == 0`` are sampled without replacement
    (``random_state=42``) down to the number of ``is_fraud == 1`` rows.

    Args:
        df: DataFrame with an ``is_fraud`` column.

    Returns:
        DataFrame holding all fraud rows followed by the sampled non-fraud
        rows; the original index values are kept.
    """
    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    df_maj_downsampled = sklearn.utils.resample(
        df_majority,
        n_samples=len(df_minority),
        replace=False,
        random_state=42,
    )
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled
def embedding(Graph):
    """Turn a labelled graph into edge features ``X`` and edge labels ``y``.

    Node embeddings are learned with Node2Vec (edge attribute ``weight`` as
    the walk weight, ``window=10``) and combined per edge with
    ``AverageEmbedder``.

    Args:
        Graph: networkx graph whose edges carry ``label`` and ``weight``.

    Returns:
        ``(X, y)``: ``X`` is a list with one edge embedding per edge, in
        ``Graph.edges`` order; ``y`` is a NumPy array of the ``label`` edge
        attribute values.
    """
    # Graph -> X (feature)
    edges = list(Graph.edges)
    subGraph = Graph.edge_subgraph(edges).copy()
    # edge_subgraph keeps only nodes incident to an edge; add the rest back.
    subGraph.add_nodes_from(list(set(Graph.nodes) - set(subGraph.nodes)))
    embedded = AverageEmbedder(
        Node2Vec(subGraph, weight_key='weight').fit(window=10).wv
    )
    # The embedder is indexed by the string form of the node ids.
    X = [embedded[str(edge[0]), str(edge[1])] for edge in edges]
    # Graph -> y (label)
    y = np.array(list(nx.get_edge_attributes(Graph, "label").values()))
    return X, y
def anal(df):
    """Fit a random forest on edge embeddings of ``df`` and score it.

    Pipeline: bipartite graph -> edge embeddings -> train/test split ->
    ``RandomForestClassifier`` -> accuracy / precision / recall / F1 on the
    held-out edges.

    Args:
        df: transaction DataFrame accepted by ``build_graph_bipartite``.

    Returns:
        One-row DataFrame with columns ``acc``, ``pre``, ``rec`` and ``f1``.
    """
    Graph = build_graph_bipartite(df)
    # embedding() returns only (X, y), so the split is done here; the original
    # tried to unpack four values from it directly.
    X, y = embedding(Graph)
    X, XX, y, yy = sklearn.model_selection.train_test_split(X, y, random_state=42)

    # Referenced through the sklearn.ensemble module, which the file imports;
    # the bare name RandomForestClassifier is not imported anywhere.
    lrnr = sklearn.ensemble.RandomForestClassifier(n_estimators=100, random_state=42)
    lrnr.fit(X, y)
    yyhat = lrnr.predict(XX)

    scores = pd.DataFrame({
        'acc': [sklearn.metrics.accuracy_score(yy, yyhat)],
        'pre': [sklearn.metrics.precision_score(yy, yyhat)],
        'rec': [sklearn.metrics.recall_score(yy, yyhat)],
        'f1': [sklearn.metrics.f1_score(yy, yyhat)],
    })
    return scores
def our_sampling1(df):
    """Keep every transaction of the cards that have at least one fraud.

    Args:
        df: DataFrame with ``cc_num`` and ``is_fraud`` columns.

    Returns:
        The rows of ``df`` whose ``cc_num`` appears on at least one row with
        ``is_fraud == 1``; row order and index values are preserved. The
        result is empty when ``df`` has no fraud rows.
    """
    fraud_cards = set(df.loc[df["is_fraud"] == 1, "cc_num"].tolist())
    # A boolean mask gives the same selection as the query string without
    # relying on expression parsing and caller-frame variable lookup.
    return df[df["cc_num"].isin(fraud_cards)]
-
모든엣지를 고려
# N = 10
# edge_index = torch.tensor([[i,j] for i in range(N) for j in range(N)]).T
# # edge_attr = 그래프의 웨이트
# edge_index
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9],
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3,
4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7,
8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1,
2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
6, 7, 8, 9]])
# Load the transactions; iloc[:, 1:] drops the first CSV column.
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
# Parse the timestamp column with one vectorised call instead of a per-row lambda.
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain
trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
# diff = fraudTrain.trans_date_trans_time[101]-fraudTrain.trans_date_trans_time[0]
# diff
Timedelta('0 days 01:17:00')
# diff.total_seconds()
4620.0
# theta = 86400*1.2
# theta
103680.0
# theta = 86400*1.2
# np.exp(-diff.total_seconds()/theta)
0.9564180361647693
# !git add .
# !git commit -m.
# !git push
# !quarto publish --no-browser --no-prompt
해보자
# Ensure trans_date_trans_time is a datetime column (vectorised conversion).
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time)
)
fraudTrain
trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
# Number of transactions (= number of nodes if every transaction is a node).
N = len(fraudTrain)
N
1048575
-
시도1
# Attempt 1: one edge per ordered pair of transactions, weighted by the signed
# time difference in seconds. This performs N*N iterations, which is not
# practical when N is the length of the full fraudTrain table.
_times = fraudTrain['trans_date_trans_time']  # hoisted: looked up once, not per pair
edge_index_list = []
for i in range(N):
    for j in range(N):
        time_difference = (_times[i] - _times[j]).total_seconds()
        edge_index_list.append([i, j, time_difference])
edge_index = torch.tensor(edge_index_list).T
-
시도2
# Attempt 2: fully connected edge index over all N transactions.
N = len(fraudTrain)
# cartesian_prod yields the (i, j) pairs in the same order as the nested
# comprehension, without building N*N Python lists first. The result still has
# N*N columns, so it only fits in memory for small N.
edge_index = torch.cartesian_prod(torch.arange(N), torch.arange(N)).T
# edge_attr = the graph weights
- 너~무 오래걸린다.
-
시도3
- df02를 이용해서 해보자.
# df02: a 20% random sample of the non-fraud rows plus every fraud row.
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
214520*214520
46018830400
# N = len(df02)
# edge_index = torch.tensor([[i,j] for i in range(N) for j in range(N)]).T
-
시도4: df50
# df50: df02 down-sampled so that fraud and non-fraud rows are equally many.
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
12012*12012
144288144
# Move the original row labels into an "index" column and renumber from 0.
df50 = df50.reset_index()
df50_tr, df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)
# Fraud rate of each split, as a sanity check on the class balance.
df50_tr.is_fraud.mean().round(5), df50_test.is_fraud.mean().round(5)
(0.49828, 0.50516)
df50_tr.shape, df50_test.shape
((9009, 23), (3003, 23))
N = len(df50_tr)
# All ordered (i, j) pairs, in the same order as the nested comprehension
# [[i, j] for i in range(N) for j in range(N)], built without Python lists.
edge_index = torch.cartesian_prod(torch.arange(N), torch.arange(N)).T
edge_index
tensor([[ 0, 0, 0, ..., 9008, 9008, 9008],
[ 0, 1, 2, ..., 9006, 9007, 9008]])
# Renumber the training rows 0..N-1 so that positions and labels coincide.
df50_tr = df50_tr.reset_index()

# Rows of edge_index are [i, j, seconds(time_i - time_j)] for every ordered
# pair, in row-major (i, then j) order. Broadcasting replaces the N*N Python
# loop; the intermediate Python list of rows is no longer materialised.
_t = df50_tr['trans_date_trans_time'].to_numpy()
edge_index = np.empty((N * N, 3))
edge_index[:, 0] = np.repeat(np.arange(N), N)
edge_index[:, 1] = np.tile(np.arange(N), N)
edge_index[:, 2] = ((_t[:, None] - _t[None, :]) / np.timedelta64(1, 's')).ravel()
edge_index.shape
(81162081, 3)
# Use the absolute time gap, then take its mean as the decay scale theta.
edge_index[:, 2] = np.abs(edge_index[:, 2])
theta = edge_index[:, 2].mean()
theta
12230796.273867842
# Weight = exp(-gap / theta), computed once; weights equal to exactly 1
# (zero gap, which includes every self-pair) are set to 0.
_w = np.exp(-edge_index[:, 2] / theta)
edge_index[:, 2] = (_w != 1) * _w
edge_index
array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
[0.00000000e+00, 1.00000000e+00, 1.90157975e-01],
[0.00000000e+00, 2.00000000e+00, 9.79646259e-02],
...,
[9.00800000e+03, 9.00600000e+03, 6.60662164e-01],
[9.00800000e+03, 9.00700000e+03, 1.49150646e-01],
[9.00800000e+03, 9.00800000e+03, 0.00000000e+00]])
# NOTE: basic slicing returns a view, so eee shares memory with edge_index.
eee = edge_index[:, :]
eee[:, 1]
array([0.000e+00, 1.000e+00, 2.000e+00, ..., 9.006e+03, 9.007e+03,
9.008e+03])
edge_index_list_updated = edge_index.tolist()
edge_index_list_updated[:5]
[[0.0, 0.0, 0.0],
[0.0, 1.0, 0.19015797528259762],
[0.0, 2.0, 0.09796462590589798],
[0.0, 3.0, 0.1424157407389685],
[0.0, 4.0, 0.11107338192969567]]
df50_tr
level_0 | index | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 476 | 51331 | 2019-01-31 00:44:00 | 3.543590e+15 | fraud_Medhurst PLC | shopping_net | 921.24 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | c8928ba53be26fdd997b26f7130c757e | 1327970678 | 40.064488 | -78.210499 | 1 |
1 | 3671 | 625691 | 2019-09-23 00:09:00 | 2.610530e+15 | fraud_Torphy-Goyette | shopping_pos | 698.28 | Tanya | Dickerson | F | ... | 36.2416 | -86.6117 | 22191 | Prison officer | 1994-07-27 | 90453290b765904ed1c3426882a6788b | 1348358993 | 35.884288 | -87.513318 | 1 |
2 | 6641 | 896244 | 2019-12-25 21:30:00 | 6.011330e+15 | fraud_Monahan-Morar | personal_care | 220.56 | Lauren | Butler | F | ... | 36.0557 | -96.0602 | 413574 | Teacher, special educational needs | 1971-09-01 | 4072a3effcf51cf7cf88f69d00642cd9 | 1356471044 | 35.789798 | -95.859736 | 0 |
3 | 4288 | 717690 | 2019-11-02 22:22:00 | 6.011380e+15 | fraud_Daugherty, Pouros and Beahan | shopping_pos | 905.43 | Martin | Duarte | M | ... | 44.6001 | -84.2931 | 864 | General practice doctor | 1942-05-04 | f2fa1b25eef2f43fa5c09e3e1bfe7f77 | 1351894926 | 44.652759 | -84.500359 | 1 |
4 | 4770 | 815813 | 2019-12-08 02:50:00 | 4.430880e+15 | fraud_Hudson-Ratke | grocery_pos | 307.98 | Alicia | Morales | F | ... | 39.3199 | -106.6596 | 61 | Public relations account executive | 1939-11-04 | f06eff8da349e36e623cff026de8e970 | 1354935056 | 38.389399 | -106.111026 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9004 | 11964 | 177703 | 2019-04-02 21:48:00 | 3.572980e+15 | fraud_Ziemann-Waters | health_fitness | 63.89 | William | Lopez | M | ... | 41.1832 | -96.9882 | 614 | Associate Professor | 1967-06-20 | 5b19aad28d65a6b0a912fa7b9d1896de | 1333403300 | 42.067169 | -96.876892 | 0 |
9005 | 5191 | 921796 | 2019-12-30 23:29:00 | 6.762920e+11 | fraud_Wiza, Schaden and Stark | misc_pos | 51.41 | Lisa | Fitzpatrick | F | ... | 41.2336 | -75.2389 | 104 | Financial trader | 1927-08-25 | b2a9e44026fc57e54b4e45ade6017668 | 1356910178 | 40.502189 | -74.814956 | 1 |
9006 | 5390 | 950365 | 2020-01-16 03:15:00 | 4.807550e+12 | fraud_Murray-Smitham | grocery_pos | 357.62 | Kimberly | Castro | F | ... | 40.2158 | -83.9579 | 133 | Professor Emeritus | 1954-01-29 | 4bfa37c329f327074e7220ea6e5d8f8d | 1358306148 | 40.620284 | -84.274495 | 1 |
9007 | 860 | 88685 | 2019-02-22 02:19:00 | 5.738600e+11 | fraud_McDermott-Weimann | grocery_pos | 304.75 | Cristian | Jones | M | ... | 42.0765 | -87.7246 | 27020 | Trade mark attorney | 1986-07-23 | a1c3025ddb615ab2ef890bf82fc3d66a | 1329877195 | 42.722479 | -88.362364 | 1 |
9008 | 7270 | 753787 | 2019-11-18 10:58:00 | 6.042293e+10 | fraud_Terry, Johns and Bins | misc_pos | 1.64 | Jeffrey | Powers | M | ... | 33.6028 | -81.9748 | 46944 | Secondary school teacher | 1942-04-02 | ee10d61782bde2b5cabc2ad649e977cc | 1353236287 | 34.243599 | -82.971344 | 0 |
9009 rows × 24 columns
- cc_num로 그룹별로 묶자.
# All training transactions of one card.
# NOTE(review): this is an exact float comparison against the displayed
# (rounded) card number - confirm it matches the stored value.
df50_tr[df50_tr['cc_num'] == 3.543590e+15]
level_0 | index | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 476 | 51331 | 2019-01-31 00:44:00 | 3.543590e+15 | fraud_Medhurst PLC | shopping_net | 921.24 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | c8928ba53be26fdd997b26f7130c757e | 1327970678 | 40.064488 | -78.210499 | 1 |
344 | 462 | 50905 | 2019-01-30 16:53:00 | 3.543590e+15 | fraud_Lesch Ltd | shopping_pos | 881.11 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 9f7b7675c4decefd03cce56df045ed1c | 1327942400 | 39.591484 | -79.575246 | 1 |
1377 | 6607 | 814736 | 2019-12-07 22:17:00 | 3.543590e+15 | fraud_Botsford and Sons | home | 10.41 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | aa9b533e84970309a4ad60a914a8cd77 | 1354918668 | 41.287791 | -79.980592 | 0 |
1447 | 485 | 51816 | 2019-01-31 12:38:00 | 3.543590e+15 | fraud_Ruecker-Mayert | kids_pets | 21.93 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | cec656f154e0978b0f26702c29ddeeca | 1328013517 | 39.946187 | -78.078864 | 1 |
1639 | 11176 | 12947 | 2019-01-08 11:08:00 | 3.543590e+15 | fraud_Stroman, Hudson and Erdman | gas_transport | 76.03 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | c10451edc4e21b865d049312acf18ecd | 1326020892 | 39.503960 | -78.471680 | 0 |
2046 | 8124 | 627045 | 2019-09-23 12:53:00 | 3.543590e+15 | fraud_Botsford Ltd | shopping_pos | 3.20 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 003d591d208f7ee52277b5cc4fa4a37f | 1348404838 | 40.066686 | -79.326630 | 0 |
2093 | 477 | 51367 | 2019-01-31 01:36:00 | 3.543590e+15 | fraud_Watsica, Haag and Considine | shopping_pos | 1090.67 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | b42bc0820a78de54845c5138b9c39dd5 | 1327973774 | 40.923284 | -78.882504 | 1 |
2415 | 491 | 52402 | 2019-01-31 22:17:00 | 3.543590e+15 | fraud_Metz, Russel and Metz | kids_pets | 22.35 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 51f9352216e99bbe9e8b03b082305971 | 1328048275 | 39.979547 | -78.851379 | 1 |
2625 | 463 | 51047 | 2019-01-30 19:35:00 | 3.543590e+15 | fraud_Ruecker-Mayert | kids_pets | 22.95 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 8e804422b761537e3a49a237afd1ea9a | 1327952100 | 40.051981 | -79.021769 | 1 |
2769 | 478 | 51374 | 2019-01-31 01:42:00 | 3.543590e+15 | fraud_Schmidt and Sons | shopping_net | 1043.59 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | bbe4e9e431cba66e6531199ffaf79657 | 1327974178 | 40.192896 | -79.366393 | 1 |
3192 | 505 | 52522 | 2019-01-31 23:57:00 | 3.543590e+15 | fraud_Kutch, Steuber and Gerhold | food_dining | 116.45 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | fcf46ca0264437bbb938c29eca2c92ad | 1328054256 | 40.288401 | -78.286914 | 1 |
3670 | 11714 | 1010269 | 2020-02-20 06:02:00 | 3.543590e+15 | fraud_Huels-Hahn | gas_transport | 51.80 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | b72c0124f4c5662db13e1bea2f04784b | 1361340164 | 39.672719 | -79.642589 | 0 |
3945 | 6087 | 243892 | 2019-05-02 13:38:00 | 3.543590e+15 | fraud_Cruickshank-Mills | entertainment | 5.72 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 990da059d387e5fa7481d76ff5c29199 | 1335965925 | 40.577553 | -79.315460 | 0 |
5017 | 484 | 51431 | 2019-01-31 03:28:00 | 3.543590e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 741.98 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 41312d7d5fc76be3782b5e9cef04726f | 1327980509 | 41.290570 | -79.682069 | 1 |
5505 | 8148 | 181398 | 2019-04-04 23:32:00 | 3.543590e+15 | fraud_Feil, Hilpert and Koss | food_dining | 89.23 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | e304fd4ebc897fce190925dadcd2b524 | 1333582347 | 39.736380 | -79.481667 | 0 |
5729 | 11116 | 329202 | 2019-06-06 03:26:00 | 3.543590e+15 | fraud_Connelly, Reichert and Fritsch | gas_transport | 69.36 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | bc0832ac8bac6d26548ab6ab553d5d5e | 1338953171 | 40.780469 | -79.668417 | 0 |
7605 | 481 | 51392 | 2019-01-31 02:16:00 | 3.543590e+15 | fraud_Huels-Hahn | gas_transport | 12.41 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 5f4379c2fc20457f0f99a126cadda1af | 1327976216 | 39.884234 | -79.374966 | 1 |
7800 | 8609 | 55920 | 2019-02-03 06:51:00 | 3.543590e+15 | fraud_Corwin-Gorczany | misc_net | 6.70 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 90f33381b6b6644c6d03c8cdb51d05dc | 1328251865 | 40.064532 | -78.920283 | 0 |
8100 | 10488 | 509733 | 2019-08-09 11:47:00 | 3.543590e+15 | fraud_Kutch and Sons | grocery_pos | 108.74 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | 40a620cc7c5ba396b1fe112f5361e4a9 | 1344512838 | 40.057443 | -78.569798 | 0 |
8313 | 504 | 52514 | 2019-01-31 23:52:00 | 3.543590e+15 | fraud_Douglas, Schneider and Turner | shopping_pos | 1129.56 | Margaret | Lam | F | ... | 40.4603 | -79.0097 | 922 | Early years teacher | 1972-10-04 | ec208107f178422e0953560343d0cf8b | 1328053975 | 40.840340 | -78.027854 | 1 |
20 rows × 24 columns
df50_grouped = df50_tr.groupby(by='cc_num')

# Rows of edge_index are [i, j, time difference in seconds]. The difference is
# kept only when both transactions share the same cc_num; pairs from different
# cards get 0. Broadcasting replaces the N*N Python loop; the intermediate
# Python list of rows is no longer materialised.
_t = df50_tr['trans_date_trans_time'].to_numpy()
_cc = df50_tr['cc_num'].to_numpy()
_same_card = _cc[:, None] == _cc[None, :]
_diff = (_t[:, None] - _t[None, :]) / np.timedelta64(1, 's')
edge_index = np.empty((N * N, 3))
edge_index[:, 0] = np.repeat(np.arange(N), N)
edge_index[:, 1] = np.tile(np.arange(N), N)
edge_index[:, 2] = np.where(_same_card, _diff, 0.0).ravel()
edge_index.shape
(81162081, 3)
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
[0.000e+00, 1.000e+00, 0.000e+00],
[0.000e+00, 2.000e+00, 0.000e+00],
...,
[9.008e+03, 9.006e+03, 0.000e+00],
[9.008e+03, 9.007e+03, 0.000e+00],
[9.008e+03, 9.008e+03, 0.000e+00]])
# Use the absolute time gap, then take its mean as the decay scale theta.
# The mean runs over every pair, including the zeroed cross-card pairs.
edge_index[:, 2] = np.abs(edge_index[:, 2])
theta = edge_index[:, 2].mean()
theta
10988.585252761077
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
[0.000e+00, 1.000e+00, 0.000e+00],
[0.000e+00, 2.000e+00, 0.000e+00],
...,
[9.008e+03, 9.006e+03, 0.000e+00],
[9.008e+03, 9.007e+03, 0.000e+00],
[9.008e+03, 9.008e+03, 0.000e+00]])
# Weight = exp(-gap / theta), computed once; weights equal to exactly 1
# (zero gap: self-pairs and the zeroed cross-card pairs) are set to 0.
_w = np.exp(-edge_index[:, 2] / theta)
edge_index[:, 2] = (_w != 1) * _w
edge_index
array([[0.000e+00, 0.000e+00, 0.000e+00],
[0.000e+00, 1.000e+00, 0.000e+00],
[0.000e+00, 2.000e+00, 0.000e+00],
...,
[9.008e+03, 9.006e+03, 0.000e+00],
[9.008e+03, 9.007e+03, 0.000e+00],
[9.008e+03, 9.008e+03, 0.000e+00]])
edge_index_list_updated = edge_index.tolist()
# Mean edge weight; read from the array directly rather than converting the
# freshly created list back into an array.
edge_index[:, 2].mean()
8.344409093328692e-05
# Mean edge weight, used as the edge-selection threshold. edge_index holds the
# same values as edge_index_list_updated, so the list -> array round trip is skipped.
mm = edge_index[:, 2].mean()
edge_index_list_updated가 w
# Keep only the pairs whose weight exceeds the mean weight.
selected_edges = [
    (int(src), int(dst))
    for src, dst, weight in edge_index_list_updated
    if weight > mm
]
# Transpose to a 2 x num_edges tensor.
edge_index_selected = torch.tensor(selected_edges, dtype=torch.long).t()
edge_index_selected.shape
torch.Size([2, 28472])
# Node feature: the transaction amount.
x = df50_tr['amt']
x
0 921.24
1 698.28
2 220.56
3 905.43
4 307.98
...
9004 63.89
9005 51.41
9006 357.62
9007 304.75
9008 1.64
Name: amt, Length: 9009, dtype: float64
# Node feature matrix with one column. Converting through NumPy avoids
# depending on how torch iterates a pandas Series.
a = torch.tensor(x.to_numpy(), dtype=torch.float)
a = a.reshape(-1, 1)
a
tensor([[921.2400],
[698.2800],
[220.5600],
...,
[357.6200],
[304.7500],
[ 1.6400]])
# Node labels: the is_fraud flag as an int64 tensor.
y = df50_tr['is_fraud']
b = torch.tensor(y.to_numpy(), dtype=torch.int64)
b
tensor([1, 1, 0, ..., 1, 1, 0])
import torch_geometric

# Graph object: node features a, selected edges, node labels b.
data = torch_geometric.data.Data(x=a, edge_index=edge_index_selected, y=b)
data
Data(x=[9009, 1], edge_index=[2, 28472], y=[9009])
- 예시처럼 edge_index를 만들려고 했는데.. 흠..
-
pyg lesson6
# One graph-convolution layer: 1 input feature -> 4 output features.
gconv = torch_geometric.nn.GCNConv(1, 4)
gconv
GCNConv(1, 4)
gconv(data.x, data.edge_index)
tensor([[-3.8013e+02, 4.0544e+02, -1.8002e+02, 3.0315e+02],
[-3.5198e+02, 3.7541e+02, -1.6669e+02, 2.8070e+02],
[-1.4832e+02, 1.5819e+02, -7.0240e+01, 1.1828e+02],
...,
[-2.4048e+02, 2.5649e+02, -1.1389e+02, 1.9178e+02],
[-5.1728e+02, 5.5172e+02, -2.4497e+02, 4.1253e+02],
[-1.1028e+00, 1.1762e+00, -5.2228e-01, 8.7949e-01]],
grad_fn=<AddBackward0>)
list(gconv.parameters())
[Parameter containing:
tensor([0., 0., 0., 0.], requires_grad=True),
Parameter containing:
tensor([[-0.6724],
[ 0.7172],
[-0.3185],
[ 0.5363]], requires_grad=True)]
# The layer exposes two parameters; the second one is the weight matrix.
_, W = list(gconv.parameters())
W
Parameter containing:
tensor([[-0.6724],
[ 0.7172],
[-0.3185],
[ 0.5363]], requires_grad=True)
-
pyg lesson5
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_channels: number of features per node. The first layer was
            previously built as ``GCNConv(9009, 16)``, i.e. with the number of
            nodes as the feature width; applied to ``data.x`` of shape
            ``[9009, 1]`` that fails with a shape-mismatch RuntimeError. The
            default is now 1, the feature width of that ``data.x``.
        hidden_channels: width of the hidden layer.
        num_classes: number of output classes.
    """

    def __init__(self, in_channels=1, hidden_channels=16, num_classes=2):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)

    def forward(self, data):
        """Return per-node log-probabilities for a graph ``data`` object.

        ``data`` must provide ``x`` (node features) and ``edge_index``.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)
model = GCN()
model
GCN(
(conv1): GCNConv(9009, 16)
(conv2): GCNConv(16, 2)
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
# Switch to training mode so that dropout in forward() is active.
model.train()
GCN(
(conv1): GCNConv(9009, 16)
(conv2): GCNConv(16, 2)
)
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)
    # The loss compares the model's log-probabilities with the labels. The
    # original passed data.x (the raw input features), so the loss did not
    # depend on the model parameters at all.
    loss = F.nll_loss(out, data.y)
    loss.backward()
    optimizer.step()
RuntimeError: mat1 and mat2 shapes cannot be multiplied (9009x1 and 9009x16)
- df50_test
df50_test
NameError: name 'df50_test' is not defined
data
Data(x=[9009, 1], edge_index=[2, 28472], y=[9009])
x=[9009, 1]
: 9009개의 거래량이 있다. 특징은 걍.. 거래량 하나? <- 거래자체에 대한 특징을 더 추가해도 좋을듯.. 흠.
edge_index=[2, 28472]
: 거래끼리의 edge값은 28472.. 일단 고객id가 같지 않으면 엣지값0으로 하고..
data.train_mask
AttributeError: 'GlobalStorage' object has no attribute 'train_mask'