CH8. 신용카드 거래 분석(로지스틱+그래프)

graph
Author

김보람

Published

April 27, 2023

import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

# Color constants (the names suggest default vs. highlighted graph nodes/edges).
# NOTE(review): none of them is referenced in the visible part of this notebook.
default_edge_color = 'gray'
default_node_color = '#407cc9'
enhanced_node_color = '#f5b042'
enhanced_edge_color = '#cc2f04'

df

import pandas as pd

# Load the raw transactions, then keep a 20% random sample of the non-fraud
# rows (is_fraud == 0) together with every fraud row (is_fraud == 1).
df = pd.read_csv("fraudTrain.csv")
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same rows in the same order.
df = pd.concat([
    df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42),
    df[df["is_fraud"] == 1],
])
df.head()
Unnamed: 0 trans_date_trans_time cc_num merchant category amt first last gender street ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
669418 669418 2019-10-12 18:21 4.089100e+18 fraud_Haley, Jewess and Bechtelar shopping_pos 7.53 Debra Stark F 686 Linda Rest ... 32.3836 -94.8653 24536 Multimedia programmer 1983-10-14 d313353fa30233e5fab5468e852d22fc 1350066071 32.202008 -94.371865 0
32567 32567 2019-01-20 13:06 4.247920e+12 fraud_Turner LLC travel 3.79 Judith Moss F 46297 Benjamin Plains Suite 703 ... 39.5370 -83.4550 22305 Television floor manager 1939-03-09 88c65b4e1585934d578511e627fe3589 1327064760 39.156673 -82.930503 0
156587 156587 2019-03-24 18:09 4.026220e+12 fraud_Klein Group entertainment 59.07 Debbie Payne F 204 Ashley Neck Apt. 169 ... 41.5224 -71.9934 4720 Broadcast presenter 1977-05-18 3bd9ede04b5c093143d5e5292940b670 1332612553 41.657152 -72.595751 0
1020243 1020243 2020-02-25 15:12 4.957920e+12 fraud_Monahan-Morar personal_care 25.58 Alan Parsons M 0547 Russell Ford Suite 574 ... 39.6171 -102.4776 207 Network engineer 1955-12-04 19e16ee7a01d229e750359098365e321 1361805120 39.080346 -103.213452 0
116272 116272 2019-03-06 23:19 4.178100e+15 fraud_Kozey-Kuhlman personal_care 84.96 Jill Flores F 639 Cruz Islands ... 41.9488 -86.4913 3104 Horticulturist, commercial 1981-03-29 a0c8641ca1f5d6e243ed5a2246e66176 1331075954 42.502065 -86.732664 0

5 rows × 23 columns

# Show how many rows of the sampled frame fall into each is_fraud class.
df["is_fraud"].value_counts()
0    208514
1      6006
Name: is_fraud, dtype: int64
def build_graph_bipartite(df_input, graph_type=None):
    """Build a card/merchant transaction graph from a transactions frame.

    Every distinct ``cc_num`` and ``merchant`` value is mapped to an integer
    node id, and each (card, merchant) pair present in ``df_input`` becomes
    one edge.

    Edge attributes:
        label:  1 if the summed ``is_fraud`` of the pair is > 0, else 0.
        weight: sum of ``amt`` over all transactions of the pair.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. A copy is used; the input is not modified.
        graph_type: Value passed to ``nx.from_edgelist`` as ``create_using``.
            When omitted, a fresh ``nx.Graph()`` is created for every call.

    Returns:
        The populated networkx graph.
    """
    # A default of ``nx.Graph()`` in the signature would be a single shared
    # instance: networkx clears and reuses it, so a second call would wipe
    # (and alias) the graph returned by the first call.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()
    node_values = set(df["cc_num"].values.tolist() + df["merchant"].values.tolist())
    mapping = {value: node_id for node_id, value in enumerate(node_values)}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source (card)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target (merchant)

    # Collapse repeated transactions between the same pair into a single edge.
    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    df["is_fraud"] = (df["is_fraud"] > 0).astype(int)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # One pass over the columns instead of two row-wise iterrows() loops.
    edge_keys = list(zip(df["from"].tolist(), df["to"].tolist()))
    # Edge attribute: whether the pair has any fraudulent transaction.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["is_fraud"].tolist())), "label")
    # Edge attribute: total transaction amount of the pair.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["amt"].tolist())), "weight")

    return G
from sklearn.utils import resample

# Balance the classes: keep every fraud row and draw as many non-fraud rows
# as there are fraud rows. `replace` is not passed to resample(), so the
# library default applies.
df_majority = df.loc[df["is_fraud"] == 0]
df_minority = df.loc[df["is_fraud"] == 1]

df_maj_dowsampled = resample(df_majority, n_samples=len(df_minority), random_state=42)

# Fraud rows first, then the sampled non-fraud rows.
df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled["is_fraud"].value_counts())
G_down = build_graph_bipartite(df_downsampled)
1    6006
0    6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split

# Edges are addressed by their position in this list; the split below works
# on those positions and on the matching "label" edge attributes.
edgs = list(G_down.edges)

train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.30,
    random_state=42,
)

# Training graph: only the training edges, plus every node of G_down that
# lost all of its edges to the test split (added back as an isolated node).
train_graph = G_down.edge_subgraph([edgs[i] for i in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes).difference(set(train_graph.nodes))))
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# Set up node2vec on the training graph only; weight_key='weight' names the
# edge attribute that build_graph_bipartite fills with the summed amount.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
# Fit the embedding model with window=10; model_train.wv is used later to
# build edge embeddings.
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1):   0%|          | 0/10 [00:00<?, ?it/s]Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00,  2.29it/s]

_df2

# NOTE(review): `_df` is not defined in the visible part of this notebook.
# Cards that have at least one fraudulent transaction.
cus_list = set(_df.query('is_fraud==1')["cc_num"].tolist())
# Keep every transaction (fraud or not) made with one of those cards.
_df2 = _df.query("cc_num in @ cus_list")
# `time`: the integer before the first ':' of the last space-separated token
# of trans_date_trans_time (the hour in values like "2019-10-12 18:21").
_df2 = _df2.assign(
    time=[int(stamp.split(' ')[-1].split(':')[0]) for stamp in _df2['trans_date_trans_time']]
)
_df2["is_fraud"].value_counts()
0    645424
1      6006
Name: is_fraud, dtype: int64
# From here on `df` is the subset built above (_df2), replacing the sampled
# frame that `df` referred to in the earlier cells.
df = _df2 
def build_graph_bipartite2(df_input, graph_type=None):
    """Build a card/merchant transaction graph from a transactions frame.

    Every distinct ``cc_num`` and ``merchant`` value is mapped to an integer
    node id, and each (card, merchant) pair present in ``df_input`` becomes
    one edge.

    Edge attributes:
        label:  1 if the summed ``is_fraud`` of the pair is > 0, else 0.
        weight: sum of ``amt`` over all transactions of the pair.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns. A copy is used; the input is not modified.
        graph_type: Value passed to ``nx.from_edgelist`` as ``create_using``.
            When omitted, a fresh ``nx.Graph()`` is created for every call.

    Returns:
        The populated networkx graph.
    """
    # A default of ``nx.Graph()`` in the signature would be a single shared
    # instance: networkx clears and reuses it, so a second call would wipe
    # (and alias) the graph returned by the first call.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()
    node_values = set(df["cc_num"].values.tolist() + df["merchant"].values.tolist())
    mapping = {value: node_id for node_id, value in enumerate(node_values)}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source (card)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target (merchant)

    # Collapse repeated transactions between the same pair into a single edge.
    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    df["is_fraud"] = (df["is_fraud"] > 0).astype(int)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # One pass over the columns instead of two row-wise iterrows() loops.
    edge_keys = list(zip(df["from"].tolist(), df["to"].tolist()))
    # Edge attribute: whether the pair has any fraudulent transaction.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["is_fraud"].tolist())), "label")
    # Edge attribute: total transaction amount of the pair.
    nx.set_edge_attributes(G, dict(zip(edge_keys, df["amt"].tolist())), "weight")

    return G
from sklearn.utils import resample

# Same class balancing as for the first dataset, now on the current `df`:
# keep every fraud row and draw as many non-fraud rows as there are fraud rows.
df_majority = df[df.is_fraud==0]
df_minority = df[df.is_fraud==1]

df_maj_dowsampled2 = resample(df_majority,
                             n_samples=len(df_minority),
                             random_state=42)

# Use the sample drawn just above (df_maj_dowsampled2); the earlier
# df_maj_dowsampled belongs to the first dataset.
df_downsampled2 = pd.concat([df_minority, df_maj_dowsampled2])

print(df_downsampled2.is_fraud.value_counts())
# Pass a fresh graph explicitly so G_down2 can never be the same object as
# (and therefore clear) a graph built by an earlier call.
G_down2 = build_graph_bipartite2(df_downsampled2, graph_type=nx.Graph())
1    6006
0    6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split


# Split the edge positions of G_down2 (and the matching "label" attributes)
# 70/30 into train and test.
train_edges2, test_edges2, train_labels2, test_labels2 = train_test_split(list(range(len(G_down2.edges))), 
                                                                      list(nx.get_edge_attributes(G_down2, "label").values()), 
                                                                      test_size=0.30, 
                                                                      random_state=42)
edgs2 = list(G_down2.edges)
# The split positions index into edgs2 (this graph's edge list), not into the
# first graph's `edgs`.
train_graph2 = G_down2.edge_subgraph([edgs2[x] for x in train_edges2]).copy()
# Re-add nodes that lost all their edges to the test split as isolated nodes.
train_graph2.add_nodes_from(list(set(G_down2.nodes) - set(train_graph2.nodes)))
# Learn node embeddings on the training graph, using the "weight" edge attribute.
node2vec_train2 = Node2Vec(train_graph2, weight_key='weight')
model_train2 = node2vec_train2.fit(window=10)
Generating walks (CPU: 1):   0%|          | 0/10 [00:00<?, ?it/s]Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00,  2.40it/s]

training(graph), test(logistic)

from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 

# For each edge-embedding operator: turn the node embeddings of model_train
# into edge features, fit a random forest on the training edges and report
# precision / recall / F1 on the held-out edges of the same graph.
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

    # The embedder is indexed by a pair of node ids given as strings.
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    # Test features must come from the same split as `edgs` and `test_labels`,
    # i.e. test_edges; test_edges2 holds positions into a different edge list.
    # NOTE(review): scoring the second graph with this model would need node
    # ids shared between both graphs; each graph builds its own id mapping.
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    
    rf = RandomForestClassifier(n_estimators=1000, random_state=42) 
    rf.fit(train_embeddings, train_labels)

    y_pred2 = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred2)) 
    print('Recall:', metrics.recall_score(test_labels, y_pred2)) 
    print('F1-Score:', metrics.f1_score(test_labels, y_pred2)) 
<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.6554307116104869
Recall: 0.09599561162918267
F1-Score: 0.16746411483253587
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.7195710455764075
Recall: 0.7361492046077893
F1-Score: 0.7277657266811279
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.4666666666666667
Recall: 0.01151947339550192
F1-Score: 0.022483940042826552
<class 'node2vec.edges.WeightedL2Embedder'>
Precision: 0.5319148936170213
Recall: 0.013713658804168952
F1-Score: 0.026737967914438502