[FRAUD] Book Code

graph
Author

김보람

Published

May 18, 2024

FRAUD Code

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # for train_test_split
from sklearn import ensemble # RF, GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv

- Bipartite graph

def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    # Map every card number and merchant to an integer node id
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])  # edge source (customer)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])  # edge target (merchant)

    # Aggregate all transactions per (customer, merchant) pair
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from', 'to']).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # Edge attribute "label": whether any transaction on this edge was fraudulent
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"] for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")

    # Edge attribute "weight": total transaction amount on this edge
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"] for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")

    return G
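A minimal sketch of what build_graph_bipartite returns, using a toy DataFrame with the assumed columns (cc_num, merchant, amt, is_fraud); the real data is loaded further below:

_toy = pd.DataFrame({"cc_num":   [1111, 1111, 2222],
                     "merchant": ["A", "B", "A"],
                     "amt":      [10.0, 25.0, 40.0],
                     "is_fraud": [0, 1, 0]})
_G = build_graph_bipartite(_toy)
# expected: 4 nodes (2 cards + 2 merchants) and 3 edges, each edge carrying
# a "label" (any fraud on the pair) and a "weight" (total amount)
print(_G.number_of_nodes(), _G.number_of_edges())
print(nx.get_edge_attributes(_G, "label"))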

- Tripartite graph

def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    # Map every transaction (row index), card number, and merchant to an integer node id
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])     # customer node
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])  # merchant node

    # Each transaction node connects to both its customer node and its merchant node
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)

    # Edge attribute "label": fraud flag of the transaction
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")

    # Edge attribute "weight": transaction amount
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")

    return G
    
  • Assign a node to each merchant, customer, and transaction
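Running the same toy DataFrame through build_graph_tripartite shows the difference: each transaction (DataFrame row) becomes its own node, linked to both its card node and its merchant node:

_G3 = build_graph_tripartite(_toy)
# expected: 7 nodes (2 cards + 2 merchants + 3 transactions) and 6 edges
print(_G3.number_of_nodes(), _G3.number_of_edges())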
df_train1 = pd.read_csv('~/Dropbox/Data/df_train1.csv')
df_train2 = pd.read_csv('~/Dropbox/Data/df_train2.csv')
df_train3 = pd.read_csv('~/Dropbox/Data/df_train3.csv')
df_train4 = pd.read_csv('~/Dropbox/Data/df_train4.csv')
df_train5 = pd.read_csv('~/Dropbox/Data/df_train5.csv')
df_train6 = pd.read_csv('~/Dropbox/Data/df_train6.csv')
df_train7 = pd.read_csv('~/Dropbox/Data/df_train7.csv')
df_train8 = pd.read_csv('~/Dropbox/Data/df_train8.csv')
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')
_df1 = pd.concat([df_train1, df_test])
_df2 = pd.concat([df_train2, df_test])
_df3 = pd.concat([df_train3, df_test])
_df4 = pd.concat([df_train4, df_test])
_df5 = pd.concat([df_train5, df_test])
_df6 = pd.concat([df_train6, df_test])
_df7 = pd.concat([df_train7, df_test])
_df8 = pd.concat([df_train8, df_test])
G_bi_8 = build_graph_bipartite(df_train8, nx.Graph(name="Bipartite Undirected"))
G_bi_7 = build_graph_bipartite(df_train7, nx.Graph(name="Bipartite Undirected"))
G_bi_6 = build_graph_bipartite(df_train6, nx.Graph(name="Bipartite Undirected"))
G_bi_5 = build_graph_bipartite(df_train5, nx.Graph(name="Bipartite Undirected"))
G_bi_4 = build_graph_bipartite(df_train4, nx.Graph(name="Bipartite Undirected"))
G_bi_3 = build_graph_bipartite(df_train3, nx.Graph(name="Bipartite Undirected"))
G_bi_2 = build_graph_bipartite(df_train2, nx.Graph(name="Bipartite Undirected"))
G_bi_1 = build_graph_bipartite(df_train1, nx.Graph(name="Bipartite Undirected"))
G_bi_test = build_graph_bipartite(df_test, nx.Graph(name="Bipartite Undirected"))
G_8 = build_graph_bipartite(_df8, nx.Graph(name="Bipartite Undirected"))
G_7 = build_graph_bipartite(_df7, nx.Graph(name="Bipartite Undirected"))
G_6 = build_graph_bipartite(_df6, nx.Graph(name="Bipartite Undirected"))
G_5 = build_graph_bipartite(_df5, nx.Graph(name="Bipartite Undirected"))
G_4 = build_graph_bipartite(_df4, nx.Graph(name="Bipartite Undirected"))
G_3 = build_graph_bipartite(_df3, nx.Graph(name="Bipartite Undirected"))
G_2 = build_graph_bipartite(_df2, nx.Graph(name="Bipartite Undirected"))
G_1 = build_graph_bipartite(_df1, nx.Graph(name="Bipartite Undirected"))
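The eight nearly identical loads and graph builds above can also be written as a loop; a sketch that assumes the same file layout and keeps the graphs in dicts (the explicit variables above remain the version used below):

dfs_train = {i: pd.read_csv(f'~/Dropbox/Data/df_train{i}.csv') for i in range(1, 9)}
G_bi = {i: build_graph_bipartite(dfs_train[i], nx.Graph(name="Bipartite Undirected")) for i in range(1, 9)}
G_full = {i: build_graph_bipartite(pd.concat([dfs_train[i], df_test]),
                                   nx.Graph(name="Bipartite Undirected")) for i in range(1, 9)}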



# Split edge indices (and their fraud labels) into train/test sets
train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_8.edges))),
                                                                      list(nx.get_edge_attributes(G_8, "label").values()),
                                                                      test_size=0.973961397229567,
                                                                      random_state=42)
edgs = list(G_8.edges)
# Build the training graph from the sampled edges, then add back the isolated
# nodes so it keeps the full node set of G_8
train_graph = G_8.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_8.nodes) - set(train_graph.nodes)))
train_graph.number_of_edges(), train_graph.number_of_nodes()
(6002, 1636)
  • Split the edges (and their fraud labels) into training and test sets; with test_size ≈ 0.974, only about 2.6% of G_8's edges are used for training
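Fraud edges are rare, so it is worth checking the label balance on each side of the split; a small check using the variables above:

print("fraud rate (train):", np.mean(train_labels))
print("fraud rate (test): ", np.mean(test_labels))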
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00,  2.96it/s]
  • Build the feature space using the Node2Vec algorithm
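Each node in train_graph now has a learned embedding vector; the node2vec model stores them under string node ids. A quick inspection (the 64-dimension size below is the library default and is an assumption, since no dimensions= argument was passed):

# Keys in model_train.wv are string versions of the integer node ids
first_node = str(list(train_graph.nodes)[0])
print(first_node, model_train.wv[first_node].shape)  # expected: (64,) with library defaults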
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]

for cl in classes:
    # Combine the two node embeddings of each edge into a single edge embedding
    embeddings_train = cl(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred))
    print('Recall:', metrics.recall_score(test_labels, y_pred))
    print('F1-Score:', metrics.f1_score(test_labels, y_pred))
    print('Accuracy:', metrics.accuracy_score(test_labels, y_pred))
    print('auc:', metrics.roc_auc_score(test_labels, y_pred))  # AUC from hard 0/1 predictions
        
<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
Accuracy: 0.9743555484296225
auc: 0.5
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
Accuracy: 0.9743110113480484
auc: 0.49997714536462284
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.3333333333333333
Recall: 0.0001736714136853074
F1-Score: 0.0003471619510501649
Accuracy: 0.9743510947214651
auc: 0.5000822647797671
<class 'node2vec.edges.WeightedL2Embedder'>
Precision: 0.3333333333333333
Recall: 0.0001736714136853074
F1-Score: 0.0003471619510501649
Accuracy: 0.9743510947214651
auc: 0.5000822647797671
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
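The UndefinedMetricWarning explains the zeros above: with this class imbalance the forest almost never predicts the fraud class, so precision is undefined (and set to 0.0), and an AUC computed from hard 0/1 predictions sits near 0.5. A sketch, reusing rf, y_pred, and test_embeddings from the last loop iteration, that scores with predicted probabilities and silences the warning via zero_division:

proba = rf.predict_proba(test_embeddings)[:, 1]  # predicted probability of the fraud class
print('Precision:', metrics.precision_score(test_labels, y_pred, zero_division=0))
print('auc (from probabilities):', metrics.roc_auc_score(test_labels, proba))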