[FRAUD] 책_코드 함수 만들기

graph
Author

김보람

Published

May 20, 2024

FRAUD 코드

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv


import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# Load the eight training splits and the shared test split, then append the
# test rows to each training split so every pair forms one combined table.
_data_dir = '~/Dropbox/Data'

(df_train1, df_train2, df_train3, df_train4,
 df_train5, df_train6, df_train7, df_train8) = [
    pd.read_csv(f'{_data_dir}/df_train{k}.csv') for k in range(1, 9)
]
df_test = pd.read_csv(f'{_data_dir}/df_test.csv')

(_df1, _df2, _df3, _df4, _df5, _df6, _df7, _df8) = [
    pd.concat([split, df_test])
    for split in (df_train1, df_train2, df_train3, df_train4,
                  df_train5, df_train6, df_train7, df_train8)
]

- 이분그래프

def build_graph_bipartite(df_input, graph_type=None):
    """Build a bipartite transaction graph from a credit-card fraud frame.

    Nodes are card numbers (``cc_num``) and merchants (``merchant``), mapped
    to consecutive integer ids.  Each edge aggregates all transactions between
    one card and one merchant and carries two attributes:

    * ``label``  -- 1 if any aggregated transaction was fraudulent, else 0
    * ``weight`` -- total transaction amount (sum of ``amt``)

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain columns ``cc_num``, ``merchant``, ``amt``, ``is_fraud``.
    graph_type : networkx graph instance, optional
        Graph to populate (forwarded to ``create_using``); defaults to a
        fresh ``nx.Graph()``.
        BUG FIX: the previous default ``graph_type=nx.Graph()`` was a mutable
        default argument -- networkx clears and reuses the instance passed via
        ``create_using``, so every call relying on the default silently shared
        one graph object.

    Returns
    -------
    networkx.Graph
    """
    if graph_type is None:
        graph_type = nx.Graph()
    df = df_input.copy()

    # Map every card number and merchant name to a unique integer node id.
    mapping = {x: node_id for node_id, x in
               enumerate(set(df["cc_num"].values.tolist() +
                             df["merchant"].values.tolist()))}

    df["from"] = df["cc_num"].apply(lambda x: mapping[x])   # edge source
    df["to"] = df["merchant"].apply(lambda x: mapping[x])   # edge target

    # Collapse parallel transactions into one edge per (card, merchant) pair.
    df = (df[['from', 'to', "amt", "is_fraud"]]
          .groupby(['from', 'to'])
          .agg({"is_fraud": "sum", "amt": "sum"})
          .reset_index())
    # Binarize: an edge is fraudulent if at least one transaction was.
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)

    # Edge attribute: fraud flag of each (card, merchant) pair.
    nx.set_edge_attributes(
        G,
        {(int(x["from"]), int(x["to"])): x["is_fraud"]
         for idx, x in df[["from", "to", "is_fraud"]].iterrows()},
        "label")
    # Edge attribute: total transaction amount of each pair.
    nx.set_edge_attributes(
        G,
        {(int(x["from"]), int(x["to"])): x["amt"]
         for idx, x in df[["from", "to", "amt"]].iterrows()},
        "weight")

    return G
# Build one bipartite graph per training split, plus one for the test split.
(G_bi_1, G_bi_2, G_bi_3, G_bi_4,
 G_bi_5, G_bi_6, G_bi_7, G_bi_8) = [
    build_graph_bipartite(frame, nx.Graph(name="Bipartite Undirect"))
    for frame in (df_train1, df_train2, df_train3, df_train4,
                  df_train5, df_train6, df_train7, df_train8)
]
G_bi_test = build_graph_bipartite(df_test, nx.Graph(name="Bipartite Undirect"))
def evaluate_node_classification(G_train, G_test, embedding_dimension=128, random_state=42):
    """Edge (fraud) classification via node2vec embeddings + random forest.

    Fits node2vec on ``G_train``, derives edge embeddings with four edge
    embedders, trains a random forest on the training edges' ``label``
    attribute, and evaluates on the edges of ``G_test``.

    Parameters
    ----------
    G_train, G_test : networkx graphs
        Must carry ``label`` (0/1) and ``weight`` edge attributes, as
        produced by ``build_graph_bipartite``.
    embedding_dimension : int
        node2vec embedding size.
    random_state : int
        Seed for the node2vec walks and the random forest.

    Returns
    -------
    dict
        Embedder class name -> {"accuracy", "precision", "recall",
        "f1-score", "auc"}.
    """
    # Restrict both graphs to their shared nodes so every test edge has
    # embeddings learned on the training graph.
    common_nodes = set(G_train.nodes).intersection(G_test.nodes)
    G_train = G_train.subgraph(common_nodes).copy()
    G_test = G_test.subgraph(common_nodes).copy()

    # Pass the seed so ``random_state`` governs the walks too, not just the RF.
    node2vec_train = Node2Vec(G_train, dimensions=embedding_dimension,
                              weight_key='weight', seed=random_state)
    model_train = node2vec_train.fit(window=10)

    classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
    evaluation_results = {}

    train_labels = [G_train.edges[edge]['label'] for edge in G_train.edges]
    y_test = list(nx.get_edge_attributes(G_test, "label").values())

    for cl in classes:
        embeddings_train = cl(keyed_vectors=model_train.wv)

        # node2vec keys node ids as strings.
        train_embeddings = [embeddings_train[str(edge[0]), str(edge[1])] for edge in G_train.edges]
        test_embeddings = [embeddings_train[str(edge[0]), str(edge[1])] for edge in G_test.edges]

        rf = RandomForestClassifier(n_estimators=1000, random_state=random_state)
        rf.fit(train_embeddings, train_labels)

        yhat = rf.predict(test_embeddings)
        # BUG FIX: ROC-AUC must be computed from continuous scores, not hard
        # 0/1 predictions -- the previous version produced AUC values stuck
        # near 0.5 (see recorded outputs) regardless of ranking quality.
        yscore = rf.predict_proba(test_embeddings)[:, 1]

        acc = metrics.accuracy_score(y_test, yhat)
        pre = metrics.precision_score(y_test, yhat)
        rec = metrics.recall_score(y_test, yhat)
        f1 = metrics.f1_score(y_test, yhat)
        auc = metrics.roc_auc_score(y_test, yscore)

        evaluation_results[cl.__name__] = {"accuracy": acc, "precision": pre,
                                           "recall": rec, "f1-score": f1, "auc": auc}

    return evaluation_results
# Evaluate: embeddings trained on split 8, scored on the held-out test graph.
evaluate_node_classification(G_bi_8, G_bi_test)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:06<00:00,  1.59it/s]
{'HadamardEmbedder': {'accuracy': 0.9399995452788578,
  'precision': 0.007048912576291584,
  'recall': 0.047508690614136734,
  'f1-score': 0.01227636799161614,
  'auc': 0.4972841749501759},
 'AverageEmbedder': {'accuracy': 0.6192437987404225,
  'precision': 0.007648711606349511,
  'recall': 0.3690614136732329,
  'f1-score': 0.014986824769433466,
  'auc': 0.4951421492099717},
 'WeightedL1Embedder': {'accuracy': 0.9711888684264375,
  'precision': 0.006423982869379015,
  'recall': 0.01738122827346466,
  'f1-score': 0.009380863039399626,
  'auc': 0.498057630805767},
 'WeightedL2Embedder': {'accuracy': 0.9706477502671487,
  'precision': 0.006264355815410315,
  'recall': 0.01738122827346466,
  'f1-score': 0.009209516500383728,
  'auc': 0.4977849314487875}}
# Evaluate: embeddings trained on split 7, scored on the held-out test graph.
evaluate_node_classification(G_bi_7, G_bi_test)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:06<00:00,  1.65it/s]
{'HadamardEmbedder': {'accuracy': 0.9847127608217562,
  'precision': 0.008928571428571428,
  'recall': 0.00847457627118644,
  'f1-score': 0.008695652173913042,
  'auc': 0.5004863757514998},
 'AverageEmbedder': {'accuracy': 0.8575828282286471,
  'precision': 0.008236639973851937,
  'recall': 0.1423728813559322,
  'f1-score': 0.015572377568360884,
  'auc': 0.5028297232782383},
 'WeightedL1Embedder': {'accuracy': 0.9865365016360028,
  'precision': 0.009478672985781991,
  'recall': 0.006779661016949152,
  'f1-score': 0.007905138339920948,
  'auc': 0.5005648190191934},
 'WeightedL2Embedder': {'accuracy': 0.9864381626705288,
  'precision': 0.008553654743390357,
  'recall': 0.0062146892655367235,
  'f1-score': 0.0071989528795811525,
  'auc': 0.5002350243386429}}
# Evaluate: embeddings trained on split 6, scored on the held-out test graph.
evaluate_node_classification(G_bi_6, G_bi_test)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00,  2.14it/s]
{'HadamardEmbedder': {'accuracy': 0.9886607007971279,
  'precision': 0.0026109660574412533,
  'recall': 0.0011173184357541898,
  'f1-score': 0.001564945226917058,
  'auc': 0.4988477109404706},
 'AverageEmbedder': {'accuracy': 0.9745487829803873,
  'precision': 0.005027652086475616,
  'recall': 0.0111731843575419,
  'f1-score': 0.006934812760055479,
  'auc': 0.49672280516047906},
 'WeightedL1Embedder': {'accuracy': 0.9887140203858561,
  'precision': 0.003968253968253968,
  'recall': 0.0016759776536312849,
  'f1-score': 0.0023566378633150037,
  'auc': 0.499151674621466},
 'WeightedL2Embedder': {'accuracy': 0.9887051337877347,
  'precision': 0.00395778364116095,
  'recall': 0.0016759776536312849,
  'f1-score': 0.002354788069073783,
  'auc': 0.49914719569927385}}
# Evaluate: embeddings trained on split 5, scored on the held-out test graph.
evaluate_node_classification(G_bi_5, G_bi_test)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:05<00:00,  1.86it/s]
{'HadamardEmbedder': {'accuracy': 0.9889832534825849,
  'precision': 0.009873060648801129,
  'recall': 0.0039040713887339654,
  'f1-score': 0.005595523581135092,
  'auc': 0.5003853861373583},
 'AverageEmbedder': {'accuracy': 0.988075523162621,
  'precision': 0.008733624454148471,
  'recall': 0.004461795872838818,
  'f1-score': 0.005906238464377999,
  'auc': 0.5002045193071262},
 'WeightedL1Embedder': {'accuracy': 0.9889743975770242,
  'precision': 0.009845288326300985,
  'recall': 0.0039040713887339654,
  'f1-score': 0.005591054313099042,
  'auc': 0.500380922748307},
 'WeightedL2Embedder': {'accuracy': 0.9889743975770242,
  'precision': 0.009845288326300985,
  'recall': 0.0039040713887339654,
  'f1-score': 0.005591054313099042,
  'auc': 0.500380922748307}}
# Evaluate: embeddings trained on split 4, scored on the held-out test graph.
evaluate_node_classification(G_bi_4, G_bi_test)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:08<00:00,  1.24it/s]
{'HadamardEmbedder': {'accuracy': 0.9889832534825849,
  'precision': 0.009873060648801129,
  'recall': 0.0039040713887339654,
  'f1-score': 0.005595523581135092,
  'auc': 0.5003853861373583},
 'AverageEmbedder': {'accuracy': 0.9885537420628946,
  'precision': 0.009900990099009901,
  'recall': 0.004461795872838818,
  'f1-score': 0.006151480199923107,
  'auc': 0.5004455423158968},
 'WeightedL1Embedder': {'accuracy': 0.9889832534825849,
  'precision': 0.009873060648801129,
  'recall': 0.0039040713887339654,
  'f1-score': 0.005595523581135092,
  'auc': 0.5003853861373583},
 'WeightedL2Embedder': {'accuracy': 0.9889832534825849,
  'precision': 0.009873060648801129,
  'recall': 0.0039040713887339654,
  'f1-score': 0.005595523581135092,
  'auc': 0.5003853861373583}}
# Evaluate: embeddings trained on split 3, scored on the held-out test graph.
evaluate_node_classification(G_bi_3, G_bi_test)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:11<00:00,  1.16s/it]
{'HadamardEmbedder': {'accuracy': 0.988943401907562,
  'precision': 0.011111111111111112,
  'recall': 0.004461795872838818,
  'f1-score': 0.006366892160764027,
  'auc': 0.5006419314341543},
 'AverageEmbedder': {'accuracy': 0.9878275578069236,
  'precision': 0.00823045267489712,
  'recall': 0.004461795872838818,
  'f1-score': 0.005786618444846294,
  'auc': 0.5000795444136896},
 'WeightedL1Embedder': {'accuracy': 0.9889832534825849,
  'precision': 0.009873060648801129,
  'recall': 0.0039040713887339654,
  'f1-score': 0.005595523581135092,
  'auc': 0.5003853861373583},
 'WeightedL2Embedder': {'accuracy': 0.9889832534825849,
  'precision': 0.009873060648801129,
  'recall': 0.0039040713887339654,
  'f1-score': 0.005595523581135092,
  'auc': 0.5003853861373583}}
# Evaluate: embeddings trained on split 2, scored on the held-out test graph.
evaluate_node_classification(G_bi_2, G_bi_test)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:32<00:00,  3.25s/it]
# Evaluate: embeddings trained on split 1, scored on the held-out test graph.
evaluate_node_classification(G_bi_1, G_bi_test)