[FRAUD] 책_코드 함수 만들기(tri)

graph
Author

김보람

Published

May 20, 2024

FRAUD 코드

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb
import pickle 
import time 
import datetime
import warnings
warnings.filterwarnings('ignore')

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv


import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# Load the eight training splits and the shared test split, then append the
# test rows to each training split (each _dfN pairs df_trainN with df_test).
_train_frames = [pd.read_csv(f'~/Dropbox/Data/df_train{i}.csv') for i in range(1, 9)]
df_train1, df_train2, df_train3, df_train4, df_train5, df_train6, df_train7, df_train8 = _train_frames
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')
_df1, _df2, _df3, _df4, _df5, _df6, _df7, _df8 = [
    pd.concat([train_frame, df_test]) for train_frame in _train_frames
]

- 삼분그래프

def build_graph_tripartite(df_input, graph_type=None):
    """Build a tripartite transaction graph from a fraud DataFrame.

    Every transaction (row index), card holder (``cc_num``) and merchant
    (``merchant``) becomes a node.  Each row contributes two edges —
    card-holder↔transaction and merchant↔transaction — each carrying the
    row's ``is_fraud`` as the ``label`` attribute and ``amt`` as ``weight``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Transactions with columns ``cc_num``, ``merchant``, ``is_fraud``
        and ``amt``; the index identifies each transaction.
    graph_type : networkx graph instance, optional
        Graph to populate (passed as ``create_using``).  Defaults to a
        fresh ``nx.Graph()`` per call.

    Returns
    -------
    The populated graph with ``label`` and ``weight`` edge attributes.
    """
    # BUG FIX: the original default `graph_type=nx.Graph()` was evaluated
    # once at definition time, so every no-argument call shared (and
    # silently cleared) the same graph object.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input.copy()
    # One integer node id per distinct transaction index, card number and merchant.
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # Collect edges and their attributes in a single pass over the rows
    # (the original iterated the whole frame six separate times).
    edges = []
    labels = {}
    weights = {}
    for idx, row in df.iterrows():
        tx_node = mapping[idx]
        for endpoint in (row["in_node"], row["out_node"]):
            edges.append((endpoint, tx_node))
            labels[(endpoint, tx_node)] = row["is_fraud"]
            weights[(endpoint, tx_node)] = row["amt"]

    G = nx.from_edgelist(edges, create_using=graph_type)
    nx.set_edge_attributes(G, labels, "label")
    nx.set_edge_attributes(G, weights, "weight")
    return G
    
  • 판매자, 고객, 거래에 노드 할당
# Build one tripartite graph per training split, plus one for the test split.
_splits = (df_train1, df_train2, df_train3, df_train4,
           df_train5, df_train6, df_train7, df_train8)
G_ti_1, G_ti_2, G_ti_3, G_ti_4, G_ti_5, G_ti_6, G_ti_7, G_ti_8 = (
    build_graph_tripartite(split, nx.Graph(name="Tripartite Undirect"))
    for split in _splits
)
G_ti_test = build_graph_tripartite(df_test, nx.Graph(name="Tripartite Undirect"))
G_ti_test
<networkx.classes.graph.Graph at 0x7f91b62b31c0>
def evaluate_node_classification(G_train, G_test, embedding_dimension=128, random_state=42):
    """Train node2vec on G_train and score edge (fraud) classification on G_test.

    For each of the four node2vec edge embedders, fits a random forest on the
    training edges' ``label`` attributes and evaluates on the test edges.

    Parameters
    ----------
    G_train, G_test : networkx graphs with ``label`` and ``weight`` edge attributes.
    embedding_dimension : int
        node2vec embedding size.
    random_state : int
        Seed for the random forest.

    Returns
    -------
    dict mapping embedder class name to accuracy / precision / recall /
    f1-score / auc.

    Raises
    ------
    ValueError if restricting to the common node set leaves either graph
    without edges.
    """
    # Restrict both graphs to their shared nodes so every test edge's
    # endpoints have embeddings learned from the training graph.
    common_nodes = set(G_train.nodes).intersection(G_test.nodes)
    G_train = G_train.subgraph(common_nodes).copy()
    G_test = G_test.subgraph(common_nodes).copy()

    # Guard: an empty edge set previously surfaced much later as an opaque
    # sklearn error ("Expected 2D array, got 1D array instead: array=[].").
    if G_train.number_of_edges() == 0 or G_test.number_of_edges() == 0:
        raise ValueError(
            "No edges survive the common-node restriction "
            f"(train edges={G_train.number_of_edges()}, "
            f"test edges={G_test.number_of_edges()}); "
            "the two graphs share too few nodes to evaluate."
        )

    node2vec_train = Node2Vec(G_train, dimensions=embedding_dimension, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
    evaluation_results = {}

    # Iterate the same edge views used for the embeddings so labels and
    # feature rows are guaranteed to be aligned (the original pulled y_test
    # from get_edge_attributes and relied on matching iteration order).
    train_edges = list(G_train.edges)
    test_edges = list(G_test.edges)
    train_labels = [G_train.edges[edge]['label'] for edge in train_edges]
    y_test = [G_test.edges[edge]['label'] for edge in test_edges]

    for cl in classes:
        embeddings_train = cl(keyed_vectors=model_train.wv)

        train_embeddings = [embeddings_train[str(edge[0]), str(edge[1])] for edge in train_edges]
        test_embeddings = [embeddings_train[str(edge[0]), str(edge[1])] for edge in test_edges]

        rf = RandomForestClassifier(n_estimators=1000, random_state=random_state)
        rf.fit(train_embeddings, train_labels)

        yhat = rf.predict(test_embeddings)
        # ROC-AUC should be computed from scores, not thresholded 0/1
        # predictions — the original understated AUC. Fall back to the hard
        # predictions if only one class was seen during training.
        if len(rf.classes_) == 2:
            scores = rf.predict_proba(test_embeddings)[:, 1]
        else:
            scores = yhat
        acc = metrics.accuracy_score(y_test, yhat)
        pre = metrics.precision_score(y_test, yhat)
        rec = metrics.recall_score(y_test, yhat)
        f1 = metrics.f1_score(y_test, yhat)
        auc = metrics.roc_auc_score(y_test, scores)

        evaluation_results[cl.__name__] = {"accuracy": acc, "precision": pre,
                                           "recall": rec, "f1-score": f1, "auc": auc}

    return evaluation_results
# NOTE(review): per the captured output below, this call raised
# "ValueError: Expected 2D array, got 1D array instead: array=[]" — the
# common-node intersection of G_ti_8 and G_ti_test apparently left an
# empty edge/embedding list. Verify the two graphs share enough nodes.
evaluate_node_classification(G_ti_8, G_ti_test)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:18<00:00,  1.90s/it]
ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
import time

# Time each remaining evaluation explicitly. The original sprinkled bare
# `time.time()` calls around each run, but their return values were
# discarded, so no duration was ever actually measured or reported.
for _name, _G in [("G_ti_7", G_ti_7), ("G_ti_6", G_ti_6), ("G_ti_5", G_ti_5),
                  ("G_ti_4", G_ti_4), ("G_ti_3", G_ti_3), ("G_ti_2", G_ti_2),
                  ("G_ti_1", G_ti_1)]:
    _start = time.time()
    _results = evaluate_node_classification(_G, G_ti_test)
    print(f"{_name}: {time.time() - _start:.1f}s elapsed")
    print(_results)