import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
import pickle
import time
import datetime
import warnings
warnings.filterwarnings('ignore')
# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
import networkx as nx
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# FAURD code (notebook section title "FAURD코드" was fused into the import line during export)
# --- Data loading -----------------------------------------------------------
# Eight training splits plus one shared held-out test split.
# NOTE(review): paths assume the data lives under ~/Dropbox/Data — confirm.
df_train1 = pd.read_csv('~/Dropbox/Data/df_train1.csv')
df_train2 = pd.read_csv('~/Dropbox/Data/df_train2.csv')
df_train3 = pd.read_csv('~/Dropbox/Data/df_train3.csv')
df_train4 = pd.read_csv('~/Dropbox/Data/df_train4.csv')
df_train5 = pd.read_csv('~/Dropbox/Data/df_train5.csv')
df_train6 = pd.read_csv('~/Dropbox/Data/df_train6.csv')
df_train7 = pd.read_csv('~/Dropbox/Data/df_train7.csv')
df_train8 = pd.read_csv('~/Dropbox/Data/df_train8.csv')
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')

# Each _dfN stacks one training split on top of the common test split.
_df1 = pd.concat([df_train1, df_test])
_df2 = pd.concat([df_train2, df_test])
_df3 = pd.concat([df_train3, df_test])
_df4 = pd.concat([df_train4, df_test])
_df5 = pd.concat([df_train5, df_test])
_df6 = pd.concat([df_train6, df_test])
_df7 = pd.concat([df_train7, df_test])
_df8 = pd.concat([df_train8, df_test])

# --- Bipartite graph (notebook heading: "이분그래프") ------------------------
def build_graph_bipartite(df_input, graph_type=None):
    """Build a bipartite card-holder / merchant transaction graph.

    Nodes are the union of ``cc_num`` (card) and ``merchant`` values, each
    remapped to a unique integer id.  One edge links a card to a merchant,
    carrying two attributes aggregated over all transactions between them:

    * ``label``  -- 1 if any transaction of the pair was fraudulent
                    (sum of ``is_fraud`` > 0), else 0
    * ``weight`` -- total transaction amount (sum of ``amt``)

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain columns ``cc_num``, ``merchant``, ``amt``, ``is_fraud``.
    graph_type : networkx graph, optional
        Container passed to ``nx.from_edgelist`` via *create_using*.  Defaults
        to a fresh undirected ``nx.Graph()`` per call (the original used a
        mutable default instance shared across calls).

    Returns
    -------
    networkx graph
        Graph with ``label`` and ``weight`` edge attributes set.
    """
    if graph_type is None:
        graph_type = nx.Graph()
    df = df_input.copy()

    # Map every card number and merchant name to a unique integer node id.
    entities = set(df["cc_num"].values.tolist() + df["merchant"].values.tolist())
    mapping = {entity: node_id for node_id, entity in enumerate(entities)}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source node
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target node

    # Collapse to one row per (card, merchant) pair: total amount + fraud flag.
    df = (df[["from", "to", "amt", "is_fraud"]]
          .groupby(["from", "to"])
          .agg({"is_fraud": "sum", "amt": "sum"})
          .reset_index())
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    # Edge attribute: fraud label of each (card, merchant) pair.
    nx.set_edge_attributes(
        G,
        {(int(f), int(t)): lab for f, t, lab in zip(df["from"], df["to"], df["is_fraud"])},
        "label",
    )
    # Edge attribute: total transacted amount of each pair.
    nx.set_edge_attributes(
        G,
        {(int(f), int(t)): amt for f, t, amt in zip(df["from"], df["to"], df["amt"])},
        "weight",
    )
    return G


# One bipartite graph per training split (built 8 -> 1, as in the notebook).
G_bi_8 = build_graph_bipartite(df_train8, nx.Graph(name="Bipartite Undirect"))
G_bi_7 = build_graph_bipartite(df_train7, nx.Graph(name="Bipartite Undirect"))
G_bi_6 = build_graph_bipartite(df_train6, nx.Graph(name="Bipartite Undirect"))
G_bi_5 = build_graph_bipartite(df_train5, nx.Graph(name="Bipartite Undirect"))
G_bi_4 = build_graph_bipartite(df_train4, nx.Graph(name="Bipartite Undirect"))
G_bi_3 = build_graph_bipartite(df_train3, nx.Graph(name="Bipartite Undirect"))
G_bi_2 = build_graph_bipartite(df_train2, nx.Graph(name="Bipartite Undirect"))
G_bi_1 = build_graph_bipartite(df_train1, nx.Graph(name="Bipartite Undirect"))
G_bi_test = build_graph_bipartite(df_test, nx.Graph(name="Bipartite Undirect"))


def evaluate_node_classification(G_train, G_test, embedding_dimension=128, random_state=42):
    """Score fraud-edge classification on G_test using node2vec embeddings.

    Fits node2vec on ``G_train``, derives edge embeddings with each of the
    four node2vec edge-embedding strategies, trains a RandomForest per
    strategy on the training edges' ``label`` attribute, and evaluates on
    ``G_test``'s edges.

    Parameters
    ----------
    G_train, G_test : networkx graphs
        Must carry ``label`` (0/1) and ``weight`` edge attributes.
    embedding_dimension : int, default 128
        node2vec embedding size.
    random_state : int, default 42
        Seed for the RandomForest (node2vec walks remain unseeded, so runs
        are not fully reproducible — NOTE(review): consider seeding Node2Vec).

    Returns
    -------
    dict
        ``{embedder_name: {"accuracy", "precision", "recall", "f1-score",
        "auc"}}``.
    """
    # Restrict both graphs to their shared nodes so every test edge has an
    # embedding for both endpoints.
    common_nodes = set(G_train.nodes).intersection(G_test.nodes)
    G_train = G_train.subgraph(common_nodes).copy()
    G_test = G_test.subgraph(common_nodes).copy()

    node2vec_train = Node2Vec(G_train, dimensions=embedding_dimension, weight_key='weight')
    model_train = node2vec_train.fit(window=10)

    classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
    evaluation_results = {}

    train_labels = [G_train.edges[edge]['label'] for edge in G_train.edges]
    y_test = list(nx.get_edge_attributes(G_test, "label").values())

    for cl in classes:
        embeddings_train = cl(keyed_vectors=model_train.wv)
        train_embeddings = [embeddings_train[str(edge[0]), str(edge[1])] for edge in G_train.edges]
        test_embeddings = [embeddings_train[str(edge[0]), str(edge[1])] for edge in G_test.edges]

        rf = RandomForestClassifier(n_estimators=1000, random_state=random_state)
        rf.fit(train_embeddings, train_labels)

        yhat = rf.predict(test_embeddings)
        # ROC-AUC needs a continuous score, not hard 0/1 predictions: with
        # rf.predict the AUC degenerates toward 0.5 (as seen in the recorded
        # runs below).  Use the positive-class probability instead.
        y_score = rf.predict_proba(test_embeddings)[:, 1]
        evaluation_results[cl.__name__] = {
            "accuracy": metrics.accuracy_score(y_test, yhat),
            "precision": metrics.precision_score(y_test, yhat),
            "recall": metrics.recall_score(y_test, yhat),
            "f1-score": metrics.f1_score(y_test, yhat),
            "auc": metrics.roc_auc_score(y_test, y_score),
        }
    return evaluation_results


results_8 = evaluate_node_classification(G_bi_8, G_bi_test)
# Recorded run: Generating walks (CPU: 1): 100%|##########| 10/10 [00:06<00:00, 1.59it/s]
# Recorded output for evaluate_node_classification(G_bi_8, G_bi_test)
# (AUC values obtained with the original hard-prediction scoring):
#   HadamardEmbedder:   accuracy=0.9399995452788578, precision=0.007048912576291584,
#                       recall=0.047508690614136734, f1=0.01227636799161614, auc=0.4972841749501759
#   AverageEmbedder:    accuracy=0.6192437987404225, precision=0.007648711606349511,
#                       recall=0.3690614136732329, f1=0.014986824769433466, auc=0.4951421492099717
#   WeightedL1Embedder: accuracy=0.9711888684264375, precision=0.006423982869379015,
#                       recall=0.01738122827346466, f1=0.009380863039399626, auc=0.498057630805767
#   WeightedL2Embedder: accuracy=0.9706477502671487, precision=0.006264355815410315,
#                       recall=0.01738122827346466, f1=0.009209516500383728, auc=0.4977849314487875

results_7 = evaluate_node_classification(G_bi_7, G_bi_test)
# Recorded run: Generating walks (CPU: 1): 100%|##########| 10/10 [00:06<00:00, 1.65it/s]
#   HadamardEmbedder:   accuracy=0.9847127608217562, precision=0.008928571428571428,
#                       recall=0.00847457627118644, f1=0.008695652173913042, auc=0.5004863757514998
#   AverageEmbedder:    accuracy=0.8575828282286471, precision=0.008236639973851937,
#                       recall=0.1423728813559322, f1=0.015572377568360884, auc=0.5028297232782383
#   WeightedL1Embedder: accuracy=0.9865365016360028, precision=0.009478672985781991,
#                       recall=0.006779661016949152, f1=0.007905138339920948, auc=0.5005648190191934
#   WeightedL2Embedder: accuracy=0.9864381626705288, precision=0.008553654743390357,
#                       recall=0.0062146892655367235, f1=0.0071989528795811525, auc=0.5002350243386429

results_6 = evaluate_node_classification(G_bi_6, G_bi_test)
# Recorded run: Generating walks (CPU: 1): 100%|##########| 10/10 [00:04<00:00, 2.14it/s]
#   HadamardEmbedder:   accuracy=0.9886607007971279, precision=0.0026109660574412533,
#                       recall=0.0011173184357541898, f1=0.001564945226917058, auc=0.4988477109404706
#   AverageEmbedder:    accuracy=0.9745487829803873, precision=0.005027652086475616,
#                       recall=0.0111731843575419, f1=0.006934812760055479, auc=0.49672280516047906
#   WeightedL1Embedder: accuracy=0.9887140203858561, precision=0.003968253968253968,
#                       recall=0.0016759776536312849, f1=0.0023566378633150037, auc=0.499151674621466
#   WeightedL2Embedder: accuracy=0.9887051337877347, precision=0.00395778364116095,
#                       recall=0.0016759776536312849, f1=0.002354788069073783, auc=0.49914719569927385

results_5 = evaluate_node_classification(G_bi_5, G_bi_test)
# Recorded run: Generating walks (CPU: 1): 100%|##########| 10/10 [00:05<00:00, 1.86it/s]
#   HadamardEmbedder:   accuracy=0.9889832534825849, precision=0.009873060648801129,
#                       recall=0.0039040713887339654, f1=0.005595523581135092, auc=0.5003853861373583
#   AverageEmbedder:    accuracy=0.988075523162621, precision=0.008733624454148471,
#                       recall=0.004461795872838818, f1=0.005906238464377999, auc=0.5002045193071262
#   WeightedL1Embedder: accuracy=0.9889743975770242, precision=0.009845288326300985,
#                       recall=0.0039040713887339654, f1=0.005591054313099042, auc=0.500380922748307
#   WeightedL2Embedder: accuracy=0.9889743975770242, precision=0.009845288326300985,
#                       recall=0.0039040713887339654, f1=0.005591054313099042, auc=0.500380922748307

results_4 = evaluate_node_classification(G_bi_4, G_bi_test)
# Recorded run: Generating walks (CPU: 1): 100%|##########| 10/10 [00:08<00:00, 1.24it/s]
#   HadamardEmbedder:   accuracy=0.9889832534825849, precision=0.009873060648801129,
#                       recall=0.0039040713887339654, f1=0.005595523581135092, auc=0.5003853861373583
#   AverageEmbedder:    accuracy=0.9885537420628946, precision=0.009900990099009901,
#                       recall=0.004461795872838818, f1=0.006151480199923107, auc=0.5004455423158968
#   WeightedL1Embedder: accuracy=0.9889832534825849, precision=0.009873060648801129,
#                       recall=0.0039040713887339654, f1=0.005595523581135092, auc=0.5003853861373583
#   WeightedL2Embedder: accuracy=0.9889832534825849, precision=0.009873060648801129,
#                       recall=0.0039040713887339654, f1=0.005595523581135092, auc=0.5003853861373583

results_3 = evaluate_node_classification(G_bi_3, G_bi_test)
# Recorded run: Generating walks (CPU: 1): 100%|##########| 10/10 [00:11<00:00, 1.16s/it]
#   HadamardEmbedder:   accuracy=0.988943401907562, precision=0.011111111111111112,
#                       recall=0.004461795872838818, f1=0.006366892160764027, auc=0.5006419314341543
#   AverageEmbedder:    accuracy=0.9878275578069236, precision=0.00823045267489712,
#                       recall=0.004461795872838818, f1=0.005786618444846294, auc=0.5000795444136896
#   WeightedL1Embedder: accuracy=0.9889832534825849, precision=0.009873060648801129,
#                       recall=0.0039040713887339654, f1=0.005595523581135092, auc=0.5003853861373583
#   WeightedL2Embedder: accuracy=0.9889832534825849, precision=0.009873060648801129,
#                       recall=0.0039040713887339654, f1=0.005595523581135092, auc=0.5003853861373583

results_2 = evaluate_node_classification(G_bi_2, G_bi_test)
# Recorded run: Generating walks (CPU: 1): 100%|##########| 10/10 [00:32<00:00, 3.25s/it]
# (no result dict was captured for this run in the notebook export)

results_1 = evaluate_node_classification(G_bi_1, G_bi_test)