import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # for the train/test split function
from sklearn import ensemble # RF, GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
FRAUD detection code
- Bipartite graph
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x]) # edge source (card holder)
    df["to"] = df["merchant"].apply(lambda x: mapping[x]) # edge target (merchant)
    df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"]
                               for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label") # edge attribute: whether each edge is fraudulent
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"]
                               for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight") # edge attribute: transaction amount of each edge
    return G
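- As a quick sanity check, here is a minimal sketch on made-up toy transactions (the toy frame and names below are hypothetical; only the column names cc_num, merchant, amt, is_fraud come from the function above):
# Hypothetical toy transactions, just to exercise build_graph_bipartite.
toy = pd.DataFrame({
    "cc_num":   [111, 111, 222],
    "merchant": ["shopA", "shopB", "shopA"],
    "amt":      [10.0, 25.0, 7.5],
    "is_fraud": [0, 1, 0],
})
G_toy = build_graph_bipartite(toy, nx.Graph())
print(G_toy.number_of_nodes(), G_toy.number_of_edges())  # -> 4 3 (2 cards + 2 merchants; 3 distinct card-merchant pairs)
print(nx.get_edge_attributes(G_toy, "label"))            # exactly one edge carries label 1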
- Tripartite graph
def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist() +
                                                          df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x]) # customer node
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x]) # merchant node
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +
                         [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()],
                         create_using=graph_type) # each transaction (row index) becomes its own node
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"] for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"] for idx, x in df.iterrows()}, "weight")
    return G
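- The tripartite builder turns every transaction (each row index) into a node of its own, so node count should be #cards + #merchants + #transactions. A small hedged check, reusing the hypothetical toy frame from above:
# 2 cards + 2 merchants + 3 transactions = 7 nodes, assuming the default
# RangeIndex (0, 1, 2) does not collide with any cc_num or merchant value.
G_tri_toy = build_graph_tripartite(toy, nx.Graph())
print(G_tri_toy.number_of_nodes())  # -> 7
print(G_tri_toy.number_of_edges())  # -> 6: each transaction links one card and one merchant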
- Assign nodes to merchants, customers, and transactions
df_train1 = pd.read_csv('~/Dropbox/Data/df_train1.csv')
df_train2 = pd.read_csv('~/Dropbox/Data/df_train2.csv')
df_train3 = pd.read_csv('~/Dropbox/Data/df_train3.csv')
df_train4 = pd.read_csv('~/Dropbox/Data/df_train4.csv')
df_train5 = pd.read_csv('~/Dropbox/Data/df_train5.csv')
df_train6 = pd.read_csv('~/Dropbox/Data/df_train6.csv')
df_train7 = pd.read_csv('~/Dropbox/Data/df_train7.csv')
df_train8 = pd.read_csv('~/Dropbox/Data/df_train8.csv')
df_test = pd.read_csv('~/Dropbox/Data/df_test.csv')
_df1 = pd.concat([df_train1, df_test])
_df2 = pd.concat([df_train2, df_test])
_df3 = pd.concat([df_train3, df_test])
_df4 = pd.concat([df_train4, df_test])
_df5 = pd.concat([df_train5, df_test])
_df6 = pd.concat([df_train6, df_test])
_df7 = pd.concat([df_train7, df_test])
_df8 = pd.concat([df_train8, df_test])
G_bi_8 = build_graph_bipartite(df_train8, nx.Graph(name="Bipartite Undirect"))
G_bi_7 = build_graph_bipartite(df_train7, nx.Graph(name="Bipartite Undirect"))
G_bi_6 = build_graph_bipartite(df_train6, nx.Graph(name="Bipartite Undirect"))
G_bi_5 = build_graph_bipartite(df_train5, nx.Graph(name="Bipartite Undirect"))
G_bi_4 = build_graph_bipartite(df_train4, nx.Graph(name="Bipartite Undirect"))
G_bi_3 = build_graph_bipartite(df_train3, nx.Graph(name="Bipartite Undirect"))
G_bi_2 = build_graph_bipartite(df_train2, nx.Graph(name="Bipartite Undirect"))
G_bi_1 = build_graph_bipartite(df_train1, nx.Graph(name="Bipartite Undirect"))
G_bi_test = build_graph_bipartite(df_test, nx.Graph(name="Bipartite Undirect"))

G_8 = build_graph_bipartite(_df8, nx.Graph(name="Bipartite Undirect"))
G_7 = build_graph_bipartite(_df7, nx.Graph(name="Bipartite Undirect"))
G_6 = build_graph_bipartite(_df6, nx.Graph(name="Bipartite Undirect"))
G_5 = build_graph_bipartite(_df5, nx.Graph(name="Bipartite Undirect"))
G_4 = build_graph_bipartite(_df4, nx.Graph(name="Bipartite Undirect"))
G_3 = build_graph_bipartite(_df3, nx.Graph(name="Bipartite Undirect"))
G_2 = build_graph_bipartite(_df2, nx.Graph(name="Bipartite Undirect"))
G_1 = build_graph_bipartite(_df1, nx.Graph(name="Bipartite Undirect"))
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_8.edges))),
    list(nx.get_edge_attributes(G_8, "label").values()),
    test_size=0.973961397229567,
    random_state=42)

edgs = list(G_8.edges)
train_graph = G_8.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_8.nodes) - set(train_graph.nodes)))
train_graph.number_of_edges(), train_graph.number_of_nodes()
(6002, 1636)
- Train/validate on the data split at an 8:2 ratio
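- The 8:2 ratio presumably refers to the upstream train/test CSV split; within G_8 itself, the test_size above leaves only a small fraction of edges for training. A short sketch (illustrative, not from the original run) to inspect the actual proportions and class balance:
# Sketch: inspect the edge-level split actually produced above.
print(len(train_edges), len(test_edges))      # edges used for training vs. held out
print(sum(train_labels) / len(train_labels))  # fraud ratio among training edges
print(sum(test_labels) / len(test_labels))    # fraud ratio among held-out edges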
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00, 2.96it/s]
- Build the feature space using the Node2Vec algorithm
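- The Node2Vec call above relies on the eliorc/node2vec package defaults for the walk hyperparameters. A hedged sketch with them written out (dimensions and walk_length values are illustrative, not tuned; num_walks=10 matches the 10/10 walk-generation progress bar shown above):
# Same call with explicit hyperparameters (illustrative values).
node2vec_train = Node2Vec(
    train_graph,
    dimensions=64,        # size of each node embedding
    walk_length=30,       # steps per random walk
    num_walks=10,         # walks started from every node (matches the 10/10 bar above)
    p=1, q=1,             # return / in-out bias; p=q=1 gives uniform, DeepWalk-style walks
    weight_key='weight',  # bias the walks by transaction amount
    workers=1,
)
model_train = node2vec_train.fit(window=10)  # extra kwargs are forwarded to gensim's Word2Vec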
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]

for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred))
    print('Recall:', metrics.recall_score(test_labels, y_pred))
    print('F1-Score:', metrics.f1_score(test_labels, y_pred))
    print('Accuracy:', metrics.accuracy_score(test_labels, y_pred))
    print('auc:', metrics.roc_auc_score(test_labels, y_pred))
<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
Accuracy: 0.9743555484296225
auc: 0.5
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
Accuracy: 0.9743110113480484
auc: 0.49997714536462284
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.3333333333333333
Recall: 0.0001736714136853074
F1-Score: 0.0003471619510501649
Accuracy: 0.9743510947214651
auc: 0.5000822647797671
<class 'node2vec.edges.WeightedL2Embedder'>
Precision: 0.3333333333333333
Recall: 0.0001736714136853074
F1-Score: 0.0003471619510501649
Accuracy: 0.9743510947214651
auc: 0.5000822647797671
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
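- All four embedders sit at an AUC of roughly 0.5: with only about 2.6% fraud edges (implied by the ~0.974 accuracy of an all-negative predictor), the forest almost never predicts the positive class, which is exactly what the UndefinedMetricWarning flags. Two common adjustments, sketched here and not part of the original experiment: pass zero_division to the metrics, and compute AUC from predicted probabilities instead of hard labels; class_weight can additionally push the forest toward the rare class.
# Sketch only; rf, y_pred, test_embeddings, test_labels as in the loop above.
y_prob = rf.predict_proba(test_embeddings)[:, 1]  # probability of the fraud class
print('Precision:', metrics.precision_score(test_labels, y_pred, zero_division=0))
print('auc:', metrics.roc_auc_score(test_labels, y_prob))  # threshold-free ranking quality

rf_bal = RandomForestClassifier(n_estimators=1000, random_state=42,
                                class_weight='balanced_subsample')  # upweight the rare class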