import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df에서 처음부터 샘플을 뽑지 않고, 전체 데이터로 한 번 돌려 보자. F1-score가 어떻게 나오는지 기존에 샘플링해서 얻은 결과와 비교해 보기.
# Load the full training set instead of starting from a pre-sampled subset.
_df = pd.read_csv("fraudTrain.csv")

# Absolute class counts of the fraud label.
_df["is_fraud"].value_counts()
# Recorded output:
# 0    1042569
# 1       6006
# Name: is_fraud, dtype: int64

# Class proportions: same counts divided by the number of rows.
_df["is_fraud"].value_counts() / len(_df)
# Recorded output:
# 0    0.994272
# 1    0.005728
# Name: is_fraud, dtype: float64

# `df` is bound to the same DataFrame object as `_df` (no copy is made).
df = _df
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
# Colour constants for graph drawing: "default" for ordinary nodes/edges,
# "enhanced" for highlighted ones.
default_edge_color = 'gray'
default_node_color = '#407cc9'
enhanced_node_color = '#f5b042'
enhanced_edge_color = '#cc2f04'
def build_graph_bipartite(df_input, graph_type=None):
    """Build a bipartite card-to-merchant transaction graph.

    Every distinct ``cc_num`` and ``merchant`` value becomes one integer
    node.  All transactions between the same card and merchant collapse
    into a single edge carrying two attributes:

    * ``label``  -- 1 if the summed ``is_fraud`` of the collapsed
      transactions is positive, else 0.
    * ``weight`` -- the summed ``amt`` of the collapsed transactions.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``amt`` and
            ``is_fraud`` columns.  It is copied, not modified.
        graph_type: Value forwarded to networkx as ``create_using``.
            ``None`` (the default) builds a fresh ``nx.Graph`` per call.

    Returns:
        The populated networkx graph.
    """
    # A default of ``nx.Graph()`` in the signature is evaluated once, so
    # every default call would receive (and return) the same graph object.
    create_using = nx.Graph() if graph_type is None else graph_type

    df = df_input.copy()
    cards = df["cc_num"].tolist()
    merchants = df["merchant"].tolist()

    # Number nodes in first-seen order (cards first, then merchants) so the
    # ids do not depend on set iteration order.
    mapping = {
        value: node_id
        for node_id, value in enumerate(dict.fromkeys(cards + merchants))
    }

    df["from"] = [mapping[card] for card in cards]  # edge source
    df["to"] = [mapping[merchant] for merchant in merchants]  # edge target

    # One row per (card, merchant) pair, with fraud counts and amounts summed.
    df = (
        df[["from", "to", "amt", "is_fraud"]]
        .groupby(["from", "to"])
        .agg({"is_fraud": "sum", "amt": "sum"})
        .reset_index()
    )
    # Any fraudulent transaction on the pair marks the whole edge as fraud.
    df["is_fraud"] = (df["is_fraud"] > 0).astype(int)

    edges = list(zip(df["from"].tolist(), df["to"].tolist()))
    G = nx.from_edgelist(edges, create_using=create_using)

    # Edge attribute: fraud flag of the edge.
    nx.set_edge_attributes(G, dict(zip(edges, df["is_fraud"].tolist())), "label")
    # Edge attribute: total transaction amount on the edge.
    nx.set_edge_attributes(G, dict(zip(edges, df["amt"].tolist())), "weight")

    return G
from sklearn.utils import resample
# Split by class so the majority class can be cut down to the minority's size.
df_majority = df[df.is_fraud == 0]
df_minority = df[df.is_fraud == 1]

# Draw as many majority rows as there are minority rows.
# NOTE(review): sklearn's resample() samples WITH replacement unless
# replace=False is passed, so a few majority rows may be drawn twice --
# confirm this is intended.
df_maj_dowsampled = resample(
    df_majority,
    n_samples=len(df_minority),
    random_state=42,
)

# Balanced frame: all minority rows plus the sampled majority rows.
df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled.is_fraud.value_counts())
G_down = build_graph_bipartite(df_downsampled)
1 6006
0 6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split
# Split edge *indices* (positions in G_down.edges) together with their
# "label" attribute values: 80% train, 20% test.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)

# Materialise the edge list so the indices above can be turned back into edges.
edgs = list(G_down.edges)

# Training graph: only the training edges, copied so it can be modified.
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()

# Re-add the nodes that lost all their edges to the test split, so the
# training graph has the same node set as G_down.
train_graph.add_nodes_from(set(G_down.nodes) - set(train_graph.nodes))
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# Set up node2vec on the training graph, using the "weight" edge attribute
# as the edge weight key.
node2vec_train = Node2Vec(train_graph, weight_key='weight')

# Fit the embedding model with a context window of 10.
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 0%| | 0/10 [00:00<?, ?it/s]Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00, 2.53it/s]
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
# Edge embedders to compare; each is indexed below with a
# (source, target) pair of node-id strings.
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]

# The lookup keys are identical for every embedder, so build them once.
train_pairs = [(str(edgs[x][0]), str(edgs[x][1])) for x in train_edges]
test_pairs = [(str(edgs[x][0]), str(edgs[x][1])) for x in test_edges]

for cl in classes:
    # Both train and test edges are embedded with the model fitted on the
    # training graph.
    embeddings_train = cl(keyed_vectors=model_train.wv)

    train_embeddings = [embeddings_train[pair] for pair in train_pairs]
    test_embeddings = [embeddings_train[pair] for pair in test_pairs]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred))
    print('Recall:', metrics.recall_score(test_labels, y_pred))
    print('F1-Score:', metrics.f1_score(test_labels, y_pred))
<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.7345132743362832
Recall: 0.1424892703862661
F1-Score: 0.23867721063982747
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.6886792452830188
Recall: 0.751931330472103
F1-Score: 0.7189167008617152
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.6136363636363636
Recall: 0.02317596566523605
F1-Score: 0.04466501240694789
<class 'node2vec.edges.WeightedL2Embedder'>
Precision: 0.66
Recall: 0.02832618025751073
F1-Score: 0.05432098765432099
- 음, 어차피 나중에 다운샘플링을 하기 때문에 F1-score 값은 처음에 샘플을 어떻게 뽑든 큰 차이가 없다. 오히려 더 낮은 듯하다.