import pandas as pd
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
= 'gray'
default_edge_color = '#407cc9'
default_node_color = '#f5b042'
enhanced_node_color = '#cc2f04' enhanced_edge_color
import pandas as pd
= pd.read_csv("fraudTrain.csv")
df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
df df.head()
Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
669418 | 669418 | 2019-10-12 18:21 | 4.089100e+18 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 32.3836 | -94.8653 | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 |
32567 | 32567 | 2019-01-20 13:06 | 4.247920e+12 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 39.5370 | -83.4550 | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0 |
156587 | 156587 | 2019-03-24 18:09 | 4.026220e+12 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 41.5224 | -71.9934 | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0 |
1020243 | 1020243 | 2020-02-25 15:12 | 4.957920e+12 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 39.6171 | -102.4776 | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0 |
116272 | 116272 | 2019-03-06 23:19 | 4.178100e+15 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 41.9488 | -86.4913 | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0 |
5 rows × 23 columns
"is_fraud"].value_counts() df[
0 208514
1 6006
Name: is_fraud, dtype: int64
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
=df_input.copy()
df={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
mapping"merchant"].values.tolist()))}
df[
"from"]=df["cc_num"].apply(lambda x:mapping[x]) #엣지의 출발점
df["to"]=df["merchant"].apply(lambda x:mapping[x]) #엣지의 도착점
df[
= df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
df "is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
df[
=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
G
int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label") #엣지 속성 설정,각 속성의 사기 여부부
nx.set_edge_attributes(G, {(
int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # 엣지 속성 설정, 각 엣지의 거래 금액
nx.set_edge_attributes(G,{(
return G
= build_graph_bipartite(df, nx.Graph(name="Bipartite Undirect")) G_bu
# 기존 코드 (down)
from sklearn.utils import resample
= df[df.is_fraud==0]
df_majority = df[df.is_fraud==1]
df_minority
= resample(df_majority,
df_maj_dowsampled =len(df_minority),
n_samples=42)
random_state
= pd.concat([df_minority, df_maj_dowsampled])
df_downsampled
print(df_downsampled.is_fraud.value_counts())
= build_graph_bipartite(df_downsampled) G_down
from sklearn.utils import resample
= df[df.is_fraud==0]
df_majority = df[df.is_fraud==1]
df_minority
= resample(df_minority,
df_min_oversampled =len(df_majority),
n_samples=True,
replace=42)
random_state
= pd.concat([df_majority, df_min_oversampled])
df_oversampled
print(df_oversampled.is_fraud.value_counts())
= build_graph_bipartite(df_oversampled) G_over
0 208514
1 208514
Name: is_fraud, dtype: int64
- oversampling했다
from sklearn.model_selection import train_test_split
= train_test_split(list(range(len(G_over.edges))),
train_edges, test_edges, train_labels, test_labels list(nx.get_edge_attributes(G_over, "label").values()),
=0.20,
test_size=42) random_state
= list(G_over.edges)
edgs = G_over.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph list(set(G_over.nodes) - set(train_graph.nodes))) train_graph.add_nodes_from(
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
= Node2Vec(train_graph, weight_key='weight')
node2vec_train = node2vec_train.fit(window=10) model_train
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:15<00:00, 1.53s/it]
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
= [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
classes for cl in classes:
= cl(keyed_vectors=model_train.wv)
embeddings_train # 벡터스페이스 상에 edge를 투영..
= [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
test_embeddings
= RandomForestClassifier(n_estimators=1000, random_state=42)
rf ;
rf.fit(train_embeddings, train_labels)
= rf.predict(test_embeddings)
y_pred print(cl)
print('Precision:', metrics.precision_score(test_labels, y_pred))
print('Recall:', metrics.recall_score(test_labels, y_pred))
print('F1-Score:', metrics.f1_score(test_labels, y_pred))
<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.0
Recall: 0.0
F1-Score: 0.0
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
= Node2Vec(G_over, weight_key='weight')
nod2vec_unsup = nod2vec_unsup.fit(window=10) unsup_vals
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:17<00:00, 1.79s/it]
from sklearn.cluster import KMeans
= [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
classes = [x for x in nx.get_edge_attributes(G_over, "label").values()]
true_labels
for cl in classes:
= cl(keyed_vectors=unsup_vals.wv)
embedding_edge
= [embedding_edge[str(x[0]), str(x[1])] for x in G_over.edges()]
embedding = KMeans(2, random_state=42).fit(embedding)
kmeans
= metrics.adjusted_mutual_info_score(true_labels, kmeans.labels_)
nmi = metrics.homogeneity_score(true_labels, kmeans.labels_)
ho = metrics.completeness_score(true_labels, kmeans.labels_)
co = metrics.v_measure_score(true_labels, kmeans.labels_)
vmeasure
print(cl)
print('NMI:', nmi)
print('Homogeneity:', ho)
print('Completeness:', co)
print('V-Measure:', vmeasure)
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
warnings.warn(
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
warnings.warn(
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
warnings.warn(
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
warnings.warn(
<class 'node2vec.edges.HadamardEmbedder'>
NMI: 0.00544668790993928
Homogeneity: 0.013903066513115778
Completeness: 0.0033926306209409642
V-Measure: 0.005454301010451627
<class 'node2vec.edges.AverageEmbedder'>
NMI: 0.022759004483245575
Homogeneity: 0.06217623778028962
Completeness: 0.013933972719173142
V-Measure: 0.022765986201483266
<class 'node2vec.edges.WeightedL1Embedder'>
NMI: 0.04571547270228404
Homogeneity: 0.1238534042248765
Completeness: 0.02803615788549803
V-Measure: 0.04572234651623731
<class 'node2vec.edges.WeightedL2Embedder'>
NMI: 0.04580696581262077
Homogeneity: 0.12668781634894563
Completeness: 0.027962918433326666
V-Measure: 0.045813698590560094