imports

import numpy as np
import pandas as pd
import networkx as nx
import sklearn
# split
from sklearn.model_selection import train_test_split
# embedding
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# models
from sklearn.ensemble import RandomForestClassifier
# evaluation
from sklearn import metrics
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source (customer node)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target (merchant node)
    df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"] for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")   # edge attribute: whether the card-merchant pair contains fraud
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"] for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")   # edge attribute: total transaction amount on the edge
    return G
def down_sample_textbook(df):
    df = df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
    df_majority = df[df.is_fraud == 0]
    df_minority = df[df.is_fraud == 1]
    df_maj_dowsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_dowsampled])
    return df_downsampled
def split(Graph, test_size=0.20, random_state=42):
    edg = list(range(len(Graph.edges)))
    edg_att = list(nx.get_edge_attributes(Graph, "label").values())
    return train_test_split(edg, edg_att, test_size=test_size, random_state=random_state)
def embedding(Graph):
    _edgs = list(Graph.edges)
    _train_edges, _test_edges, y, yy = split(Graph)
    _train_graph = Graph.edge_subgraph([_edgs[x] for x in _train_edges]).copy()
    _train_graph.add_nodes_from(list(set(Graph.nodes) - set(_train_graph.nodes)))
    _embedded = AverageEmbedder(Node2Vec(_train_graph, weight_key='weight').fit(window=10).wv)
    X = [_embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in _train_edges]
    XX = [_embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in _test_edges]
    return X, XX, y, yy
def evaluate(lrnr, XX, yy):
    yyhat = lrnr.predict(XX)
    df = pd.DataFrame({'pre': [sklearn.metrics.precision_score(yy, yyhat)],
                       'rec': [sklearn.metrics.recall_score(yy, yyhat)],
                       'f1':  [sklearn.metrics.f1_score(yy, yyhat)]})
    return df
def anal(df, n_estimators=10):
    Graph = build_graph_bipartite(df)
    X, XX, y, yy = embedding(Graph)
    lrnr = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    lrnr.fit(X, y)
    return lrnr, XX, yy, evaluate(lrnr, XX, yy)
def our_sampling1(df):
    cus_list = set(df.query('is_fraud==1').cc_num.tolist())
    return df.query("cc_num in @cus_list")
data1
read and define data
df = pd.read_csv("~/Desktop/fraudTrain.csv")
 | Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
669418 | 669418 | 2019-10-12 18:21 | 4.089100e+18 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 32.3836 | -94.8653 | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 |
32567 | 32567 | 2019-01-20 13:06 | 4.247920e+12 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 39.5370 | -83.4550 | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0 |
156587 | 156587 | 2019-03-24 18:09 | 4.026220e+12 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 41.5224 | -71.9934 | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0 |
1020243 | 1020243 | 2020-02-25 15:12 | 4.957920e+12 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 39.6171 | -102.4776 | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0 |
116272 | 116272 | 2019-03-06 23:19 | 4.178100e+15 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 41.9488 | -86.4913 | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0 |
5 rows × 23 columns
# df_downsampled = down_sample_textbook(df)
embedding
#G_down = build_graph_bipartite(df_downsampled)
# X,XX,y,yy = embedding(G_down)
learn
# lrnr = RandomForestClassifier(n_estimators=10, random_state=42)
# lrnr.fit(X,y)
evaluate
# evaluate(lrnr,XX,yy)
data1: same analysis, different code
lrnr1, XX_textbook, yy_textbook, results = anal(down_sample_textbook(df), n_estimators=100)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00, 2.42it/s]
data2
read and define data
df = pd.read_csv("~/Desktop/fraudTrain.csv")
lrnr2, _, _, _ = anal(down_sample_textbook(our_sampling1(df)), n_estimators=100)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00, 3.01it/s]
textbook vs proposed
yyhat_textbook = lrnr1.predict(XX_textbook)
yyhat_proposed = lrnr2.predict(XX_textbook)
evaluate(lrnr1, XX_textbook, yy_textbook)
 | pre | rec | f1 |
---|---|---|---|
0 | 0.691654 | 0.761054 | 0.724696 |
evaluate(lrnr2, XX_textbook, yy_textbook)
 | pre | rec | f1 |
---|---|---|---|
0 | 0.521797 | 0.335884 | 0.408691 |
appendix
김보람 ref - https://boram-coco.github.io/coco/posts/Graph%20Machine%20Learning/graph8.html
data: df - "~/Desktop/fraudTrain.csv" // (214520, 23) // here the 214,520 rows are a sample drawn from the full data at a 0.2% rate
obs: individual transactions
var: cc_num (user id), store (merchant), fraud flag, time, region, amt, …
y: fraud flag (is_fraud)
x:
goal: find the fraudulent transactions (y==1); a quick class-balance check follows below
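Since is_fraud is heavily imbalanced, it is worth looking at the class ratio before down-sampling. A minimal sketch, assuming the same CSV path as above:

```python
# assumes fraudTrain.csv at the path used throughout this post
import pandas as pd

df = pd.read_csv("~/Desktop/fraudTrain.csv")
print(df.shape)                                      # (rows, 23)
print(df["is_fraud"].value_counts())                 # absolute counts per class
print(df["is_fraud"].value_counts(normalize=True))   # fraction of fraud vs. non-fraud
```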
The textbook's method
def build_graph_bipartite
1. Transform df to create the from and to columns; from is the edge's source node / to is its target node.
2. From df, select from, to, amt, is_fraud and group by (from, to) => compute the sums of is_fraud and amt.
3. If sum(is_fraud) > 0, set is_fraud = 1.
4. Node set = {customer1, customer2, ..., customer m, merchant1, merchant2, ..., merchant k} => there are 1632 nodes, i.e. m + k = 1632.
5. If any fraud occurred between a customer and a merchant, set the edge label to 1; otherwise set it to 0.
6. Set the summed amt as the edge weight (a toy sketch of the construction follows right after this list).
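A toy sketch of steps 1-6, using a tiny made-up dataframe (the card numbers, merchants, and amounts below are hypothetical) and the build_graph_bipartite defined above:

```python
# assumes the imports and build_graph_bipartite defined earlier in this post
import pandas as pd
import networkx as nx

# tiny made-up example: 2 cards, 2 merchants, one fraudulent transaction
toy = pd.DataFrame({
    "cc_num":   [1111, 1111, 2222, 2222],
    "merchant": ["A",  "B",  "A",  "B"],
    "amt":      [10.0, 25.0, 5.0,  40.0],
    "is_fraud": [0,    1,    0,    0],
})

G_toy = build_graph_bipartite(toy)
print(G_toy.number_of_nodes(), G_toy.number_of_edges())   # 4 nodes, 4 edges
print(nx.get_edge_attributes(G_toy, "label"))             # 1 if any fraud on that card-merchant pair
print(nx.get_edge_attributes(G_toy, "weight"))            # summed amt per edge
```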
main code
df load
df -> df_downsampled
G = build_graph_bipartite(df_downsampled)
tr_edg, test_edg, tr_label, test_label = split(G) (the whole pipeline is put together in the sketch below)
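Putting the outline together with the helpers defined earlier; this is only a sketch that mirrors what anal() does internally, assuming the same CSV path:

```python
# sketch only: assumes the imports and helper functions defined earlier in this post
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df_downsampled = down_sample_textbook(df)        # rebalance fraud / non-fraud
G = build_graph_bipartite(df_downsampled)        # bipartite card-merchant graph
X, XX, y, yy = embedding(G)                      # calls split(G) internally, then node2vec edge embeddings
lrnr = RandomForestClassifier(n_estimators=100, random_state=42)
lrnr.fit(X, y)                                   # fit on the training-edge embeddings
evaluate(lrnr, XX, yy)                           # precision / recall / f1 on the held-out edges
```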