imports

import numpy as np
import pandas as pd
import networkx as nx
import sklearn
# split
from sklearn.model_selection import train_test_split
# embedding
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# models
from sklearn.ensemble import RandomForestClassifier
# evaluation
from sklearn import metrics
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df = df_input.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist() +
                                                          df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source (customer node)
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target (merchant node)
    df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"] for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")   # edge attribute: whether the card-merchant pair contains fraud
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"] for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")   # edge attribute: total transaction amount on the edge
    return G
def down_sample_textbook(df):
    df = df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
    df_majority = df[df.is_fraud == 0]
    df_minority = df[df.is_fraud == 1]
    df_maj_dowsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_dowsampled])
    return df_downsampled
def split(Graph, test_size=0.20, random_state=42):
    edg = list(range(len(Graph.edges)))
    edg_att = list(nx.get_edge_attributes(Graph, "label").values())
    return train_test_split(edg, edg_att, test_size=test_size, random_state=random_state)
def embedding(Graph):
    _edgs = list(Graph.edges)
    _train_edges, _test_edges, y, yy = split(Graph)
    _train_graph = Graph.edge_subgraph([_edgs[x] for x in _train_edges]).copy()
    _train_graph.add_nodes_from(list(set(Graph.nodes) - set(_train_graph.nodes)))
    _embedded = AverageEmbedder(Node2Vec(_train_graph, weight_key='weight').fit(window=10).wv)
    X = [_embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in _train_edges]
    XX = [_embedded[str(_edgs[x][0]), str(_edgs[x][1])] for x in _test_edges]
    return X, XX, y, yy
def evaluate(lrnr, XX, yy):
    yyhat = lrnr.predict(XX)
    df = pd.DataFrame({'pre': [sklearn.metrics.precision_score(yy, yyhat)],
                       'rec': [sklearn.metrics.recall_score(yy, yyhat)],
                       'f1':  [sklearn.metrics.f1_score(yy, yyhat)]})
    return df
def anal(df, n_estimators=10):
    Graph = build_graph_bipartite(df)
    X, XX, y, yy = embedding(Graph)
    lrnr = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    lrnr.fit(X, y)
    return lrnr, XX, yy, evaluate(lrnr, XX, yy)
def our_sampling1(df):
    cus_list = set(df.query('is_fraud==1').cc_num.tolist())
    return df.query("cc_num in @cus_list")
data1
read and define data
df = pd.read_csv("~/Desktop/fraudTrain.csv")
 | Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
669418 | 669418 | 2019-10-12 18:21 | 4.089100e+18 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 32.3836 | -94.8653 | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 |
32567 | 32567 | 2019-01-20 13:06 | 4.247920e+12 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 39.5370 | -83.4550 | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0 |
156587 | 156587 | 2019-03-24 18:09 | 4.026220e+12 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 41.5224 | -71.9934 | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0 |
1020243 | 1020243 | 2020-02-25 15:12 | 4.957920e+12 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 39.6171 | -102.4776 | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0 |
116272 | 116272 | 2019-03-06 23:19 | 4.178100e+15 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 41.9488 | -86.4913 | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0 |
5 rows × 23 columns
# df_downsampled = down_sample_textbook(df)
embedding
#G_down = build_graph_bipartite(df_downsampled)
# X,XX,y,yy = embedding(G_down)
learn
# lrnr = RandomForestClassifier(n_estimators=10, random_state=42)
# lrnr.fit(X,y)
evaluate
# evaluate(lrnr,XX,yy)
data1: same analysis, different code
lrnr1, XX_textbook, yy_textbook, results = anal(down_sample_textbook(df), n_estimators=100)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00, 2.42it/s]
data2
read and define data
df = pd.read_csv("~/Desktop/fraudTrain.csv")
lrnr2, _, _, _ = anal(down_sample_textbook(our_sampling1(df)), n_estimators=100)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00, 3.01it/s]
textbook vs proposed
yyhat_textbook = lrnr1.predict(XX_textbook)
yyhat_proposed = lrnr2.predict(XX_textbook)
evaluate(lrnr1, XX_textbook, yy_textbook)
 | pre | rec | f1 |
---|---|---|---|
0 | 0.691654 | 0.761054 | 0.724696 |
evaluate(lrnr2, XX_textbook, yy_textbook)
 | pre | rec | f1 |
---|---|---|---|
0 | 0.521797 | 0.335884 | 0.408691 |
appendix
김보람 ref - https://boram-coco.github.io/coco/posts/Graph%20Machine%20Learning/graph8.html
data: df - "~/Desktop/fraudTrain.csv" // (214520, 23) // here the 214,520 rows are a sample drawn from the full data at a 0.2% rate
obs: individual transactions
var: cc_num (user id), store (merchant), fraud flag, time, region, amt, …
y: fraud flag (is_fraud)
x:
goal: find the fraudulent transactions (y==1); a quick class-balance check follows below
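Since is_fraud is heavily imbalanced, it is worth looking at the class ratio before down-sampling. A minimal sketch, assuming the same CSV path as above:

```python
# assumes fraudTrain.csv at the path used throughout this post
import pandas as pd

df = pd.read_csv("~/Desktop/fraudTrain.csv")
print(df.shape)                                      # (rows, 23)
print(df["is_fraud"].value_counts())                 # absolute counts per class
print(df["is_fraud"].value_counts(normalize=True))   # fraction of fraud vs. non-fraud
```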
The textbook's method
def build_graph_bipartite
1. Transform df to create the from and to columns; from is the edge's source node / to is its target node.
2. From df, select from, to, amt, is_fraud and group by (from, to) => compute the sums of is_fraud and amt.
3. If sum(is_fraud) > 0, set is_fraud = 1.
4. Node set = {customer1, customer2, ..., customer m, merchant1, merchant2, ..., merchant k} => there are 1632 nodes, i.e. m + k = 1632.
5. If any fraud occurred between a customer and a merchant, set the edge label to 1; otherwise set it to 0.
6. Set the summed amt as the edge weight (a toy sketch of the construction follows right after this list).
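A toy sketch of steps 1-6, using a tiny made-up dataframe (the card numbers, merchants, and amounts below are hypothetical) and the build_graph_bipartite defined above:

```python
# assumes the imports and build_graph_bipartite defined earlier in this post
import pandas as pd
import networkx as nx

# tiny made-up example: 2 cards, 2 merchants, one fraudulent transaction
toy = pd.DataFrame({
    "cc_num":   [1111, 1111, 2222, 2222],
    "merchant": ["A",  "B",  "A",  "B"],
    "amt":      [10.0, 25.0, 5.0,  40.0],
    "is_fraud": [0,    1,    0,    0],
})

G_toy = build_graph_bipartite(toy)
print(G_toy.number_of_nodes(), G_toy.number_of_edges())   # 4 nodes, 4 edges
print(nx.get_edge_attributes(G_toy, "label"))             # 1 if any fraud on that card-merchant pair
print(nx.get_edge_attributes(G_toy, "weight"))            # summed amt per edge
```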
main code
df load
df -> df_downsampled
G = build_graph_bipartite(df_downsampled)
tr_edg, test_edg, tr_label, test_label = split(G) (the whole pipeline is put together in the sketch below)
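Putting the outline together with the helpers defined earlier; this is only a sketch that mirrors what anal() does internally, assuming the same CSV path:

```python
# sketch only: assumes the imports and helper functions defined earlier in this post
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df_downsampled = down_sample_textbook(df)        # rebalance fraud / non-fraud
G = build_graph_bipartite(df_downsampled)        # bipartite card-merchant graph
X, XX, y, yy = embedding(G)                      # calls split(G) internally, then node2vec edge embeddings
lrnr = RandomForestClassifier(n_estimators=100, random_state=42)
lrnr.fit(X, y)                                   # fit on the training-edge embeddings
evaluate(lrnr, XX, yy)                           # precision / recall / f1 on the held-out edges
```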