CH8. Graph Analysis of Credit Card Transactions: Code Walkthrough

Category: graph

Author: 김보람

Published: January 18, 2023

import pandas as pd
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
df.head()
/tmp/ipykernel_3133383/372253127.py:3: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
Unnamed: 0 trans_date_trans_time cc_num merchant category amt first last gender street ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
669418 669418 2019-10-12 18:21 4.089100e+18 fraud_Haley, Jewess and Bechtelar shopping_pos 7.53 Debra Stark F 686 Linda Rest ... 32.3836 -94.8653 24536 Multimedia programmer 1983-10-14 d313353fa30233e5fab5468e852d22fc 1350066071 32.202008 -94.371865 0
32567 32567 2019-01-20 13:06 4.247920e+12 fraud_Turner LLC travel 3.79 Judith Moss F 46297 Benjamin Plains Suite 703 ... 39.5370 -83.4550 22305 Television floor manager 1939-03-09 88c65b4e1585934d578511e627fe3589 1327064760 39.156673 -82.930503 0
156587 156587 2019-03-24 18:09 4.026220e+12 fraud_Klein Group entertainment 59.07 Debbie Payne F 204 Ashley Neck Apt. 169 ... 41.5224 -71.9934 4720 Broadcast presenter 1977-05-18 3bd9ede04b5c093143d5e5292940b670 1332612553 41.657152 -72.595751 0
1020243 1020243 2020-02-25 15:12 4.957920e+12 fraud_Monahan-Morar personal_care 25.58 Alan Parsons M 0547 Russell Ford Suite 574 ... 39.6171 -102.4776 207 Network engineer 1955-12-04 19e16ee7a01d229e750359098365e321 1361805120 39.080346 -103.213452 0
116272 116272 2019-03-06 23:19 4.178100e+15 fraud_Kozey-Kuhlman personal_care 84.96 Jill Flores F 639 Cruz Islands ... 41.9488 -86.4913 3104 Horticulturist, commercial 1981-03-29 a0c8641ca1f5d6e243ed5a2246e66176 1331075954 42.502065 -86.732664 0

5 rows × 23 columns

  • Of the 265,342 transactions, 7,506 (2.83%) are fraudulent.
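The FutureWarning above appears because DataFrame.append has since been removed from pandas; pandas.concat does the same job. A drop-in replacement for the sampling cell (a sketch):

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = pd.concat([df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42),
                df[df["is_fraud"] == 1]])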

Bipartite graph

def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  # edge source: the card node
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  # edge target: the merchant node
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G, {(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  # edge attribute "label": 1 if the edge contains any fraudulent transaction
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # edge attribute "weight": total transaction amount on the edge

    return G

- Bipartite graph, step by step

df_ = df.copy()
mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
df["from"]=df["cc_num"].apply(lambda x:mapping[x])  # edge source: the card node
df["to"]=df["merchant"].apply(lambda x:mapping[x])  # edge target: the merchant node

from holds the node id assigned to each unique cc_num, and to holds the node id assigned to each unique merchant.

943+693 # customers + merchants
1636
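The 1,636 can be checked directly against the data (a quick sketch, assuming the df and mapping from the cells above):

print(df["cc_num"].nunique())     # 943 distinct cards (customers)
print(df["merchant"].nunique())   # 693 distinct merchants
print(len(mapping))               # 1636 node ids in total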
df[df['from'] == 1]
Unnamed: 0 trans_date_trans_time cc_num merchant category amt first last gender street ... city_pop job dob trans_num unix_time merch_lat merch_long is_fraud from to
669418 669418 2019-10-12 18:21 4.089100e+18 fraud_Haley, Jewess and Bechtelar shopping_pos 7.53 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 d313353fa30233e5fab5468e852d22fc 1350066071 32.202008 -94.371865 0 1 666
1019480 1019480 2020-02-24 22:42 4.089100e+18 fraud_Boyer PLC shopping_net 7.11 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 fafe649e0bc55f131168b2d9dd84463a 1361745777 33.363174 -94.943839 0 1 864
332666 332666 2019-06-07 15:03 4.089100e+18 fraud_Stiedemann Ltd food_dining 106.83 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 a7bbe4b43fcb572f950109bece88ce26 1339081432 32.376099 -94.801647 0 1 502
665008 665008 2019-10-10 17:11 4.089100e+18 fraud_Altenwerth-Kilback home 89.68 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 1e5f07116dcc5a4fa062168987c121a1 1349889093 31.410590 -95.486031 0 1 652
417225 417225 2019-07-07 11:24 4.089100e+18 fraud_Koepp-Witting grocery_pos 174.12 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 68ede1422d7d6fe0b435661c75cccaa9 1341660240 32.792707 -94.622866 0 1 936
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
907914 907914 2019-12-28 18:05 4.089100e+18 fraud_Hagenes, Kohler and Hoppe food_dining 31.66 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 a339b4ff5619129e30d0914fb264c6b2 1356717951 32.075614 -95.385633 0 1 574
545087 545087 2019-08-21 14:25 4.089100e+18 fraud_Thiel Ltd travel 2.59 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 1b72cd11da52010b7f751de70621d68e 1345559124 31.514842 -94.502117 0 1 875
890185 890185 2019-12-23 22:48 4.089100e+18 fraud_Hyatt, Russel and Gleichner health_fitness 156.26 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 772bdb76fb8e3a365864899d1b7b3a77 1356302914 33.072255 -95.310844 0 1 23
505273 505273 2019-08-07 12:33 4.089100e+18 fraud_Yost, Schamberger and Windler kids_pets 114.13 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 2cdfcd476e3b08f32a78190c1268df55 1344342827 32.321521 -95.143493 0 1 1245
253317 253317 2019-05-06 8:09 4.089100e+18 fraud_Rempel PLC grocery_net 52.49 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 a807c5c9e853e94ec7b1680eed9d46c5 1336291776 33.103470 -95.157733 0 1 1299

251 rows × 25 columns

df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
df
from to is_fraud amt
0 0 12 0 66.10
1 0 14 0 114.02
2 0 17 0 20.24
3 0 23 0 24.30
4 0 24 0 9.14
... ... ... ... ...
169967 1634 1484 0 1.28
169968 1634 1497 0 3.00
169969 1634 1518 0 40.06
169970 1634 1545 0 284.58
169971 1634 1569 1 338.99

169972 rows × 4 columns

# a small example of the same groupby as above

data = {
    'from': ['A', 'B', 'A', 'B', 'A'],
    'to': ['X', 'Y', 'X', 'Y', 'Z'],
    'amt': [100, 200, 150, 300, 120],
    'is_fraud': [0, 1, 0, 1, 0]
}

df = pd.DataFrame(data)

df_grouped = df[['from', 'to', 'amt', 'is_fraud']].groupby(['from', 'to']).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()

print(df_grouped)
  from to  is_fraud  amt
0    A  X         0  250
1    A  Z         0  120
2    B  Y         2  500
df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
df
from to amt is_fraud
0 A X 100 0
1 B Y 200 1
2 A X 150 0
3 B Y 300 1
4 A Z 120 0
G=nx.from_edgelist(df[["from","to"]].values, create_using=nx.Graph())
G
<networkx.classes.graph.Graph at 0x7f576becfb20>
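The repr alone is terse; inspecting the toy graph directly shows that the five edge rows collapse into three undirected edges, because nx.Graph does not keep parallel edges (a quick sketch):

print(G.number_of_nodes(), G.number_of_edges())   # 5 3
print(list(G.edges))                              # the undirected edges A-X, A-Z, B-Y (in some order)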
G?
Type:           Graph
String form:    Graph with 5 nodes and 3 edges
Length:         5
File:           ~/anaconda3/envs/py38/lib/python3.8/site-packages/networkx/classes/graph.py
Docstring:     
Base class for undirected graphs.
A Graph stores nodes and edges with optional data, or attributes.
Graphs hold undirected edges.  Self loops are allowed but multiple
(parallel) edges are not.
...
nx.set_edge_attributes(G, {(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  # edge attribute "label": fraud flag
ValueError: invalid literal for int() with base 10: 'A'
  • The edge attribute label stores whether the transaction is fraudulent.
nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # edge attribute "weight": transaction amount
ValueError: invalid literal for int() with base 10: 'A'
  • The edge attribute weight stores the transaction amount.
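Both ValueErrors are an artifact of the toy example: its from/to values are strings such as 'A', so int() fails. In the real pipeline from and to are already integer node ids, so the casts succeed. The same attribute-setting works on the toy graph if the string labels are used directly (a sketch, reusing df_grouped from the small example above):

label_attr  = {(r["from"], r["to"]): int(r["is_fraud"] > 0) for _, r in df_grouped.iterrows()}
weight_attr = {(r["from"], r["to"]): r["amt"] for _, r in df_grouped.iterrows()}
nx.set_edge_attributes(G, label_attr, "label")
nx.set_edge_attributes(G, weight_attr, "weight")
print(list(G.edges(data=True)))   # e.g. ('A', 'X', {'label': 0, 'weight': 250}), ...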
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  # edge source: the card node
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  # edge target: the merchant node
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G, {(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  # edge attribute "label": 1 if the edge contains any fraudulent transaction
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # edge attribute "weight": total transaction amount on the edge

    return G
G_bu = build_graph_bipartite(df_, nx.Graph(name="Bipartite Undirect"))
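A quick sanity check on the bipartite graph just built (a sketch; the exact counts depend on the sampled data):

from networkx.algorithms import bipartite

print(G_bu.number_of_nodes(), G_bu.number_of_edges())        # about 1,636 card/merchant nodes
print(bipartite.is_bipartite(G_bu))                          # True: edges only connect cards to merchants
print(sum(nx.get_edge_attributes(G_bu, "label").values()))   # edges containing at least one fraudulent transaction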

Tripartite graph

def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}  # node ids for transactions (row index), cards, and merchants
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])   # card node
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])  # merchant node
    
        
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)  # each transaction node links to its card node and its merchant node
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")   # card-transaction edges: fraud flag
     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")  # transaction-merchant edges: fraud flag
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")   # card-transaction edges: amount
    
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")  # transaction-merchant edges: amount
    
    
    return G
    
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
/tmp/ipykernel_3133383/1475241791.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])
df["out_node"]=df["merchant"].apply(lambda x:mapping[x])
df
Unnamed: 0 trans_date_trans_time cc_num merchant category amt first last gender street ... city_pop job dob trans_num unix_time merch_lat merch_long is_fraud in_node out_node
669418 669418 2019-10-12 18:21 4.089100e+18 fraud_Haley, Jewess and Bechtelar shopping_pos 7.53 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 d313353fa30233e5fab5468e852d22fc 1350066071 32.202008 -94.371865 0 128914 194152
32567 32567 2019-01-20 13:06 4.247920e+12 fraud_Turner LLC travel 3.79 Judith Moss F 46297 Benjamin Plains Suite 703 ... 22305 Television floor manager 1939-03-09 88c65b4e1585934d578511e627fe3589 1327064760 39.156673 -82.930503 0 88222 154234
156587 156587 2019-03-24 18:09 4.026220e+12 fraud_Klein Group entertainment 59.07 Debbie Payne F 204 Ashley Neck Apt. 169 ... 4720 Broadcast presenter 1977-05-18 3bd9ede04b5c093143d5e5292940b670 1332612553 41.657152 -72.595751 0 46256 147005
1020243 1020243 2020-02-25 15:12 4.957920e+12 fraud_Monahan-Morar personal_care 25.58 Alan Parsons M 0547 Russell Ford Suite 574 ... 207 Network engineer 1955-12-04 19e16ee7a01d229e750359098365e321 1361805120 39.080346 -103.213452 0 201959 214730
116272 116272 2019-03-06 23:19 4.178100e+15 fraud_Kozey-Kuhlman personal_care 84.96 Jill Flores F 639 Cruz Islands ... 3104 Horticulturist, commercial 1981-03-29 a0c8641ca1f5d6e243ed5a2246e66176 1331075954 42.502065 -86.732664 0 163581 206764
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1047089 1047089 2020-03-10 3:59 3.589290e+15 fraud_Kris-Weimann misc_net 690.49 Paula Estrada F 350 Stacy Glens ... 343 Development worker, international aid 1972-03-05 fb1ddd251bbec9b84c9755e856d51723 1362887989 43.254214 -98.267759 1 58387 162384
1047157 1047157 2020-03-10 4:31 3.546670e+15 fraud_Casper, Hand and Zulauf grocery_pos 324.74 Jordan May M 1626 Susan Course ... 13602 Optometrist 1984-07-05 4dca0549e43b7e265cae7fd8a7e563b4 1362889904 33.607221 -97.996506 1 116978 49539
1047208 1047208 2020-03-10 4:59 3.589290e+15 fraud_Kiehn Inc grocery_pos 331.33 Paula Estrada F 350 Stacy Glens ... 343 Development worker, international aid 1972-03-05 d18c55035998e461aa9040e254b74925 1362891561 44.228731 -98.330520 1 58387 206004
1047521 1047521 2020-03-10 8:22 3.589290e+15 fraud_Rau and Sons grocery_pos 356.20 Paula Estrada F 350 Stacy Glens ... 343 Development worker, international aid 1972-03-05 bdaeb5e3413a408d7e6c3720a35337d5 1362903771 43.988931 -97.989985 1 58387 137085
1047918 1047918 2020-03-10 12:09 3.589290e+15 fraud_O'Connell-Ullrich home 249.56 Paula Estrada F 350 Stacy Glens ... 343 Development worker, international aid 1972-03-05 8f0bac74e340483b44babb0d6d07b85b 1362917373 42.868322 -98.537668 1 58387 63205

214520 rows × 25 columns

 G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=nx.Graph())
nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
  • The label attribute marks whether the transaction is fraudulent.
nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
    
nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])
    
        
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
    
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
    
    
    return G
    
G_tu = build_graph_tripartite(df, nx.Graph())
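In the tripartite graph every transaction is itself a node (mapping[idx]) linked to its card node and its merchant node, so the node count is roughly #cards + #merchants + #transactions and the edge count is about twice the number of transactions. A quick check (sketch):

print(G_tu.number_of_nodes())   # roughly cards + merchants + transactions
print(G_tu.number_of_edges())   # roughly 2 * transactions (one card-transaction edge and one transaction-merchant edge each)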

Supervised learning (bipartite graph)


from sklearn.utils import resample

df_majority = df[df.is_fraud==0]
df_minority = df[df.is_fraud==1]

df_maj_dowsampled = resample(df_majority,
                             n_samples=len(df_minority),
                             random_state=42)

df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled.is_fraud.value_counts())
G_down = build_graph_bipartite(df_downsampled)
1    6006
0    6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split


train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
np.array(list(range(len(G_down.edges))))
np.array(list(nx.get_edge_attributes(G_down, "label").values()))
array([1, 1, 1, ..., 0, 0, 0])
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
#train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
np.array(list(set(G_down.nodes)))[-20:]
array([1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614,
       1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623])
0,1623
(0, 1623)
1623*0.8
1298.4
list(set(G_down.nodes) - set(train_graph.nodes))
[1121, 1608, 1067, 813, 1199, 1584, 593, 473, 1372, 1534]

Question: why take the set difference G_down.nodes - train_graph.nodes? Most likely because edge_subgraph keeps only the nodes incident to the training edges, so the difference is the set of nodes that appear only in test edges; re-adding them (the commented-out add_nodes_from line) lets Node2Vec learn a vector for every node, which the test-edge embedding lookup below relies on.

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00,  2.44it/s]
  • Build the feature space with the Node2Vec algorithm.
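Related to the earlier question about G_down.nodes - train_graph.nodes: Node2Vec only learns vectors for nodes present in train_graph, so a node that appears only in test edges has no vector, and looking up a test edge that touches it can fail. A quick coverage check (a sketch; model_train.wv is the gensim-style keyed-vector store that node2vec produces, with node ids stored as strings):

missing = [n for n in G_down.nodes if str(n) not in model_train.wv]
print(missing)   # nodes without a learned vector; re-adding them to train_graph (add_nodes_from) avoids this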
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    
    rf = RandomForestClassifier(n_estimators=1000, random_state=42) 
    rf.fit(train_embeddings, train_labels); 
    #X=train_embeddings
    #y=train_labels
    #df=[X,y]
    # predictr = TabularPredictor(label='y')
    # predictr.fit(df) 

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred)) 
    print('Recall:', metrics.recall_score(test_labels, y_pred)) 
    print('F1-Score:', metrics.f1_score(test_labels, y_pred)) 
<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.7285714285714285
Recall: 0.13144329896907217
F1-Score: 0.22270742358078602
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.6959247648902821
Recall: 0.7628865979381443
F1-Score: 0.7278688524590164
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.574468085106383
Recall: 0.023195876288659795
F1-Score: 0.04459124690338563
<class 'node2vec.edges.WeightedL2Embedder'>
Precision: 0.5319148936170213
Recall: 0.02147766323024055
F1-Score: 0.041288191577208914

.wv: the learned word vectors (here, one vector per node)

For example, if

edgs = [(1, 2), (2, 3), (3, 4), (4, 5)]

and x = 2, then edgs[x][0] = 3 and edgs[x][1] = 4.
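Conceptually, each of the four edge embedders combines the two endpoint vectors element-wise (a conceptual sketch with NumPy; the actual classes come from node2vec.edges):

import numpy as np

u = model_train.wv[str(edgs[0][0])]   # embedding of one endpoint of the first edge
v = model_train.wv[str(edgs[0][1])]   # embedding of the other endpoint

hadamard = u * v             # HadamardEmbedder: element-wise product
average  = (u + v) / 2.0     # AverageEmbedder: element-wise mean
l1       = np.abs(u - v)     # WeightedL1Embedder: element-wise absolute difference
l2       = (u - v) ** 2      # WeightedL2Embedder: element-wise squared difference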

- Embedding methods (per ChatGPT)

DeepWalk: learns node embeddings by applying skip-gram Word2Vec to random walks on the graph.

GraphSAGE (SAmple and aggreGatE): generates a node's embedding by sampling and aggregating the embeddings of its neighbors.

Graph Attention Networks (GAT): a graph neural network that weights neighboring nodes with attention when learning node embeddings.

TADW (Text-Associated DeepWalk): embeds nodes by combining the network topology with text information.

Supervised learning (tripartite graph)


from sklearn.utils import resample

df_majority = df[df.is_fraud==0]
df_minority = df[df.is_fraud==1]

df_maj_dowsampled = resample(df_majority,
                             n_samples=len(df_minority),
                             random_state=42)

df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled.is_fraud.value_counts())
G_down = build_graph_tripartite(df_downsampled)
1    6006
0    6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split


train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
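With the missing nodes re-added this time, the training graph covers every node of G_down, so Node2Vec can learn a vector for each of them (a quick check sketch):

print(set(train_graph.nodes) == set(G_down.nodes))               # True: same node set, only the edges differ
print(train_graph.number_of_edges(), G_down.number_of_edges())   # roughly 80% vs. 100% of the edges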
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:24<00:00,  2.42s/it]
  • Build the feature space with the Node2Vec algorithm.
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    
    rf = RandomForestClassifier(n_estimators=1000, random_state=42) 
    rf.fit(train_embeddings, train_labels); 

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred)) 
    print('Recall:', metrics.recall_score(test_labels, y_pred)) 
    print('F1-Score:', metrics.f1_score(test_labels, y_pred))