CH8. Graph Analysis of Credit Card Transactions: Code Walkthrough

Category: graph

Author: 김보람

Published: January 18, 2023

import pandas as pd
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
df.head()
/tmp/ipykernel_3133383/372253127.py:3: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
Unnamed: 0 trans_date_trans_time cc_num merchant category amt first last gender street ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
669418 669418 2019-10-12 18:21 4.089100e+18 fraud_Haley, Jewess and Bechtelar shopping_pos 7.53 Debra Stark F 686 Linda Rest ... 32.3836 -94.8653 24536 Multimedia programmer 1983-10-14 d313353fa30233e5fab5468e852d22fc 1350066071 32.202008 -94.371865 0
32567 32567 2019-01-20 13:06 4.247920e+12 fraud_Turner LLC travel 3.79 Judith Moss F 46297 Benjamin Plains Suite 703 ... 39.5370 -83.4550 22305 Television floor manager 1939-03-09 88c65b4e1585934d578511e627fe3589 1327064760 39.156673 -82.930503 0
156587 156587 2019-03-24 18:09 4.026220e+12 fraud_Klein Group entertainment 59.07 Debbie Payne F 204 Ashley Neck Apt. 169 ... 41.5224 -71.9934 4720 Broadcast presenter 1977-05-18 3bd9ede04b5c093143d5e5292940b670 1332612553 41.657152 -72.595751 0
1020243 1020243 2020-02-25 15:12 4.957920e+12 fraud_Monahan-Morar personal_care 25.58 Alan Parsons M 0547 Russell Ford Suite 574 ... 39.6171 -102.4776 207 Network engineer 1955-12-04 19e16ee7a01d229e750359098365e321 1361805120 39.080346 -103.213452 0
116272 116272 2019-03-06 23:19 4.178100e+15 fraud_Kozey-Kuhlman personal_care 84.96 Jill Flores F 639 Cruz Islands ... 41.9488 -86.4913 3104 Horticulturist, commercial 1981-03-29 a0c8641ca1f5d6e243ed5a2246e66176 1331075954 42.502065 -86.732664 0

5 rows × 23 columns

  • Of the 265,342 transactions, 7,506 (2.83%) are fraudulent.
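The FutureWarning above appears because DataFrame.append has since been removed from pandas; pandas.concat does the same job. A drop-in replacement for the sampling cell (a sketch):

df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = pd.concat([df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42),
                df[df["is_fraud"] == 1]])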

Bipartite graph

def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  # edge source: the card node
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  # edge target: the merchant node
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G, {(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  # edge attribute "label": 1 if the edge contains any fraudulent transaction
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # edge attribute "weight": total transaction amount on the edge

    return G

- Bipartite graph, step by step

df_ = df.copy()
mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
df["from"]=df["cc_num"].apply(lambda x:mapping[x])  # edge source: the card node
df["to"]=df["merchant"].apply(lambda x:mapping[x])  # edge target: the merchant node

from holds the node id assigned to each unique cc_num, and to holds the node id assigned to each unique merchant.

943+693 # customers + merchants
1636
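The 1,636 can be checked directly against the data (a quick sketch, assuming the df and mapping from the cells above):

print(df["cc_num"].nunique())     # 943 distinct cards (customers)
print(df["merchant"].nunique())   # 693 distinct merchants
print(len(mapping))               # 1636 node ids in total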
df[df['from'] == 1]
Unnamed: 0 trans_date_trans_time cc_num merchant category amt first last gender street ... city_pop job dob trans_num unix_time merch_lat merch_long is_fraud from to
669418 669418 2019-10-12 18:21 4.089100e+18 fraud_Haley, Jewess and Bechtelar shopping_pos 7.53 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 d313353fa30233e5fab5468e852d22fc 1350066071 32.202008 -94.371865 0 1 666
1019480 1019480 2020-02-24 22:42 4.089100e+18 fraud_Boyer PLC shopping_net 7.11 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 fafe649e0bc55f131168b2d9dd84463a 1361745777 33.363174 -94.943839 0 1 864
332666 332666 2019-06-07 15:03 4.089100e+18 fraud_Stiedemann Ltd food_dining 106.83 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 a7bbe4b43fcb572f950109bece88ce26 1339081432 32.376099 -94.801647 0 1 502
665008 665008 2019-10-10 17:11 4.089100e+18 fraud_Altenwerth-Kilback home 89.68 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 1e5f07116dcc5a4fa062168987c121a1 1349889093 31.410590 -95.486031 0 1 652
417225 417225 2019-07-07 11:24 4.089100e+18 fraud_Koepp-Witting grocery_pos 174.12 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 68ede1422d7d6fe0b435661c75cccaa9 1341660240 32.792707 -94.622866 0 1 936
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
907914 907914 2019-12-28 18:05 4.089100e+18 fraud_Hagenes, Kohler and Hoppe food_dining 31.66 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 a339b4ff5619129e30d0914fb264c6b2 1356717951 32.075614 -95.385633 0 1 574
545087 545087 2019-08-21 14:25 4.089100e+18 fraud_Thiel Ltd travel 2.59 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 1b72cd11da52010b7f751de70621d68e 1345559124 31.514842 -94.502117 0 1 875
890185 890185 2019-12-23 22:48 4.089100e+18 fraud_Hyatt, Russel and Gleichner health_fitness 156.26 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 772bdb76fb8e3a365864899d1b7b3a77 1356302914 33.072255 -95.310844 0 1 23
505273 505273 2019-08-07 12:33 4.089100e+18 fraud_Yost, Schamberger and Windler kids_pets 114.13 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 2cdfcd476e3b08f32a78190c1268df55 1344342827 32.321521 -95.143493 0 1 1245
253317 253317 2019-05-06 8:09 4.089100e+18 fraud_Rempel PLC grocery_net 52.49 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 a807c5c9e853e94ec7b1680eed9d46c5 1336291776 33.103470 -95.157733 0 1 1299

251 rows × 25 columns

df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
df
from to is_fraud amt
0 0 12 0 66.10
1 0 14 0 114.02
2 0 17 0 20.24
3 0 23 0 24.30
4 0 24 0 9.14
... ... ... ... ...
169967 1634 1484 0 1.28
169968 1634 1497 0 3.00
169969 1634 1518 0 40.06
169970 1634 1545 0 284.58
169971 1634 1569 1 338.99

169972 rows × 4 columns

# a small example of the same groupby as above

data = {
    'from': ['A', 'B', 'A', 'B', 'A'],
    'to': ['X', 'Y', 'X', 'Y', 'Z'],
    'amt': [100, 200, 150, 300, 120],
    'is_fraud': [0, 1, 0, 1, 0]
}

df = pd.DataFrame(data)

df_grouped = df[['from', 'to', 'amt', 'is_fraud']].groupby(['from', 'to']).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()

print(df_grouped)
  from to  is_fraud  amt
0    A  X         0  250
1    A  Z         0  120
2    B  Y         2  500
df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
df
from to amt is_fraud
0 A X 100 0
1 B Y 200 1
2 A X 150 0
3 B Y 300 1
4 A Z 120 0
G=nx.from_edgelist(df[["from","to"]].values, create_using=nx.Graph())
G
<networkx.classes.graph.Graph at 0x7f576becfb20>
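The repr alone is terse; inspecting the toy graph directly shows that the five edge rows collapse into three undirected edges, because nx.Graph does not keep parallel edges (a quick sketch):

print(G.number_of_nodes(), G.number_of_edges())   # 5 3
print(list(G.edges))                              # the undirected edges A-X, A-Z, B-Y (in some order)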
G?
Type:           Graph
String form:    Graph with 5 nodes and 3 edges
Length:         5
File:           ~/anaconda3/envs/py38/lib/python3.8/site-packages/networkx/classes/graph.py
Docstring:     
Base class for undirected graphs.
A Graph stores nodes and edges with optional data, or attributes.
Graphs hold undirected edges.  Self loops are allowed but multiple
(parallel) edges are not.
...
nx.set_edge_attributes(G, {(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  # edge attribute "label": fraud flag
ValueError: invalid literal for int() with base 10: 'A'
  • The edge attribute label stores whether the transaction is fraudulent.
nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # edge attribute "weight": transaction amount
ValueError: invalid literal for int() with base 10: 'A'
  • The edge attribute weight stores the transaction amount.
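Both ValueErrors are an artifact of the toy example: its from/to values are strings such as 'A', so int() fails. In the real pipeline from and to are already integer node ids, so the casts succeed. The same attribute-setting works on the toy graph if the string labels are used directly (a sketch, reusing df_grouped from the small example above):

label_attr  = {(r["from"], r["to"]): int(r["is_fraud"] > 0) for _, r in df_grouped.iterrows()}
weight_attr = {(r["from"], r["to"]): r["amt"] for _, r in df_grouped.iterrows()}
nx.set_edge_attributes(G, label_attr, "label")
nx.set_edge_attributes(G, weight_attr, "weight")
print(list(G.edges(data=True)))   # e.g. ('A', 'X', {'label': 0, 'weight': 250}), ...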
def build_graph_bipartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()+\
                                                      df["merchant"].values.tolist()))}
    
    df["from"]=df["cc_num"].apply(lambda x:mapping[x])  # edge source: the card node
    df["to"]=df["merchant"].apply(lambda x:mapping[x])  # edge target: the merchant node
    
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from','to']).agg({"is_fraud":"sum","amt":"sum"}).reset_index()
    df["is_fraud"]=df["is_fraud"].apply(lambda x:1 if x>0 else 0)
    
    G=nx.from_edgelist(df[["from","to"]].values, create_using=graph_type)
    
    nx.set_edge_attributes(G, {(int(x["from"]),int(x["to"])):x["is_fraud"] for idx, x in df[["from","to","is_fraud"]].iterrows()}, "label")  # edge attribute "label": 1 if the edge contains any fraudulent transaction
    
    nx.set_edge_attributes(G,{(int(x["from"]),int(x["to"])):x["amt"] for idx,x in df[["from","to","amt"]].iterrows()}, "weight") # edge attribute "weight": total transaction amount on the edge

    return G
G_bu = build_graph_bipartite(df_, nx.Graph(name="Bipartite Undirect"))
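A quick sanity check on the bipartite graph just built (a sketch; the exact counts depend on the sampled data):

from networkx.algorithms import bipartite

print(G_bu.number_of_nodes(), G_bu.number_of_edges())        # about 1,636 card/merchant nodes
print(bipartite.is_bipartite(G_bu))                          # True: edges only connect cards to merchants
print(sum(nx.get_edge_attributes(G_bu, "label").values()))   # edges containing at least one fraudulent transaction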

Tripartite graph

def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}  # node ids for transactions (row index), cards, and merchants
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])   # card node
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])  # merchant node
    
        
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)  # each transaction node links to its card node and its merchant node
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")   # card-transaction edges: fraud flag
     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")  # transaction-merchant edges: fraud flag
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")   # card-transaction edges: amount
    
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")  # transaction-merchant edges: amount
    
    
    return G
    
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
/tmp/ipykernel_3133383/1475241791.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
  df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])
df["out_node"]=df["merchant"].apply(lambda x:mapping[x])
df
Unnamed: 0 trans_date_trans_time cc_num merchant category amt first last gender street ... city_pop job dob trans_num unix_time merch_lat merch_long is_fraud in_node out_node
669418 669418 2019-10-12 18:21 4.089100e+18 fraud_Haley, Jewess and Bechtelar shopping_pos 7.53 Debra Stark F 686 Linda Rest ... 24536 Multimedia programmer 1983-10-14 d313353fa30233e5fab5468e852d22fc 1350066071 32.202008 -94.371865 0 128914 194152
32567 32567 2019-01-20 13:06 4.247920e+12 fraud_Turner LLC travel 3.79 Judith Moss F 46297 Benjamin Plains Suite 703 ... 22305 Television floor manager 1939-03-09 88c65b4e1585934d578511e627fe3589 1327064760 39.156673 -82.930503 0 88222 154234
156587 156587 2019-03-24 18:09 4.026220e+12 fraud_Klein Group entertainment 59.07 Debbie Payne F 204 Ashley Neck Apt. 169 ... 4720 Broadcast presenter 1977-05-18 3bd9ede04b5c093143d5e5292940b670 1332612553 41.657152 -72.595751 0 46256 147005
1020243 1020243 2020-02-25 15:12 4.957920e+12 fraud_Monahan-Morar personal_care 25.58 Alan Parsons M 0547 Russell Ford Suite 574 ... 207 Network engineer 1955-12-04 19e16ee7a01d229e750359098365e321 1361805120 39.080346 -103.213452 0 201959 214730
116272 116272 2019-03-06 23:19 4.178100e+15 fraud_Kozey-Kuhlman personal_care 84.96 Jill Flores F 639 Cruz Islands ... 3104 Horticulturist, commercial 1981-03-29 a0c8641ca1f5d6e243ed5a2246e66176 1331075954 42.502065 -86.732664 0 163581 206764
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1047089 1047089 2020-03-10 3:59 3.589290e+15 fraud_Kris-Weimann misc_net 690.49 Paula Estrada F 350 Stacy Glens ... 343 Development worker, international aid 1972-03-05 fb1ddd251bbec9b84c9755e856d51723 1362887989 43.254214 -98.267759 1 58387 162384
1047157 1047157 2020-03-10 4:31 3.546670e+15 fraud_Casper, Hand and Zulauf grocery_pos 324.74 Jordan May M 1626 Susan Course ... 13602 Optometrist 1984-07-05 4dca0549e43b7e265cae7fd8a7e563b4 1362889904 33.607221 -97.996506 1 116978 49539
1047208 1047208 2020-03-10 4:59 3.589290e+15 fraud_Kiehn Inc grocery_pos 331.33 Paula Estrada F 350 Stacy Glens ... 343 Development worker, international aid 1972-03-05 d18c55035998e461aa9040e254b74925 1362891561 44.228731 -98.330520 1 58387 206004
1047521 1047521 2020-03-10 8:22 3.589290e+15 fraud_Rau and Sons grocery_pos 356.20 Paula Estrada F 350 Stacy Glens ... 343 Development worker, international aid 1972-03-05 bdaeb5e3413a408d7e6c3720a35337d5 1362903771 43.988931 -97.989985 1 58387 137085
1047918 1047918 2020-03-10 12:09 3.589290e+15 fraud_O'Connell-Ullrich home 249.56 Paula Estrada F 350 Stacy Glens ... 343 Development worker, international aid 1972-03-05 8f0bac74e340483b44babb0d6d07b85b 1362917373 42.868322 -98.537668 1 58387 63205

214520 rows × 25 columns

 G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=nx.Graph())
nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
  • The label attribute marks whether the transaction is fraudulent.
nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
    
nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
def build_graph_tripartite(df_input, graph_type=nx.Graph()):
    df=df_input.copy()
    mapping={x:node_id for node_id, x in enumerate(set(df.index.values.tolist() + 
                                                       df["cc_num"].values.tolist() +
                                                       df["merchant"].values.tolist()))}
    df["in_node"]= df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"]=df["merchant"].apply(lambda x:mapping[x])
    
        
    G=nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()] +\
                        [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()], create_using=graph_type)
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
     
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["is_fraud"] for idx, x in df.iterrows()}, "label")
    
    nx.set_edge_attributes(G,{(x["in_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
    
    nx.set_edge_attributes(G,{(x["out_node"], mapping[idx]):x["amt"] for idx, x in df.iterrows()}, "weight")
    
    
    return G
    
G_tu = build_graph_tripartite(df, nx.Graph())
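In the tripartite graph every transaction is itself a node (mapping[idx]) linked to its card node and its merchant node, so the node count is roughly #cards + #merchants + #transactions and the edge count is about twice the number of transactions. A quick check (sketch):

print(G_tu.number_of_nodes())   # roughly cards + merchants + transactions
print(G_tu.number_of_edges())   # roughly 2 * transactions (one card-transaction edge and one transaction-merchant edge each)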

Supervised learning (bipartite graph)


from sklearn.utils import resample

df_majority = df[df.is_fraud==0]
df_minority = df[df.is_fraud==1]

df_maj_dowsampled = resample(df_majority,
                             n_samples=len(df_minority),
                             random_state=42)

df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled.is_fraud.value_counts())
G_down = build_graph_bipartite(df_downsampled)
1    6006
0    6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split


train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
np.array(list(range(len(G_down.edges))))
np.array(list(nx.get_edge_attributes(G_down, "label").values()))
array([1, 1, 1, ..., 0, 0, 0])
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
#train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
np.array(list(set(G_down.nodes)))[-20:]
array([1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614,
       1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623])
0,1623
(0, 1623)
1623*0.8
1298.4
list(set(G_down.nodes) - set(train_graph.nodes))
[1121, 1608, 1067, 813, 1199, 1584, 593, 473, 1372, 1534]

Question: why take the set difference G_down.nodes - train_graph.nodes? Most likely because edge_subgraph keeps only the nodes incident to the training edges, so the difference is the set of nodes that appear only in test edges; re-adding them (the commented-out add_nodes_from line) lets Node2Vec learn a vector for every node, which the test-edge embedding lookup below relies on.

from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00,  2.44it/s]
  • Build the feature space with the Node2Vec algorithm.
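Related to the earlier question about G_down.nodes - train_graph.nodes: Node2Vec only learns vectors for nodes present in train_graph, so a node that appears only in test edges has no vector, and looking up a test edge that touches it can fail. A quick coverage check (a sketch; model_train.wv is the gensim-style keyed-vector store that node2vec produces, with node ids stored as strings):

missing = [n for n in G_down.nodes if str(n) not in model_train.wv]
print(missing)   # nodes without a learned vector; re-adding them to train_graph (add_nodes_from) avoids this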
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    
    rf = RandomForestClassifier(n_estimators=1000, random_state=42) 
    rf.fit(train_embeddings, train_labels); 
    #X=train_embeddings
    #y=train_labels
    #df=[X,y]
    # predictr = TabularPredictor(label='y')
    # predictr.fit(df) 

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred)) 
    print('Recall:', metrics.recall_score(test_labels, y_pred)) 
    print('F1-Score:', metrics.f1_score(test_labels, y_pred)) 
<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.7285714285714285
Recall: 0.13144329896907217
F1-Score: 0.22270742358078602
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.6959247648902821
Recall: 0.7628865979381443
F1-Score: 0.7278688524590164
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.574468085106383
Recall: 0.023195876288659795
F1-Score: 0.04459124690338563
<class 'node2vec.edges.WeightedL2Embedder'>
Precision: 0.5319148936170213
Recall: 0.02147766323024055
F1-Score: 0.041288191577208914

.wv: the learned word vectors (here, one vector per node)

For example, if

edgs = [(1, 2), (2, 3), (3, 4), (4, 5)]

and x = 2, then edgs[x][0] = 3 and edgs[x][1] = 4.
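Conceptually, each of the four edge embedders combines the two endpoint vectors element-wise (a conceptual sketch with NumPy; the actual classes come from node2vec.edges):

import numpy as np

u = model_train.wv[str(edgs[0][0])]   # embedding of one endpoint of the first edge
v = model_train.wv[str(edgs[0][1])]   # embedding of the other endpoint

hadamard = u * v             # HadamardEmbedder: element-wise product
average  = (u + v) / 2.0     # AverageEmbedder: element-wise mean
l1       = np.abs(u - v)     # WeightedL1Embedder: element-wise absolute difference
l2       = (u - v) ** 2      # WeightedL2Embedder: element-wise squared difference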

- Embedding methods (per ChatGPT)

DeepWalk: learns node embeddings by applying skip-gram Word2Vec to random walks on the graph.

GraphSAGE (SAmple and aggreGatE): generates a node's embedding by sampling and aggregating the embeddings of its neighbors.

Graph Attention Networks (GAT): a graph neural network that weights neighboring nodes with attention when learning node embeddings.

TADW (Text-Associated DeepWalk): embeds nodes by combining the network topology with text information.

Supervised learning (tripartite graph)


from sklearn.utils import resample

df_majority = df[df.is_fraud==0]
df_minority = df[df.is_fraud==1]

df_maj_dowsampled = resample(df_majority,
                             n_samples=len(df_minority),
                             random_state=42)

df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled.is_fraud.value_counts())
G_down = build_graph_tripartite(df_downsampled)
1    6006
0    6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split


train_edges, test_edges, train_labels, test_labels = train_test_split(list(range(len(G_down.edges))), 
                                                                      list(nx.get_edge_attributes(G_down, "label").values()), 
                                                                      test_size=0.20, 
                                                                      random_state=42)
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()      
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
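With the missing nodes re-added this time, the training graph covers every node of G_down, so Node2Vec can learn a vector for each of them (a quick check sketch):

print(set(train_graph.nodes) == set(G_down.nodes))               # True: same node set, only the edges differ
print(train_graph.number_of_edges(), G_down.number_of_edges())   # roughly 80% vs. 100% of the edges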
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:24<00:00,  2.42s/it]
  • Build the feature space with the Node2Vec algorithm.
from sklearn.ensemble import RandomForestClassifier 
from sklearn import metrics 

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv) 

    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
    
    rf = RandomForestClassifier(n_estimators=1000, random_state=42) 
    rf.fit(train_embeddings, train_labels); 

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred)) 
    print('Recall:', metrics.recall_score(test_labels, y_pred)) 
    print('F1-Score:', metrics.f1_score(test_labels, y_pred))