import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd

# Load the card-transaction data, then keep a 20% random sample of the
# legitimate transactions plus every fraudulent one.
# NOTE: frame.append is deprecated (it raised a FutureWarning here), so
# pandas.concat is used instead — the resulting frame is identical.
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = pd.concat([df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42),
                df[df["is_fraud"] == 1]])
df.head()
/tmp/ipykernel_3133383/372253127.py:3: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
669418 | 669418 | 2019-10-12 18:21 | 4.089100e+18 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 32.3836 | -94.8653 | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 |
32567 | 32567 | 2019-01-20 13:06 | 4.247920e+12 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 39.5370 | -83.4550 | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0 |
156587 | 156587 | 2019-03-24 18:09 | 4.026220e+12 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 41.5224 | -71.9934 | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0 |
1020243 | 1020243 | 2020-02-25 15:12 | 4.957920e+12 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 39.6171 | -102.4776 | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0 |
116272 | 116272 | 2019-03-06 23:19 | 4.178100e+15 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 41.9488 | -86.4913 | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0 |
5 rows × 23 columns
- 총 265,342건 거래 중 7,506건(2.83%)이 사기
이분그래프
def build_graph_bipartite(df_input, graph_type=None):
    """Build a bipartite card<->merchant graph from a transaction frame.

    Each edge connects a card node (``cc_num``) to a merchant node; the
    parallel transactions between a pair are aggregated, so an edge's
    ``weight`` attribute is the total transacted amount and its ``label``
    attribute is 1 if any of the aggregated transactions was fraudulent.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain ``cc_num``, ``merchant``, ``amt`` and ``is_fraud``.
    graph_type : networkx graph instance, optional
        Graph to populate; defaults to a fresh ``nx.Graph()``.  (Using
        ``nx.Graph()`` directly in the signature would be a mutable default
        shared across calls, hence the ``None`` sentinel.)

    Returns
    -------
    networkx.Graph
        The populated (undirected, unless another type is given) graph.
    """
    if graph_type is None:
        graph_type = nx.Graph()
    df = df_input.copy()
    # One shared integer id space over both cards and merchants.
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()
                                                          + df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source: card node
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target: merchant node

    # Collapse parallel transactions into one edge per (card, merchant) pair.
    df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg(
        {"is_fraud": "sum", "amt": "sum"}).reset_index()
    # Binary edge label: fraudulent if any aggregated transaction was fraud.
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    # Edge attributes: fraud flag ("label") and total amount ("weight").
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"]
                               for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"]
                               for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")
    return G
-
이분그래프
# Step-by-step walk-through of the bipartite construction on the main frame.
df_ = df.copy()
# Shared id space for card numbers and merchant names.
mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()
                                                      + df["merchant"].values.tolist()))}
df["from"] = df["cc_num"].apply(lambda x: mapping[x])   # edge source: card node
df["to"] = df["merchant"].apply(lambda x: mapping[x])   # edge target: merchant node
from은 cc_num 고유의 값이라고 생각. to의 값도 merchant의 고유값
943+693 # 고객 + 상점
1636
'from'] == 1] df[df[
Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | from | to | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
669418 | 669418 | 2019-10-12 18:21 | 4.089100e+18 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 | 1 | 666 |
1019480 | 1019480 | 2020-02-24 22:42 | 4.089100e+18 | fraud_Boyer PLC | shopping_net | 7.11 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | fafe649e0bc55f131168b2d9dd84463a | 1361745777 | 33.363174 | -94.943839 | 0 | 1 | 864 |
332666 | 332666 | 2019-06-07 15:03 | 4.089100e+18 | fraud_Stiedemann Ltd | food_dining | 106.83 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | a7bbe4b43fcb572f950109bece88ce26 | 1339081432 | 32.376099 | -94.801647 | 0 | 1 | 502 |
665008 | 665008 | 2019-10-10 17:11 | 4.089100e+18 | fraud_Altenwerth-Kilback | home | 89.68 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | 1e5f07116dcc5a4fa062168987c121a1 | 1349889093 | 31.410590 | -95.486031 | 0 | 1 | 652 |
417225 | 417225 | 2019-07-07 11:24 | 4.089100e+18 | fraud_Koepp-Witting | grocery_pos | 174.12 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | 68ede1422d7d6fe0b435661c75cccaa9 | 1341660240 | 32.792707 | -94.622866 | 0 | 1 | 936 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
907914 | 907914 | 2019-12-28 18:05 | 4.089100e+18 | fraud_Hagenes, Kohler and Hoppe | food_dining | 31.66 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | a339b4ff5619129e30d0914fb264c6b2 | 1356717951 | 32.075614 | -95.385633 | 0 | 1 | 574 |
545087 | 545087 | 2019-08-21 14:25 | 4.089100e+18 | fraud_Thiel Ltd | travel | 2.59 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | 1b72cd11da52010b7f751de70621d68e | 1345559124 | 31.514842 | -94.502117 | 0 | 1 | 875 |
890185 | 890185 | 2019-12-23 22:48 | 4.089100e+18 | fraud_Hyatt, Russel and Gleichner | health_fitness | 156.26 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | 772bdb76fb8e3a365864899d1b7b3a77 | 1356302914 | 33.072255 | -95.310844 | 0 | 1 | 23 |
505273 | 505273 | 2019-08-07 12:33 | 4.089100e+18 | fraud_Yost, Schamberger and Windler | kids_pets | 114.13 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | 2cdfcd476e3b08f32a78190c1268df55 | 1344342827 | 32.321521 | -95.143493 | 0 | 1 | 1245 |
253317 | 253317 | 2019-05-06 8:09 | 4.089100e+18 | fraud_Rempel PLC | grocery_net | 52.49 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | a807c5c9e853e94ec7b1680eed9d46c5 | 1336291776 | 33.103470 | -95.157733 | 0 | 1 | 1299 |
251 rows × 25 columns
# One row per (card, merchant) pair: summed amount and summed fraud count.
df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg(
    {"is_fraud": "sum", "amt": "sum"}).reset_index()
df
from | to | is_fraud | amt | |
---|---|---|---|---|
0 | 0 | 12 | 0 | 66.10 |
1 | 0 | 14 | 0 | 114.02 |
2 | 0 | 17 | 0 | 20.24 |
3 | 0 | 23 | 0 | 24.30 |
4 | 0 | 24 | 0 | 9.14 |
... | ... | ... | ... | ... |
169967 | 1634 | 1484 | 0 | 1.28 |
169968 | 1634 | 1497 | 0 | 3.00 |
169969 | 1634 | 1518 | 0 | 40.06 |
169970 | 1634 | 1545 | 0 | 284.58 |
169971 | 1634 | 1569 | 1 | 338.99 |
169972 rows × 4 columns
# Toy example mirroring the grouping step above.
data = {
    'from': ['A', 'B', 'A', 'B', 'A'],
    'to': ['X', 'Y', 'X', 'Y', 'Z'],
    'amt': [100, 200, 150, 300, 120],
    'is_fraud': [0, 1, 0, 1, 0]
}
df = pd.DataFrame(data)

# Sum amount and fraud count per (from, to) pair, as done on the real data.
df_grouped = df[['from', 'to', 'amt', 'is_fraud']].groupby(['from', 'to']).agg(
    {"is_fraud": "sum", "amt": "sum"}).reset_index()

print(df_grouped)
from to is_fraud amt
0 A X 0 250
1 A Z 0 120
2 B Y 2 500
# Collapse the aggregated fraud count into a binary flag: any fraud -> 1.
df["is_fraud"] = (df["is_fraud"] > 0).astype(int)
df
from | to | amt | is_fraud | |
---|---|---|---|---|
0 | A | X | 100 | 0 |
1 | B | Y | 200 | 1 |
2 | A | X | 150 | 0 |
3 | B | Y | 300 | 1 |
4 | A | Z | 120 | 0 |
# Build an undirected graph from the (from, to) pairs of the toy frame.
G = nx.from_edgelist(df[["from", "to"]].values, create_using=nx.Graph())
G
<networkx.classes.graph.Graph at 0x7f576becfb20>
G?
Type: Graph String form: Graph with 5 nodes and 3 edges Length: 5 File: ~/anaconda3/envs/py38/lib/python3.8/site-packages/networkx/classes/graph.py Docstring: Base class for undirected graphs. A Graph stores nodes and edges with optional data, or attributes. Graphs hold undirected edges. Self loops are allowed but multiple (parallel) edges are not. Nodes can be arbitrary (hashable) Python objects with optional key/value attributes, except that `None` is not allowed as a node. Edges are represented as links between nodes with optional key/value attributes. Parameters ---------- incoming_graph_data : input graph (optional, default: None) Data to initialize graph. If None (default) an empty graph is created. The data can be any format that is supported by the to_networkx_graph() function, currently including edge list, dict of dicts, dict of lists, NetworkX graph, 2D NumPy array, SciPy sparse matrix, or PyGraphviz graph. attr : keyword arguments, optional (default= no attributes) Attributes to add to graph as key=value pairs. See Also -------- DiGraph MultiGraph MultiDiGraph Examples -------- Create an empty graph structure (a "null graph") with no nodes and no edges. >>> G = nx.Graph() G can be grown in several ways. **Nodes:** Add one node at a time: >>> G.add_node(1) Add the nodes from any container (a list, dict, set or even the lines from a file or the nodes from another graph). >>> G.add_nodes_from([2, 3]) >>> G.add_nodes_from(range(100, 110)) >>> H = nx.path_graph(10) >>> G.add_nodes_from(H) In addition to strings and integers any hashable Python object (except None) can represent a node, e.g. a customized node object, or even another Graph. >>> G.add_node(H) **Edges:** G can also be grown by adding edges. Add one edge, >>> G.add_edge(1, 2) a list of edges, >>> G.add_edges_from([(1, 2), (1, 3)]) or a collection of edges, >>> G.add_edges_from(H.edges) If some edges connect nodes not yet in the graph, the nodes are added automatically. 
There are no errors when adding nodes or edges that already exist. **Attributes:** Each graph, node, and edge can hold key/value attribute pairs in an associated attribute dictionary (the keys must be hashable). By default these are empty, but can be added or changed using add_edge, add_node or direct manipulation of the attribute dictionaries named graph, node and edge respectively. >>> G = nx.Graph(day="Friday") >>> G.graph {'day': 'Friday'} Add node attributes using add_node(), add_nodes_from() or G.nodes >>> G.add_node(1, time="5pm") >>> G.add_nodes_from([3], time="2pm") >>> G.nodes[1] {'time': '5pm'} >>> G.nodes[1]["room"] = 714 # node must exist already to use G.nodes >>> del G.nodes[1]["room"] # remove attribute >>> list(G.nodes(data=True)) [(1, {'time': '5pm'}), (3, {'time': '2pm'})] Add edge attributes using add_edge(), add_edges_from(), subscript notation, or G.edges. >>> G.add_edge(1, 2, weight=4.7) >>> G.add_edges_from([(3, 4), (4, 5)], color="red") >>> G.add_edges_from([(1, 2, {"color": "blue"}), (2, 3, {"weight": 8})]) >>> G[1][2]["weight"] = 4.7 >>> G.edges[1, 2]["weight"] = 4 Warning: we protect the graph data structure by making `G.edges` a read-only dict-like structure. However, you can assign to attributes in e.g. `G.edges[1, 2]`. Thus, use 2 sets of brackets to add/change data attributes: `G.edges[1, 2]['weight'] = 4` (For multigraphs: `MG.edges[u, v, key][name] = value`). **Shortcuts:** Many common graph features allow python syntax to speed reporting. >>> 1 in G # check if node in graph True >>> [n for n in G if n < 3] # iterate through nodes [1, 2] >>> len(G) # number of nodes in graph 5 Often the best way to traverse all edges of a graph is via the neighbors. The neighbors are reported as an adjacency-dict `G.adj` or `G.adjacency()` >>> for n, nbrsdict in G.adjacency(): ... for nbr, eattr in nbrsdict.items(): ... if "weight" in eattr: ... # Do something useful with the edges ... 
pass But the edges() method is often more convenient: >>> for u, v, weight in G.edges.data("weight"): ... if weight is not None: ... # Do something useful with the edges ... pass **Reporting:** Simple graph information is obtained using object-attributes and methods. Reporting typically provides views instead of containers to reduce memory usage. The views update as the graph is updated similarly to dict-views. The objects `nodes`, `edges` and `adj` provide access to data attributes via lookup (e.g. `nodes[n]`, `edges[u, v]`, `adj[u][v]`) and iteration (e.g. `nodes.items()`, `nodes.data('color')`, `nodes.data('color', default='blue')` and similarly for `edges`) Views exist for `nodes`, `edges`, `neighbors()`/`adj` and `degree`. For details on these and other miscellaneous methods, see below. **Subclasses (Advanced):** The Graph class uses a dict-of-dict-of-dict data structure. The outer dict (node_dict) holds adjacency information keyed by node. The next dict (adjlist_dict) represents the adjacency information and holds edge data keyed by neighbor. The inner dict (edge_attr_dict) represents the edge data and holds edge attribute values keyed by attribute names. Each of these three dicts can be replaced in a subclass by a user defined dict-like object. In general, the dict-like features should be maintained but extra features can be added. To replace one of the dicts create a new graph class by changing the class(!) variable holding the factory for that dict-like structure. node_dict_factory : function, (default: dict) Factory function to be used to create the dict containing node attributes, keyed by node id. It should require no arguments and return a dict-like object node_attr_dict_factory: function, (default: dict) Factory function to be used to create the node attribute dict which holds attribute values keyed by attribute name. 
It should require no arguments and return a dict-like object adjlist_outer_dict_factory : function, (default: dict) Factory function to be used to create the outer-most dict in the data structure that holds adjacency info keyed by node. It should require no arguments and return a dict-like object. adjlist_inner_dict_factory : function, (default: dict) Factory function to be used to create the adjacency list dict which holds edge data keyed by neighbor. It should require no arguments and return a dict-like object edge_attr_dict_factory : function, (default: dict) Factory function to be used to create the edge attribute dict which holds attribute values keyed by attribute name. It should require no arguments and return a dict-like object. graph_attr_dict_factory : function, (default: dict) Factory function to be used to create the graph attribute dict which holds attribute values keyed by attribute name. It should require no arguments and return a dict-like object. Typically, if your extension doesn't impact the data structure all methods will inherit without issue except: `to_directed/to_undirected`. By default these methods create a DiGraph/Graph class and you probably want them to create your extension of a DiGraph/Graph. To facilitate this we define two class variables that you can set in your subclass. to_directed_class : callable, (default: DiGraph or MultiDiGraph) Class to create a new graph structure in the `to_directed` method. If `None`, a NetworkX class (DiGraph or MultiDiGraph) is used. to_undirected_class : callable, (default: Graph or MultiGraph) Class to create a new graph structure in the `to_undirected` method. If `None`, a NetworkX class (Graph or MultiGraph) is used. **Subclassing Example** Create a low memory graph class that effectively disallows edge attributes by using a single attribute dict for all edges. This reduces the memory used, but you lose edge attributes. >>> class ThinGraph(nx.Graph): ... all_edge_dict = {"weight": 1} ... ... 
def single_edge_dict(self): ... return self.all_edge_dict ... ... edge_attr_dict_factory = single_edge_dict >>> G = ThinGraph() >>> G.add_edge(2, 1) >>> G[2][1] {'weight': 1} >>> G.add_edge(2, 2) >>> G[2][1] is G[2][2] True Init docstring: Initialize a graph with edges, name, or graph attributes. Parameters ---------- incoming_graph_data : input graph (optional, default: None) Data to initialize graph. If None (default) an empty graph is created. The data can be an edge list, or any NetworkX graph object. If the corresponding optional Python packages are installed the data can also be a 2D NumPy array, a SciPy sparse array, or a PyGraphviz graph. attr : keyword arguments, optional (default= no attributes) Attributes to add to graph as key=value pairs. See Also -------- convert Examples -------- >>> G = nx.Graph() # or DiGraph, MultiGraph, MultiDiGraph, etc >>> G = nx.Graph(name="my graph") >>> e = [(1, 2), (2, 3), (3, 4)] # list of edges >>> G = nx.Graph(e) Arbitrary graph attribute pairs (key=value) may be assigned >>> G = nx.Graph(e, day="Friday") >>> G.graph {'day': 'Friday'}
# Attach the fraud flag ("label") and the amount ("weight") to each edge.
# NOTE(review): on the toy example these calls raise
# ValueError: invalid literal for int() with base 10: 'A'
# because the toy node ids are strings ('A', 'X', ...), not integers —
# int(x["from"]) only works on the integer-mapped real data.
nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"]
                           for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")
nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"]
                           for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")
def build_graph_bipartite(df_input, graph_type=None):
    """Build a bipartite card<->merchant graph from a transaction frame.

    Edges connect a card (``cc_num``) to a merchant; parallel transactions
    between a pair are aggregated, so an edge's ``weight`` is the total
    amount and its ``label`` is 1 if any aggregated transaction was fraud.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain ``cc_num``, ``merchant``, ``amt`` and ``is_fraud``.
    graph_type : networkx graph instance, optional
        Graph to populate; defaults to a fresh ``nx.Graph()``.  (A literal
        ``nx.Graph()`` default would be a mutable default object shared
        across calls, so ``None`` is used as the sentinel instead.)

    Returns
    -------
    networkx.Graph
    """
    if graph_type is None:
        graph_type = nx.Graph()
    df = df_input.copy()
    # One shared integer id space over both cards and merchants.
    mapping = {x: node_id for node_id, x in enumerate(set(df["cc_num"].values.tolist()
                                                          + df["merchant"].values.tolist()))}
    df["from"] = df["cc_num"].apply(lambda x: mapping[x])    # edge source: card node
    df["to"] = df["merchant"].apply(lambda x: mapping[x])    # edge target: merchant node

    # Collapse parallel transactions into one edge per (card, merchant) pair.
    df = df[["from", "to", "amt", "is_fraud"]].groupby(["from", "to"]).agg(
        {"is_fraud": "sum", "amt": "sum"}).reset_index()
    # Binary edge label: fraudulent if any aggregated transaction was fraud.
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)

    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"]
                               for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"]
                               for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")
    return G
= build_graph_bipartite(df_, nx.Graph(name="Bipartite Undirect")) G_bu
삼분그래프
def build_graph_tripartite(df_input, graph_type=None):
    """Build a tripartite graph: card -> transaction -> merchant.

    Every transaction row becomes its own node (keyed by the frame index)
    linked to its card node (``in_node``) and its merchant node
    (``out_node``).  Both edges of a transaction carry the transaction's
    fraud flag as ``label`` and its amount as ``weight``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain ``cc_num``, ``merchant``, ``amt`` and ``is_fraud``.
    graph_type : networkx graph instance, optional
        Graph to populate; defaults to a fresh ``nx.Graph()``.  (A literal
        ``nx.Graph()`` default would be a mutable default object shared
        across calls, so ``None`` is used as the sentinel instead.)

    Returns
    -------
    networkx.Graph
    """
    if graph_type is None:
        graph_type = nx.Graph()
    df = df_input.copy()
    # One shared id space over transaction indices, cards and merchants.
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist()
                                                          + df["cc_num"].values.tolist()
                                                          + df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # Two edges per transaction: card<->txn and merchant<->txn.
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()]
                         + [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()],
                         create_using=graph_type)

    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"]
                               for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"]
                               for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"]
                               for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"]
                               for idx, x in df.iterrows()}, "weight")
    return G
# Reload the full dataset and rebuild the downsampled frame.
# NOTE: frame.append is deprecated (it raised a FutureWarning here), so
# pandas.concat is used instead — the resulting frame is identical.
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = pd.concat([df[df["is_fraud"] == 0].sample(frac=0.20, random_state=42),
                df[df["is_fraud"] == 1]])
/tmp/ipykernel_3133383/1475241791.py:2: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.
df = df[df["is_fraud"]==0].sample(frac=0.20, random_state=42).append(df[df["is_fraud"] == 1])
# Step-by-step walk-through of the tripartite construction: one shared id
# space over transaction indices, card numbers and merchant names.
mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist()
                                                      + df["cc_num"].values.tolist()
                                                      + df["merchant"].values.tolist()))}
df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
df["out_node"] = df["merchant"].apply(lambda x: mapping[x])
df
Unnamed: 0 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | ... | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud | in_node | out_node | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
669418 | 669418 | 2019-10-12 18:21 | 4.089100e+18 | fraud_Haley, Jewess and Bechtelar | shopping_pos | 7.53 | Debra | Stark | F | 686 Linda Rest | ... | 24536 | Multimedia programmer | 1983-10-14 | d313353fa30233e5fab5468e852d22fc | 1350066071 | 32.202008 | -94.371865 | 0 | 128914 | 194152 |
32567 | 32567 | 2019-01-20 13:06 | 4.247920e+12 | fraud_Turner LLC | travel | 3.79 | Judith | Moss | F | 46297 Benjamin Plains Suite 703 | ... | 22305 | Television floor manager | 1939-03-09 | 88c65b4e1585934d578511e627fe3589 | 1327064760 | 39.156673 | -82.930503 | 0 | 88222 | 154234 |
156587 | 156587 | 2019-03-24 18:09 | 4.026220e+12 | fraud_Klein Group | entertainment | 59.07 | Debbie | Payne | F | 204 Ashley Neck Apt. 169 | ... | 4720 | Broadcast presenter | 1977-05-18 | 3bd9ede04b5c093143d5e5292940b670 | 1332612553 | 41.657152 | -72.595751 | 0 | 46256 | 147005 |
1020243 | 1020243 | 2020-02-25 15:12 | 4.957920e+12 | fraud_Monahan-Morar | personal_care | 25.58 | Alan | Parsons | M | 0547 Russell Ford Suite 574 | ... | 207 | Network engineer | 1955-12-04 | 19e16ee7a01d229e750359098365e321 | 1361805120 | 39.080346 | -103.213452 | 0 | 201959 | 214730 |
116272 | 116272 | 2019-03-06 23:19 | 4.178100e+15 | fraud_Kozey-Kuhlman | personal_care | 84.96 | Jill | Flores | F | 639 Cruz Islands | ... | 3104 | Horticulturist, commercial | 1981-03-29 | a0c8641ca1f5d6e243ed5a2246e66176 | 1331075954 | 42.502065 | -86.732664 | 0 | 163581 | 206764 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1047089 | 1047089 | 2020-03-10 3:59 | 3.589290e+15 | fraud_Kris-Weimann | misc_net | 690.49 | Paula | Estrada | F | 350 Stacy Glens | ... | 343 | Development worker, international aid | 1972-03-05 | fb1ddd251bbec9b84c9755e856d51723 | 1362887989 | 43.254214 | -98.267759 | 1 | 58387 | 162384 |
1047157 | 1047157 | 2020-03-10 4:31 | 3.546670e+15 | fraud_Casper, Hand and Zulauf | grocery_pos | 324.74 | Jordan | May | M | 1626 Susan Course | ... | 13602 | Optometrist | 1984-07-05 | 4dca0549e43b7e265cae7fd8a7e563b4 | 1362889904 | 33.607221 | -97.996506 | 1 | 116978 | 49539 |
1047208 | 1047208 | 2020-03-10 4:59 | 3.589290e+15 | fraud_Kiehn Inc | grocery_pos | 331.33 | Paula | Estrada | F | 350 Stacy Glens | ... | 343 | Development worker, international aid | 1972-03-05 | d18c55035998e461aa9040e254b74925 | 1362891561 | 44.228731 | -98.330520 | 1 | 58387 | 206004 |
1047521 | 1047521 | 2020-03-10 8:22 | 3.589290e+15 | fraud_Rau and Sons | grocery_pos | 356.20 | Paula | Estrada | F | 350 Stacy Glens | ... | 343 | Development worker, international aid | 1972-03-05 | bdaeb5e3413a408d7e6c3720a35337d5 | 1362903771 | 43.988931 | -97.989985 | 1 | 58387 | 137085 |
1047918 | 1047918 | 2020-03-10 12:09 | 3.589290e+15 | fraud_O'Connell-Ullrich | home | 249.56 | Paula | Estrada | F | 350 Stacy Glens | ... | 343 | Development worker, international aid | 1972-03-05 | 8f0bac74e340483b44babb0d6d07b85b | 1362917373 | 42.868322 | -98.537668 | 1 | 58387 | 63205 |
214520 rows × 25 columns
# Two edges per transaction: card<->txn and merchant<->txn.
G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()]
                     + [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()],
                     create_using=nx.Graph())
# "label" marks whether the edge belongs to a fraudulent transaction.
nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"]
                           for idx, x in df.iterrows()}, "label")
nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"]
                           for idx, x in df.iterrows()}, "label")
# "weight" carries the transaction amount.
nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"]
                           for idx, x in df.iterrows()}, "weight")
nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"]
                           for idx, x in df.iterrows()}, "weight")
def build_graph_tripartite(df_input, graph_type=None):
    """Build a tripartite graph: card -> transaction -> merchant.

    Every transaction row becomes its own node (keyed by the frame index)
    linked to its card node (``in_node``) and its merchant node
    (``out_node``).  Both edges of a transaction carry the transaction's
    fraud flag as ``label`` and its amount as ``weight``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain ``cc_num``, ``merchant``, ``amt`` and ``is_fraud``.
    graph_type : networkx graph instance, optional
        Graph to populate; defaults to a fresh ``nx.Graph()``.  (A literal
        ``nx.Graph()`` default would be a mutable default object shared
        across calls, so ``None`` is used as the sentinel instead.)

    Returns
    -------
    networkx.Graph
    """
    if graph_type is None:
        graph_type = nx.Graph()
    df = df_input.copy()
    # One shared id space over transaction indices, cards and merchants.
    mapping = {x: node_id for node_id, x in enumerate(set(df.index.values.tolist()
                                                          + df["cc_num"].values.tolist()
                                                          + df["merchant"].values.tolist()))}
    df["in_node"] = df["cc_num"].apply(lambda x: mapping[x])
    df["out_node"] = df["merchant"].apply(lambda x: mapping[x])

    # Two edges per transaction: card<->txn and merchant<->txn.
    G = nx.from_edgelist([(x["in_node"], mapping[idx]) for idx, x in df.iterrows()]
                         + [(x["out_node"], mapping[idx]) for idx, x in df.iterrows()],
                         create_using=graph_type)

    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["is_fraud"]
                               for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["is_fraud"]
                               for idx, x in df.iterrows()}, "label")
    nx.set_edge_attributes(G, {(x["in_node"], mapping[idx]): x["amt"]
                               for idx, x in df.iterrows()}, "weight")
    nx.set_edge_attributes(G, {(x["out_node"], mapping[idx]): x["amt"]
                               for idx, x in df.iterrows()}, "weight")
    return G
= build_graph_tripartite(df, nx.Graph()) G_tu
지도학습(이분그래프)
from sklearn.utils import resample

# Balance the classes: downsample legitimate rows to the fraud count.
df_majority = df[df.is_fraud == 0]
df_minority = df[df.is_fraud == 1]

df_maj_dowsampled = resample(df_majority,
                             n_samples=len(df_minority),
                             random_state=42)

df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled.is_fraud.value_counts())
G_down = build_graph_bipartite(df_downsampled)
1 6006
0 6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split

# Split edge indices and their fraud labels 80/20.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42)
list(range(len(G_down.edges))))
np.array(list(nx.get_edge_attributes(G_down, "label").values())) np.array(
array([1, 1, 1, ..., 0, 0, 0])
edgs = list(G_down.edges)
# Subgraph induced by the training edges only; nodes touched by no training
# edge are left out here (the commented line would add them back).
train_graph = G_down.edge_subgraph([edgs[i] for i in train_edges]).copy()
#train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
list(set(G_down.nodes)))[-20:] np.array(
array([1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614,
1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623])
0,1623
(0, 1623)
1623*0.8
1298.4
list(set(G_down.nodes) - set(train_graph.nodes))
[1121, 1608, 1067, 813, 1199, 1584, 593, 473, 1372, 1534]
질문! 왜 G_down.nodes - train_graph.nodes 를 하는거지?
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# Learn node embeddings on the training subgraph via weighted random walks.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:04<00:00, 2.44it/s]
- Node2Vec 알고리즘 사용해 특징 공간 구축
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Compare four edge-embedding strategies with the same classifier setup.
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)

    # Edge feature = embedding of its (u, v) endpoint pair.
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred))
    print('Recall:', metrics.recall_score(test_labels, y_pred))
    print('F1-Score:', metrics.f1_score(test_labels, y_pred))
<class 'node2vec.edges.HadamardEmbedder'>
Precision: 0.7285714285714285
Recall: 0.13144329896907217
F1-Score: 0.22270742358078602
<class 'node2vec.edges.AverageEmbedder'>
Precision: 0.6959247648902821
Recall: 0.7628865979381443
F1-Score: 0.7278688524590164
<class 'node2vec.edges.WeightedL1Embedder'>
Precision: 0.574468085106383
Recall: 0.023195876288659795
F1-Score: 0.04459124690338563
<class 'node2vec.edges.WeightedL2Embedder'>
Precision: 0.5319148936170213
Recall: 0.02147766323024055
F1-Score: 0.041288191577208914
.wv : 단어 벡터
만약
edgs = [(1, 2), (2, 3), (3, 4), (4, 5)]
x=2라면 이라면, edge[x=2][0] = 3 edge[x=2][1] = 4 이다.
-
임베딩 방법(ChatGPT)
DeepWalk: 노드의 임베딩을 학습하기 위해 Skip-gram 방식의 Word2Vec을 그래프에 적용하는 방법입니다.
GraphSAGE (Graph Sample and Aggregated Embedding): 각 노드에 대해 이웃 노드들의 임베딩을 집계하여 해당 노드의 임베딩을 생성하는 방법입니다.
Graph Attention Networks (GAT): 노드 간의 관계를 고려한 그래프 신경망으로, 노드에 대한 임베딩을 학습할 수 있습니다.
TADW (Text-Associated DeepWalk): 토폴로지와 텍스트 정보를 결합하여 노드를 임베딩하는 방법입니다.
지도학습(삼분그래프)
from sklearn.utils import resample

# Balance the classes: downsample legitimate rows to the fraud count.
df_majority = df[df.is_fraud == 0]
df_minority = df[df.is_fraud == 1]

df_maj_dowsampled = resample(df_majority,
                             n_samples=len(df_minority),
                             random_state=42)

df_downsampled = pd.concat([df_minority, df_maj_dowsampled])

print(df_downsampled.is_fraud.value_counts())
G_down = build_graph_tripartite(df_downsampled)
1 6006
0 6006
Name: is_fraud, dtype: int64
from sklearn.model_selection import train_test_split

# Split edge indices and their fraud labels 80/20.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42)

edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[i] for i in train_edges]).copy()
# Re-add nodes not covered by any training edge so that every node of
# G_down exists in the training graph.
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder

# Learn node embeddings on the tripartite training graph.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:24<00:00, 2.42s/it]
- Node2Vec 알고리즘 사용해 특징 공간 구축
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Same embedder comparison as above, now on the tripartite graph.
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)

    # Edge feature = embedding of its (u, v) endpoint pair.
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

    rf = RandomForestClassifier(n_estimators=1000, random_state=42)
    rf.fit(train_embeddings, train_labels)

    y_pred = rf.predict(test_embeddings)
    print(cl)
    print('Precision:', metrics.precision_score(test_labels, y_pred))
    print('Recall:', metrics.recall_score(test_labels, y_pred))
    print('F1-Score:', metrics.f1_score(test_labels, y_pred))