imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, recall_score, precision_score, accuracy_score
import networkx as nx
# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
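Fraud is only a small fraction of these 1,048,575 transactions, which is what motivates the downsampling step below; a quick check (sketch):

fraudTrain.is_fraud.mean()  # fraction of fraudulent transactions; far below the 0.5 target used next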
def throw(df, percentage):  # downsample non-fraud rows so that fraud makes up `percentage` of the result
    df1 = df[df['is_fraud'] == 1].copy()
    df0 = df[df['is_fraud'] == 0].copy()
    df0_downsample = (len(df1) * (1 - percentage)) / (len(df0) * percentage)
    df0_down = df0.sample(frac=df0_downsample, random_state=42)
    df_p = pd.concat([df1, df0_down])
    return df_p

df = throw(fraudTrain, 0.5)
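By construction, the sampling fraction solves len(df1) / (len(df1) + frac * len(df0)) = percentage, so with percentage=0.5 the result should contain roughly as many non-fraud rows as fraud rows. A quick sanity check (sketch):

df.is_fraud.mean()  # should be close to the requested 0.5
df.shape            # all fraud rows plus an approximately equal-sized sample of non-fraud rows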
autogluon
A. Data
def bipartite(df, node_1, node_2, graph_type=nx.Graph()):
    df = df.copy()
    mapping = {x: node_id for node_id, x in enumerate(set(df[node_1].values.tolist() +
                                                          df[node_2].values.tolist()))}
    df["from"] = df[node_1].apply(lambda x: mapping[x])  # edge source
    df["to"] = df[node_2].apply(lambda x: mapping[x])    # edge target
    # collapse multiple transactions between the same pair into one edge, summing amount and fraud count
    df = df[['from', 'to', "amt", "is_fraud"]].groupby(['from', 'to']).agg({"is_fraud": "sum", "amt": "sum"}).reset_index()
    df["is_fraud"] = df["is_fraud"].apply(lambda x: 1 if x > 0 else 0)  # an edge is fraudulent if any of its transactions is
    G = nx.from_edgelist(df[["from", "to"]].values, create_using=graph_type)
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["is_fraud"] for idx, x in df[["from", "to", "is_fraud"]].iterrows()}, "label")
    nx.set_edge_attributes(G, {(int(x["from"]), int(x["to"])): x["amt"] for idx, x in df[["from", "to", "amt"]].iterrows()}, "weight")
    return G

G = bipartite(df, node_1='cc_num', node_2='merchant')
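It can be worth inspecting the bipartite graph before embedding it (a minimal sketch; `G` is the graph built above):

print(G.number_of_nodes(), G.number_of_edges())   # card holders + merchants, one edge per (card, merchant) pair
edge_labels = nx.get_edge_attributes(G, "label")
print(sum(edge_labels.values()) / len(edge_labels))  # share of edges with at least one fraudulent transaction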
def G_split(G, test_size):
    train_edges, test_edges, train_labels, test_labels = train_test_split(
        list(range(len(G.edges))),
        list(nx.get_edge_attributes(G, "label").values()),
        test_size=test_size,
        random_state=42)
    edgs = list(G.edges)
    train_graph = G.edge_subgraph([edgs[x] for x in train_edges]).copy()
    train_graph.add_nodes_from(list(set(G.nodes) - set(train_graph.nodes)))  # restore nodes dropped by edge_subgraph
    test_graph = G.edge_subgraph([edgs[x] for x in test_edges]).copy()
    test_graph.add_nodes_from(list(set(G.nodes) - set(test_graph.nodes)))
    return train_graph, test_graph

train_graph, test_graph = G_split(G, test_size=0.2)
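Note that edge_subgraph keeps only the endpoints of the selected edges, so add_nodes_from puts the remaining nodes back; both subgraphs therefore share the full node set, which matters later because node2vec must produce an embedding for every node. A quick check (sketch):

assert set(train_graph.nodes) == set(G.nodes)
assert set(test_graph.nodes) == set(G.nodes)
print(len(train_graph.edges), len(test_graph.edges))  # roughly an 80/20 split of the edges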
# repeat the split at top level (same random_state=42) so the edge indices and labels line up with the subgraphs above
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G.edges))),
    list(nx.get_edge_attributes(G, "label").values()),
    test_size=0.2,
    random_state=42)
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
edgs = list(G.edges)
node2vec_train = Node2Vec(train_graph, weight_key='weight')  # biased random walks over the train graph
model_train = node2vec_train.fit(window=10)                  # word2vec over the generated walks
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
# note: each iteration overwrites train_embeddings/test_embeddings,
# so only the last embedder (WeightedL2Embedder) feeds the model below
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:03<00:00, 2.59it/s]
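Each embedder combines the two endpoint vectors of an edge into one edge vector: Hadamard takes the element-wise product, Average the element-wise mean, and WeightedL1/WeightedL2 the element-wise absolute and squared differences. A minimal numpy sketch with toy vectors (illustrative only):

u = np.array([1.0, 2.0, 3.0])   # embedding of one endpoint
v = np.array([4.0, 5.0, 6.0])   # embedding of the other
hadamard = u * v                # HadamardEmbedder
average  = (u + v) / 2          # AverageEmbedder
w_l1     = np.abs(u - v)        # WeightedL1Embedder
w_l2     = (u - v) ** 2         # WeightedL2Embedder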
tr = TabularDataset(pd.DataFrame(train_embeddings).assign(is_fraud=train_labels))   # features + label column
tst = TabularDataset(pd.DataFrame(test_embeddings).assign(is_fraud=test_labels))    # held-out edges, used in D below
B. Creating the predictor
predictr = TabularPredictor(label='is_fraud')  # label is the name of the target column, not the labels themselves
No path specified. Models will be saved in: "AutogluonModels/ag-20240117_122745/"
C. Fitting (fit)
predictr.fit(tr, presets='best_quality')
Presets specified: ['best_quality']
predictr.leaderboard()
 | model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order |
---|---|---|---|---|---|---|---|---|---|
0 | WeightedEnsemble_L2 | 0.894772 | 0.020896 | 4.365760 | 0.009703 | 2.119462 | 2 | True | 14 |
1 | CatBoost_BAG_L1 | 0.894661 | 0.004997 | 1.386780 | 0.004997 | 1.386780 | 1 | True | 7 |
2 | XGBoost_BAG_L1 | 0.894439 | 0.025061 | 0.600274 | 0.025061 | 0.600274 | 1 | True | 11 |
3 | LightGBMLarge_BAG_L1 | 0.894106 | 0.006197 | 0.859518 | 0.006197 | 0.859518 | 1 | True | 13 |
4 | LightGBM_BAG_L1 | 0.893995 | 0.015738 | 0.650386 | 0.015738 | 0.650386 | 1 | True | 4 |
5 | NeuralNetTorch_BAG_L1 | 0.888778 | 0.050014 | 14.929281 | 0.050014 | 14.929281 | 1 | True | 12 |
6 | LightGBMXT_BAG_L1 | 0.885004 | 0.030494 | 0.456313 | 0.030494 | 0.456313 | 1 | True | 3 |
7 | KNeighborsUnif_BAG_L1 | 0.878233 | 0.011929 | 0.005328 | 0.011929 | 0.005328 | 1 | True | 1 |
8 | NeuralNetFastAI_BAG_L1 | 0.867022 | 0.089430 | 7.351443 | 0.089430 | 7.351443 | 1 | True | 10 |
9 | KNeighborsDist_BAG_L1 | 0.864136 | 0.009754 | 0.004292 | 0.009754 | 0.004292 | 1 | True | 2 |
10 | ExtraTreesEntr_BAG_L1 | 0.862582 | 0.211025 | 0.299140 | 0.211025 | 0.299140 | 1 | True | 9 |
11 | ExtraTreesGini_BAG_L1 | 0.862249 | 0.203468 | 0.341149 | 0.203468 | 0.341149 | 1 | True | 8 |
12 | RandomForestEntr_BAG_L1 | 0.856033 | 0.185369 | 0.526263 | 0.185369 | 0.526263 | 1 | True | 6 |
13 | RandomForestGini_BAG_L1 | 0.856033 | 0.190420 | 0.333284 | 0.190420 | 0.333284 | 1 | True | 5 |
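The _BAG_L1 suffixes and the WeightedEnsemble_L2 stacker reflect the bagging and stacking that presets='best_quality' turns on; score_val is AutoGluon's internal validation score, which defaults to accuracy for binary classification.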
D. Prediction (predict)
(tr.is_fraud == predictr.predict(tr)).mean()
0.8967698967698968
(tst.is_fraud == predictr.predict(tst)).mean()
0.9021908791725435
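Since f1_score, recall_score, precision_score, and roc_auc_score were already imported above, the test performance can be summarized beyond plain accuracy; a sketch (predict_proba returns one column per class, so column 1 is the positive class):

yhat = predictr.predict(tst)
print("precision:", precision_score(tst.is_fraud, yhat))
print("recall   :", recall_score(tst.is_fraud, yhat))
print("f1       :", f1_score(tst.is_fraud, yhat))
print("auc      :", roc_auc_score(tst.is_fraud, predictr.predict_proba(tst).iloc[:, 1]))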