import pandas as pd
import os
import math
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# sklearn
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
def throw(df, fraud_rate):
    """Discard legitimate rows so that frauds make up ``fraud_rate`` of the result.

    All rows with ``is_fraud == 1`` are kept; rows with ``is_fraud == 0`` are
    randomly downsampled (fixed seed 42) to reach the requested ratio.

    Args:
        df: DataFrame with an ``is_fraud`` column holding 0/1 labels.
        fraud_rate: Desired share of fraud rows in the output, in (0, 1].

    Returns:
        A new DataFrame: every fraud row followed by the sampled normal rows.
        Original index labels are preserved.

    Raises:
        ValueError: If ``fraud_rate`` is outside (0, 1], or the requested rate
            would need more normal rows than ``df`` contains.
    """
    if not 0 < fraud_rate <= 1:
        raise ValueError(f"fraud_rate must be in (0, 1], got {fraud_rate!r}")

    fraud_rows = df[df['is_fraud'] == 1].copy()
    normal_rows = df[df['is_fraud'] == 0].copy()

    # Number of normal rows needed for the target ratio.
    wanted_normal = len(fraud_rows) * (1 - fraud_rate) / fraud_rate
    if wanted_normal > len(normal_rows):
        raise ValueError(
            "not enough non-fraud rows to reach fraud_rate="
            f"{fraud_rate!r}: need {wanted_normal:.0f}, have {len(normal_rows)}"
        )
    if len(normal_rows) == 0:
        # Only reachable when no normal rows are needed either.
        return fraud_rows

    keep_fraction = (len(fraud_rows) * (1 - fraud_rate)) / (len(normal_rows) * fraud_rate)
    normal_sampled = normal_rows.sample(frac=keep_fraction, random_state=42)
    return pd.concat([fraud_rows, normal_sampled])
def split_dataframe(data_frame, test_fraud_rate, test_rate=0.3, random_state=None):
    """Split ``data_frame`` into train/test with a chosen fraud share in the test part.

    Args:
        data_frame: DataFrame with an ``is_fraud`` column holding 0/1 labels.
            Rows are assigned to train by index label, so the index is
            expected to be unique.
        test_fraud_rate: Share of fraud rows wanted in the test split.
        test_rate: Share of all rows that go to the test split.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            split can be reproduced. ``None`` keeps the sampling random.

    Returns:
        ``(train_data, test_data)``. The test split lists the sampled normal
        rows first, then the sampled fraud rows.

    Raises:
        ValueError: Raised by pandas when more fraud or normal rows are
            requested than exist (sampling is without replacement).
    """
    n = len(data_frame)

    # Separate fraudulent and legitimate transactions.
    fraud_data = data_frame[data_frame['is_fraud'] == 1]
    normal_data = data_frame[data_frame['is_fraud'] == 0]

    # Size of each part of the test split.
    test_samples = int(test_fraud_rate * (n * test_rate))
    remaining_test_samples = int(n * test_rate) - test_samples

    # Draw the test rows from each class without replacement.
    test_fraud_data = fraud_data.sample(
        n=test_samples, replace=False, random_state=random_state
    )
    test_normal_data = normal_data.sample(
        n=remaining_test_samples, replace=False, random_state=random_state
    )

    # Assemble the test split.
    test_data = pd.concat([test_normal_data, test_fraud_data])

    # Everything not drawn for testing is training data.
    train_data = data_frame[~data_frame.index.isin(test_data.index)]
    return train_data, test_data
def concat(df_tr, df_tst):
    """Stack train and test frames and return positional membership masks.

    Args:
        df_tr: Training DataFrame.
        df_tst: Test DataFrame.

    Returns:
        ``(df, (train_mask, test_mask))`` where ``df`` is ``df_tr`` followed
        by ``df_tst`` and each mask is a boolean array of ``len(df)`` marking
        which positions came from the respective input.
    """
    df = pd.concat([df_tr, df_tst])
    # Masks are positional rather than index-based; the original author's note
    # suggests this was meant to avoid index mix-ups after concatenation.
    train_mask = np.concatenate(
        (np.full(len(df_tr), True), np.full(len(df_tst), False))
    )
    test_mask = ~train_mask
    mask = (train_mask, test_mask)
    return df, mask
def evaluation(y, yhat):
    """Score predictions with a fixed set of scikit-learn classification metrics.

    Args:
        y: True binary labels.
        yhat: Predicted labels, passed unchanged to every metric.

    Returns:
        A one-row DataFrame with one column per metric, named after the metric
        function, values rounded to 6 decimals.
    """
    # NOTE(review): only accuracy_score and roc_auc_score survive in the
    # garbled listing; the three middle entries are restored from the result
    # tables printed later in this file, which show all five columns — confirm.
    # NOTE(review): roc_auc_score receives the same `yhat` as the other
    # metrics; if that holds hard class labels rather than scores, the AUC is
    # computed from a single operating point.
    metric_fns = [
        sklearn.metrics.accuracy_score,
        sklearn.metrics.precision_score,
        sklearn.metrics.recall_score,
        sklearn.metrics.f1_score,
        sklearn.metrics.roc_auc_score,
    ]
    # round(float(...)) works whether the metric returns a NumPy scalar or a
    # plain Python float (which has no .round method).
    return pd.DataFrame(
        {fn.__name__: [round(float(fn(y, yhat)), 6)] for fn in metric_fns}
    )
def compute_time_difference(group):
    """Return absolute time gaps, in seconds, for every ordered pair of rows.

    Args:
        group: DataFrame with a ``trans_date_trans_time`` column whose values
            support subtraction yielding an object with ``total_seconds()``.

    Returns:
        A list of ``[label_i, label_j, seconds]`` entries, one per ordered
        pair ``(i, j)`` of rows including ``i == j``, where the labels are the
        rows' index labels. Empty for an empty group.
    """
    # Pull labels and timestamps out once instead of building a row Series
    # with ``group.iloc[...]`` for every one of the n*n pairs.
    labels = group.index.tolist()
    times = group['trans_date_trans_time'].tolist()

    result = []
    for label_i, time_i in zip(labels, times):
        for label_j, time_j in zip(labels, times):
            time_difference = abs((time_i - time_j).total_seconds())
            result.append([label_i, label_j, time_difference])
    return result
def edge_index_save(df, unique_col, theta, gamma):
    """Build a time-decay edge index per group and save the raw pair table to disk.

    Rows of ``df`` are grouped by ``unique_col``; within each group every
    ordered row pair gets the weight ``exp(-|dt| / theta)`` where ``dt`` is
    the time gap in seconds. Pairs whose weight equals exactly 1 (zero time
    gap, which includes each row paired with itself) are zeroed. Pairs with
    weight greater than ``gamma`` become edges.

    The unweighted ``[i, j, seconds]`` table is written with ``np.save`` to
    the first unused ``edge_index_attempt{k}_{column}.npy`` in the working
    directory before the weights are computed.

    Args:
        df: DataFrame with a ``trans_date_trans_time`` column; index labels
            must be convertible to float.
        unique_col: Column name (or list of names) to group by.
        theta: Decay scale in seconds.
        gamma: Weight threshold; only pairs strictly above it are kept.

    Returns:
        A ``torch.long`` tensor of shape ``(2, num_edges)``.
    """
    # Imported here because this file has no module-level torch import.
    import torch

    pair_rows = [
        item
        for _, group in df.groupby(unique_col)
        for item in compute_time_difference(group)
    ]
    # reshape keeps the (n, 3) layout even when there are no pairs at all.
    pairs = np.array(pair_rows, dtype=np.float64).reshape(-1, 3)

    # The garbled listing read the counter from `self.save_attempt`, but this
    # is a free function with no `self`; a local counter finds the first free
    # file name instead. NOTE(review): starting at 0 is an assumption.
    column_tag = str(unique_col).replace(' ', '').replace('_', '')
    attempt = 0
    filename = f"edge_index_attempt{attempt}_{column_tag}.npy"
    while os.path.exists(filename):
        attempt += 1
        filename = f"edge_index_attempt{attempt}_{column_tag}.npy"
    np.save(filename, pairs)

    weights = np.exp(-pairs[:, 2] / theta)
    weights[weights == 1] = 0.0  # drop zero-gap pairs such as self-pairs
    endpoints = pairs[weights > gamma, :2].astype(np.int64)
    return torch.tensor(endpoints, dtype=torch.long).t()
def edge_index(df, unique_col, theta, gamma):
    """Build a time-decay edge index per group.

    Rows of ``df`` are grouped by ``unique_col``; within each group every
    ordered row pair gets the weight ``exp(-|dt| / theta)`` where ``dt`` is
    the time gap in seconds. Pairs whose weight equals exactly 1 (zero time
    gap, which includes each row paired with itself) are zeroed. Pairs with
    weight greater than ``gamma`` become edges.

    Args:
        df: DataFrame with a ``trans_date_trans_time`` column; index labels
            must be convertible to float.
        unique_col: Column name (or list of names) to group by.
        theta: Decay scale in seconds.
        gamma: Weight threshold; only pairs strictly above it are kept.

    Returns:
        A ``torch.long`` tensor of shape ``(2, num_edges)``.
    """
    # Imported here because this file has no module-level torch import.
    import torch

    pair_rows = [
        item
        for _, group in df.groupby(unique_col)
        for item in compute_time_difference(group)
    ]
    # reshape keeps the (n, 3) layout even when there are no pairs at all.
    pairs = np.array(pair_rows, dtype=np.float64).reshape(-1, 3)

    weights = np.exp(-pairs[:, 2] / theta)
    weights[weights == 1] = 0.0  # drop zero-gap pairs such as self-pairs
    endpoints = pairs[weights > gamma, :2].astype(np.int64)
    return torch.tensor(endpoints, dtype=torch.long).t()
# Load the raw transaction table.
df = pd.read_csv("~/Desktop/fraudTrain.csv")
def build_graph_tripartite(df_input, graph_type=None):
    """Build a tripartite graph linking customers, merchants and transactions.

    Every row of ``df_input`` becomes a transaction node connected to its
    card-number node and to its merchant node. Both edges carry the row's
    ``is_fraud`` value as ``"label"`` and its ``amt`` value as ``"weight"``.

    Args:
        df_input: DataFrame with ``cc_num``, ``merchant``, ``is_fraud`` and
            ``amt`` columns. Its index labels identify the transactions.
        graph_type: Graph instance or class handed to NetworkX as
            ``create_using``. ``None`` (the default) creates a fresh
            ``nx.Graph`` on every call.

    Returns:
        The populated graph, with integer node ids.
    """
    # A default of `nx.Graph()` in the signature would be one shared instance
    # that NetworkX clears and refills on every call, silently wiping graphs
    # returned by earlier calls. Create the default per call instead.
    if graph_type is None:
        graph_type = nx.Graph()

    df = df_input
    # One id space for transaction labels, card numbers and merchants.
    # NOTE(review): a transaction index label equal to a cc_num or merchant
    # value would map to the same node — confirm the value ranges are disjoint.
    mapping = {
        x: node_id
        for node_id, x in enumerate(
            set(
                df.index.values.tolist()
                + df["cc_num"].values.tolist()
                + df["merchant"].values.tolist()
            )
        )
    }

    # Walk the rows once, collecting edges and attributes together.
    customer_edges = []
    merchant_edges = []
    labels = {}
    weights = {}
    for idx, cc_num, merchant, is_fraud, amt in zip(
        df.index, df["cc_num"], df["merchant"], df["is_fraud"], df["amt"]
    ):
        txn_node = mapping[idx]
        for edge_list, entity in ((customer_edges, cc_num), (merchant_edges, merchant)):
            edge = (mapping[entity], txn_node)
            edge_list.append(edge)
            labels[edge] = is_fraud
            weights[edge] = amt

    G = nx.from_edgelist(customer_edges + merchant_edges, create_using=graph_type)
    nx.set_edge_attributes(G, labels, "label")
    nx.set_edge_attributes(G, weights, "weight")
    return G
# Experiment: downsampled fraud rate 0.3 / test-split fraud rate 0.2
# Downsample so 30% of the rows are fraud, then draw a test split whose
# fraud share is 20%.
df = throw(df, 0.3)
df_tr, df_tst = split_dataframe(df, 0.2)
(14014, 23)
(6006, 23)
# Re-join the splits and build the customer/merchant/transaction graph.
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)
G_down.number_of_edges(), G_down.number_of_nodes()
(40040, 21656)
edges = G_down.edges
# NOTE(review): `sg` (stellargraph) is only imported further down in this
# file, so this cell relies on that import having been run first. It is also
# unclear whether StellarGraph accepts a NetworkX edge view here — confirm.
sg.StellarGraph(edges=edges, edge_type_column="label")
from sklearn.model_selection import train_test_split
# Split edge positions (and their fraud labels) into train and test sets.
# NOTE(review): the test_size value was lost in extraction; 0.20 is inferred
# from the 32032 / 8008 row counts printed further down — confirm.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)
np.array(train_labels).mean(), np.array(test_labels).mean()
(0.2995442057942058, 0.30182317682317683)
# Keep only the training edges, then add back the nodes that lost all their
# edges so the training graph has the same node set as the full graph.
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))
import stellargraph as sg
from stellargraph.mapper import GraphSAGENodeGenerator
from stellargraph.layer import GraphSAGE
from sklearn.model_selection import train_test_split
from tensorflow import keras
from sklearn.preprocessing import LabelEncoder
# Calling StellarGraph(...) directly on a NetworkX graph is deprecated (see
# the warning printed below); from_networkx is the named replacement.
graph = sg.StellarGraph.from_networkx(G_down)
/tmp/ipykernel_3324862/ DeprecationWarning: Constructing a StellarGraph directly from a NetworkX graph has been replaced by the `StellarGraph.from_networkx` function
graph = sg.StellarGraph(G_down)
from node2vec import Node2Vec
from node2vec.edges import HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder
# Generate weighted random walks on the training graph and fit the embedding.
node2vec_train = Node2Vec(train_graph, weight_key='weight')
# NOTE(review): the right-hand side of this assignment was lost in
# extraction; `.fit(window=10)` is an assumption, chosen because later cells
# read `model_train.wv` — confirm the original arguments.
model_train = node2vec_train.fit(window=10)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:44<00:00, 4.47s/it]
classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
# NOTE(review): each iteration overwrites the previous result, so only the
# embeddings from the last class in `classes` are kept after the loop.
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
    # Edge embedding = embedder applied to the two endpoint node ids.
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
(32032, 128)
array([[8.79964884e-03, 1.64067835e-01, 7.30881765e-02, ...,
1.81038718e-04, 1.96826290e-02, 6.05850630e-02],
[3.18445265e-04, 9.41526145e-02, 2.41454929e-01, ...,
5.79439476e-02, 9.22752500e-01, 2.49633682e-03],
[1.09851332e-02, 4.12013801e-03, 1.61135435e-01, ...,
1.24859456e-02, 2.32662242e-02, 1.07970215e-01],
[6.13867212e-03, 8.51532519e-02, 7.86146245e-07, ...,
6.33678436e-02, 7.13208392e-02, 2.39914820e-01],
[2.59715295e-03, 1.24797074e-03, 1.26128927e-01, ...,
7.45704547e-02, 2.37582775e-04, 2.18050033e-01],
[3.34992632e-02, 2.74142623e-03, 4.02400009e-02, ...,
2.08475683e-02, 4.62760888e-02, 3.03567946e-01]], dtype=float32)
# Build the feature table: one X_i column per embedding dimension.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])
# Join features and labels side by side.
# NOTE: this rebinds `df`, which previously held the transaction table.
df = pd.concat([df_data, df_labels], axis=1)
X_0 | X_1 | X_2 | X_3 | X_4 | X_5 | X_6 | X_7 | X_8 | X_9 | ... | X_119 | X_120 | X_121 | X_122 | X_123 | X_124 | X_125 | X_126 | X_127 | label | |
0 | 8.799649e-03 | 0.164068 | 7.308818e-02 | 0.007493 | 0.000196 | 0.037998 | 0.026417 | 0.030260 | 0.045050 | 0.047880 | ... | 0.001140 | 0.082948 | 0.018060 | 0.041995 | 0.073492 | 0.011187 | 1.810387e-04 | 0.019683 | 0.060585 | 1 |
1 | 3.184453e-04 | 0.094153 | 2.414549e-01 | 0.016275 | 0.078201 | 0.015268 | 0.000326 | 0.331411 | 0.067376 | 0.061926 | ... | 0.223335 | 0.000269 | 0.041893 | 0.051856 | 0.191570 | 0.000165 | 5.794395e-02 | 0.922752 | 0.002496 | 0 |
2 | 1.098513e-02 | 0.004120 | 1.611354e-01 | 0.548673 | 0.025099 | 0.001323 | 0.005411 | 0.005461 | 0.021607 | 0.094634 | ... | 0.070138 | 0.038877 | 0.014588 | 0.003246 | 0.205133 | 0.064290 | 1.248595e-02 | 0.023266 | 0.107970 | 0 |
3 | 5.125533e-04 | 0.069008 | 2.394260e-02 | 0.038580 | 0.000345 | 0.058848 | 0.005284 | 0.168507 | 0.327026 | 0.002053 | ... | 0.001018 | 0.037014 | 0.015680 | 0.193011 | 0.000812 | 0.001481 | 4.044086e-02 | 0.065931 | 0.131638 | 0 |
4 | 2.551593e-01 | 0.011703 | 4.802953e-02 | 0.020095 | 0.177149 | 0.022541 | 0.196618 | 0.041765 | 0.000156 | 0.158181 | ... | 0.233692 | 0.104250 | 0.091681 | 0.002522 | 0.874027 | 0.003707 | 3.469664e-05 | 0.015489 | 0.005535 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32027 | 1.361575e+00 | 0.023050 | 1.305244e-01 | 0.504093 | 0.014287 | 0.312939 | 0.000587 | 0.015945 | 0.049380 | 0.332058 | ... | 0.002336 | 0.000683 | 0.191192 | 0.356860 | 0.227310 | 0.000661 | 1.307885e-01 | 0.023592 | 0.041412 | 0 |
32028 | 9.765987e-07 | 0.037045 | 1.420536e-02 | 0.026103 | 0.000017 | 0.116524 | 0.065952 | 0.054716 | 0.154811 | 0.011176 | ... | 0.130505 | 0.006745 | 0.039672 | 0.052033 | 0.125356 | 0.015103 | 2.027648e-08 | 0.016023 | 0.169818 | 1 |
32029 | 6.138672e-03 | 0.085153 | 7.861462e-07 | 0.099862 | 0.408811 | 0.021244 | 0.006153 | 0.115799 | 0.000933 | 0.317840 | ... | 0.177379 | 0.565970 | 0.022457 | 0.043790 | 0.059339 | 0.257367 | 6.336784e-02 | 0.071321 | 0.239915 | 0 |
32030 | 2.597153e-03 | 0.001248 | 1.261289e-01 | 0.000002 | 0.001143 | 0.248826 | 0.006462 | 0.016343 | 0.015120 | 0.108578 | ... | 0.000021 | 0.000859 | 0.000894 | 0.496311 | 0.065650 | 0.007386 | 7.457045e-02 | 0.000238 | 0.218050 | 0 |
32031 | 3.349926e-02 | 0.002741 | 4.024000e-02 | 0.003076 | 0.000032 | 0.000168 | 0.065571 | 0.012371 | 0.112705 | 0.023244 | ... | 0.138493 | 0.010161 | 0.032122 | 0.063049 | 0.028025 | 0.001975 | 2.084757e-02 | 0.046276 | 0.303568 | 1 |
32032 rows × 129 columns
label = np.array(train_labels)
predictr = TabularPredictor(label='label')
# NOTE(review): the fit call was lost in extraction; it is restored because
# the AutoGluon training log follows directly — confirm the arguments.
predictr.fit(df)
No path specified. Models will be saved in: "AutogluonModels/ag-20240124_110704/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_110704/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 623.47 GB / 982.82 GB (63.4%)
Train Data Rows: 32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [1, 0]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 25264.96 MB
Train Data (Original) Memory Usage: 16.4 MB (0.1% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
0.4s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 16.4 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.39s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.07804695304695304, Train Rows: 29532, Val Rows: 2500
User-specified model hyperparameters to be fit:
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd7032fe50>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5904 = Validation score (accuracy)
0.07s = Training runtime
0.18s = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd7032fe50>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5904 = Validation score (accuracy)
0.07s = Training runtime
0.08s = Validation runtime
Fitting model: LightGBMXT ...
0.726 = Validation score (accuracy)
1.07s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBM ...
0.7248 = Validation score (accuracy)
0.94s = Training runtime
0.01s = Validation runtime
Fitting model: RandomForestGini ...
0.7236 = Validation score (accuracy)
6.19s = Training runtime
0.04s = Validation runtime
Fitting model: RandomForestEntr ...
0.7248 = Validation score (accuracy)
8.16s = Training runtime
0.04s = Validation runtime
Fitting model: CatBoost ...
0.7268 = Validation score (accuracy)
1.01s = Training runtime
0.01s = Validation runtime
Fitting model: ExtraTreesGini ...
0.7288 = Validation score (accuracy)
1.02s = Training runtime
0.05s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.726 = Validation score (accuracy)
1.06s = Training runtime
0.04s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 5: early stopping
0.742 = Validation score (accuracy)
14.83s = Training runtime
0.03s = Validation runtime
Fitting model: XGBoost ...
0.7244 = Validation score (accuracy)
1.27s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.7432 = Validation score (accuracy)
11.94s = Training runtime
0.11s = Validation runtime
Fitting model: LightGBMLarge ...
0.7268 = Validation score (accuracy)
1.99s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.7532 = Validation score (accuracy)
0.8s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 52.58s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240124_110704/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcd0b85c640>
test = np.array(test_embeddings)
(8008, 128)
columns = [f'X_{i}' for i in range(test.shape[1])]
# Build the test feature table with the same X_i column names as training.
test_df = pd.DataFrame(data=test, columns=columns)
# Inspect the DataFrame.
X_0 X_1 X_2 X_3 X_4 X_5 X_6 \
0 0.413663 2.075492 0.097850 0.431519 0.026412 0.046006 0.139835
1 0.012225 0.000547 0.024713 0.703247 0.020913 0.419119 0.352671
2 0.195774 0.000752 0.009002 0.204248 0.070720 0.906959 0.507191
3 0.260361 0.329479 0.229454 0.023667 0.001113 0.002806 0.034591
4 0.427026 0.024197 0.695513 0.057896 0.270117 0.026265 0.320806
X_7 X_8 X_9 ... X_118 X_119 X_120 X_121 \
0 0.014063 0.473687 0.014283 ... 0.048837 0.010120 0.005523 0.961210
1 1.669539 0.126232 0.027797 ... 0.728475 0.378981 1.377197 0.203648
2 0.267592 0.115088 0.409537 ... 0.295417 0.440117 0.088578 0.060138
3 0.045546 0.623125 0.282554 ... 0.174775 0.159482 0.001048 0.035379
4 0.029051 0.263759 0.028128 ... 0.008239 0.063498 0.810952 0.013463
X_122 X_123 X_124 X_125 X_126 X_127
0 0.026908 0.000516 0.054398 0.188361 0.007897 0.495728
1 0.086590 0.230660 0.000614 1.211448 0.513626 0.000002
2 0.004575 1.479674 0.131626 0.928811 0.315709 0.000010
3 0.640979 0.329148 0.003226 0.278314 0.002369 0.150809
4 0.295149 0.006146 0.608962 0.076195 0.106696 0.122096
[5 rows x 128 columns]
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd8f12cb80>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
# True test labels and the predictor's class predictions for the test edges.
y = np.array(test_labels)
yhat = predictr.predict(test_df)
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd0b8c03a0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
# sklearn
import sklearn
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
0 | 0.690684 | 0.264249 | 0.021375 | 0.03955 | 0.498058 |
# Experiment: downsampled fraud rate 0.3 / test-split fraud rate 0.3
# `df` was rebound to the embedding feature table earlier in this file and no
# longer has an `is_fraud` column, so reload the raw transactions first (the
# 0.3 / 0.4 run below does the same).
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.3)
df_tr, df_tst = split_dataframe(df, 0.3)
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)

# NOTE(review): test_size and random_state were lost in extraction; the
# values are taken from the first run and match the 32032 training rows
# reported in the log below — confirm.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)

# Training graph: training edges only, with every node of the full graph.
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

node2vec_train = Node2Vec(train_graph, weight_key='weight')
# NOTE(review): the fit arguments were lost in extraction; window=10 is an
# assumption — confirm.
model_train = node2vec_train.fit(window=10)

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
# NOTE(review): only the last embedder's output survives this loop.
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Build the feature table: one X_i column per embedding dimension.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])
# Join features and labels side by side.
df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:46<00:00, 4.61s/it]
predictr = TabularPredictor(label='label')
# NOTE(review): the fit call was lost in extraction; it is restored because
# the AutoGluon training log follows directly — confirm the arguments.
predictr.fit(df)
No path specified. Models will be saved in: "AutogluonModels/ag-20240124_112803/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_112803/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 622.65 GB / 982.82 GB (63.4%)
Train Data Rows: 32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [0, 1]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 24167.01 MB
Train Data (Original) Memory Usage: 16.4 MB (0.1% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
0.4s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 16.4 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.38s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.07804695304695304, Train Rows: 29532, Val Rows: 2500
User-specified model hyperparameters to be fit:
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd0b8c00d0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.594 = Validation score (accuracy)
0.07s = Training runtime
0.46s = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd0b8c00d0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5956 = Validation score (accuracy)
0.09s = Training runtime
0.49s = Validation runtime
Fitting model: LightGBMXT ...
0.7392 = Validation score (accuracy)
1.03s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBM ...
0.74 = Validation score (accuracy)
1.08s = Training runtime
0.01s = Validation runtime
Fitting model: RandomForestGini ...
0.7304 = Validation score (accuracy)
6.74s = Training runtime
0.05s = Validation runtime
Fitting model: RandomForestEntr ...
0.7268 = Validation score (accuracy)
8.77s = Training runtime
0.05s = Validation runtime
Fitting model: CatBoost ...
0.7396 = Validation score (accuracy)
1.06s = Training runtime
0.01s = Validation runtime
Fitting model: ExtraTreesGini ...
0.7328 = Validation score (accuracy)
1.12s = Training runtime
0.05s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.7312 = Validation score (accuracy)
1.18s = Training runtime
0.05s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 4: early stopping
0.758 = Validation score (accuracy)
14.33s = Training runtime
0.02s = Validation runtime
Fitting model: XGBoost ...
0.7376 = Validation score (accuracy)
1.16s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.7464 = Validation score (accuracy)
10.13s = Training runtime
0.09s = Validation runtime
Fitting model: LightGBMLarge ...
0.738 = Validation score (accuracy)
2.1s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.7708 = Validation score (accuracy)
0.8s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 52.42s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240124_112803/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcd8f0e0190>
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]
# Build the test feature table with the same X_i column names as training.
test_df = pd.DataFrame(data=test, columns=columns)
y = np.array(test_labels)
yhat = predictr.predict(test_df)
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fccf01554c0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/ UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
0 | 0.69493 | 0.0 | 0.0 | 0.0 | 0.5 |
# Experiment: downsampled fraud rate 0.3 / test-split fraud rate 0.4
# Reload the raw transactions and rebuild the whole pipeline for this run.
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.3)
df_tr, df_tst = split_dataframe(df, 0.4)
df_, mask = concat(df_tr, df_tst)
G_down = build_graph_tripartite(df_)

# NOTE(review): test_size and the random_state value were lost in extraction;
# they are taken from the first run and match the 32032 training rows
# reported in the log below — confirm.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,
    random_state=42,
)

# Training graph: training edges only, with every node of the full graph.
edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

node2vec_train = Node2Vec(train_graph, weight_key='weight')
# NOTE(review): the fit arguments were lost in extraction; window=10 is an
# assumption — confirm.
model_train = node2vec_train.fit(window=10)

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
# NOTE(review): only the last embedder's output survives this loop.
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]

# Build the training feature table: one X_i column per embedding dimension.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])
df = pd.concat([df_data, df_labels], axis=1)
label = np.array(train_labels)

predictr = TabularPredictor(label='label')
# NOTE(review): the fit call was lost in extraction; it is restored because
# the AutoGluon training log follows directly — confirm the arguments.
predictr.fit(df)

# Build the test feature table and predict.
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]
test_df = pd.DataFrame(data=test, columns=columns)
y = np.array(test_labels)
yhat = predictr.predict(test_df)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:48<00:00, 4.86s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240124_113911/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240124_113911/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 621.83 GB / 982.82 GB (63.3%)
Train Data Rows: 32032
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [0, 1]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 23867.75 MB
Train Data (Original) Memory Usage: 16.4 MB (0.1% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
0.3s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 16.4 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.36s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.07804695304695304, Train Rows: 29532, Val Rows: 2500
User-specified model hyperparameters to be fit:
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd0b85e940>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5988 = Validation score (accuracy)
0.07s = Training runtime
0.4s = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd0b85e8b0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5992 = Validation score (accuracy)
0.08s = Training runtime
0.47s = Validation runtime
Fitting model: LightGBMXT ...
0.738 = Validation score (accuracy)
0.99s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBM ...
0.738 = Validation score (accuracy)
1.0s = Training runtime
0.01s = Validation runtime
Fitting model: RandomForestGini ...
0.7324 = Validation score (accuracy)
6.23s = Training runtime
0.04s = Validation runtime
Fitting model: RandomForestEntr ...
0.7292 = Validation score (accuracy)
8.22s = Training runtime
0.04s = Validation runtime
Fitting model: CatBoost ...
0.7364 = Validation score (accuracy)
1.0s = Training runtime
0.01s = Validation runtime
Fitting model: ExtraTreesGini ...
0.7372 = Validation score (accuracy)
1.07s = Training runtime
0.05s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.7324 = Validation score (accuracy)
1.11s = Training runtime
0.05s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 8: early stopping
0.756 = Validation score (accuracy)
15.57s = Training runtime
0.02s = Validation runtime
Fitting model: XGBoost ...
0.7352 = Validation score (accuracy)
1.1s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.7552 = Validation score (accuracy)
10.67s = Training runtime
0.09s = Validation runtime
Fitting model: LightGBMLarge ...
0.738 = Validation score (accuracy)
2.21s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.7688 = Validation score (accuracy)
0.84s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 52.62s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240124_113911/")
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd0b85e790>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
0 | 0.688686 | 0.350318 | 0.045852 | 0.081091 | 0.504741 |
0.4 / 0.4
= pd.read_csv("~/Desktop/fraudTrain.csv") df
= throw(df, 0.4)
df = split_dataframe(df, 0.4)
df_tr, df_tst = concat(df_tr, df_tst)
df_, mask = build_graph_tripartite(df_)
G_down = train_test_split(list(range(len(G_down.edges))),
train_edges, test_edges, train_labels, test_labels list(nx.get_edge_attributes(G_down, "label").values()),
random_state= list(G_down.edges)
edgs = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph list(set(G_down.nodes) - set(train_graph.nodes)))
= Node2Vec(train_graph, weight_key='weight')
node2vec_train =
model_train = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
classes for cl in classes:
= cl(keyed_vectors=model_train.wv)
= [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
# DataFrame 생성
= [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
columns = pd.DataFrame(data=train_embeddings, columns=columns)
df_data = pd.DataFrame(data=train_labels, columns=['label'])
= pd.concat([df_data, df_labels], axis=1)
df = np.array(train_labels)
= TabularPredictor(label='label')
= np.array(test_embeddings)
= [f'X_{i}' for i in range(test.shape[1])]
# DataFrame 생성
= pd.DataFrame(data=test, columns=columns)
= np.array(test_labels)
y = predictr.predict(test_df)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:32<00:00, 3.23s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_002023/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_002023/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 620.00 GB / 982.82 GB (63.1%)
Train Data Rows: 24024
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [1, 0]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 24334.27 MB
Train Data (Original) Memory Usage: 12.3 MB (0.1% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
0.7s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 12.3 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.7s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 21621, Val Rows: 2403
User-specified model hyperparameters to be fit:
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd0b85e1f0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5568 = Validation score (accuracy)
0.06s = Training runtime
0.38s = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fccf82ed550>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5576 = Validation score (accuracy)
0.07s = Training runtime
0.45s = Validation runtime
Fitting model: LightGBMXT ...
0.6746 = Validation score (accuracy)
1.38s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBM ...
0.6767 = Validation score (accuracy)
1.73s = Training runtime
0.01s = Validation runtime
Fitting model: RandomForestGini ...
0.6479 = Validation score (accuracy)
4.39s = Training runtime
0.04s = Validation runtime
Fitting model: RandomForestEntr ...
0.6583 = Validation score (accuracy)
6.06s = Training runtime
0.04s = Validation runtime
Fitting model: CatBoost ...
0.6804 = Validation score (accuracy)
6.64s = Training runtime
0.01s = Validation runtime
Fitting model: ExtraTreesGini ...
0.6558 = Validation score (accuracy)
0.8s = Training runtime
0.04s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.6533 = Validation score (accuracy)
0.86s = Training runtime
0.04s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 7: early stopping
0.7012 = Validation score (accuracy)
11.36s = Training runtime
0.02s = Validation runtime
Fitting model: XGBoost ...
0.6866 = Validation score (accuracy)
2.02s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.707 = Validation score (accuracy)
7.38s = Training runtime
0.09s = Validation runtime
Fitting model: LightGBMLarge ...
0.6758 = Validation score (accuracy)
3.62s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.7179 = Validation score (accuracy)
0.82s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 49.84s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_002023/")
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd700e30d0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
0 | 0.599234 | 0.0 | 0.0 | 0.0 | 0.499584 |
0.4 / 0.3
= pd.read_csv("~/Desktop/fraudTrain.csv") df
= throw(df, 0.4)
df = split_dataframe(df, 0.3)
df_tr, df_tst = concat(df_tr, df_tst)
df_, mask = build_graph_tripartite(df_)
G_down = train_test_split(list(range(len(G_down.edges))),
train_edges, test_edges, train_labels, test_labels list(nx.get_edge_attributes(G_down, "label").values()),
random_state= list(G_down.edges)
edgs = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph list(set(G_down.nodes) - set(train_graph.nodes)))
= Node2Vec(train_graph, weight_key='weight')
node2vec_train =
model_train = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
classes for cl in classes:
= cl(keyed_vectors=model_train.wv)
= [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
# DataFrame 생성
= [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
columns = pd.DataFrame(data=train_embeddings, columns=columns)
df_data = pd.DataFrame(data=train_labels, columns=['label'])
= pd.concat([df_data, df_labels], axis=1)
df = np.array(train_labels)
= TabularPredictor(label='label')
= np.array(test_embeddings)
= [f'X_{i}' for i in range(test.shape[1])]
# DataFrame 생성
= pd.DataFrame(data=test, columns=columns)
= np.array(test_labels)
y = predictr.predict(test_df)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:32<00:00, 3.28s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_002828/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_002828/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 619.33 GB / 982.82 GB (63.0%)
Train Data Rows: 24024
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [1, 0]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 24266.65 MB
Train Data (Original) Memory Usage: 12.3 MB (0.1% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
0.4s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 12.3 MB (0.1% of available memory)
Data preprocessing and feature engineering runtime = 0.42s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 21621, Val Rows: 2403
User-specified model hyperparameters to be fit:
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcb208ff4c0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5389 = Validation score (accuracy)
0.06s = Training runtime
0.39s = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcb208ff4c0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5397 = Validation score (accuracy)
0.07s = Training runtime
0.45s = Validation runtime
Fitting model: LightGBMXT ...
0.6833 = Validation score (accuracy)
1.88s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBM ...
0.6758 = Validation score (accuracy)
1.16s = Training runtime
0.01s = Validation runtime
Fitting model: RandomForestGini ...
0.6629 = Validation score (accuracy)
4.52s = Training runtime
0.04s = Validation runtime
Fitting model: RandomForestEntr ...
0.6633 = Validation score (accuracy)
6.25s = Training runtime
0.04s = Validation runtime
Fitting model: CatBoost ...
0.6733 = Validation score (accuracy)
2.82s = Training runtime
0.01s = Validation runtime
Fitting model: ExtraTreesGini ...
0.6604 = Validation score (accuracy)
0.81s = Training runtime
0.04s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.6579 = Validation score (accuracy)
0.85s = Training runtime
0.04s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 8: early stopping
0.6991 = Validation score (accuracy)
11.81s = Training runtime
0.02s = Validation runtime
Fitting model: XGBoost ...
0.6779 = Validation score (accuracy)
2.21s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.6937 = Validation score (accuracy)
9.84s = Training runtime
0.11s = Validation runtime
Fitting model: LightGBMLarge ...
0.6821 = Validation score (accuracy)
4.86s = Training runtime
0.02s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.7116 = Validation score (accuracy)
0.84s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 50.43s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_002828/")
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd700e3af0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
0 | 0.594406 | 0.464115 | 0.040066 | 0.073764 | 0.504412 |
0.5 / 0.3
= pd.read_csv("~/Desktop/fraudTrain.csv") df
= throw(df, 0.5)
df = split_dataframe(df, 0.3)
df_tr, df_tst = concat(df_tr, df_tst)
df_, mask = build_graph_tripartite(df_)
G_down = train_test_split(list(range(len(G_down.edges))),
train_edges, test_edges, train_labels, test_labels list(nx.get_edge_attributes(G_down, "label").values()),
random_state= list(G_down.edges)
edgs = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
train_graph list(set(G_down.nodes) - set(train_graph.nodes)))
= Node2Vec(train_graph, weight_key='weight')
node2vec_train =
model_train = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
classes for cl in classes:
= cl(keyed_vectors=model_train.wv)
= [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
# DataFrame 생성
= [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
columns = pd.DataFrame(data=train_embeddings, columns=columns)
df_data = pd.DataFrame(data=train_labels, columns=['label'])
= pd.concat([df_data, df_labels], axis=1)
df = np.array(train_labels)
= TabularPredictor(label='label')
= np.array(test_embeddings)
= [f'X_{i}' for i in range(test.shape[1])]
# DataFrame 생성
= pd.DataFrame(data=test, columns=columns)
= np.array(test_labels)
y = predictr.predict(test_df)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00, 2.54s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_004947/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_004947/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 617.43 GB / 982.82 GB (62.8%)
Train Data Rows: 19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [0, 1]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 24477.85 MB
Train Data (Original) Memory Usage: 9.84 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
0.7s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 9.84 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.69s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 17297, Val Rows: 1922
User-specified model hyperparameters to be fit:
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd8f2fa700>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5484 = Validation score (accuracy)
0.05s = Training runtime
0.3s = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd8f2fa700>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5484 = Validation score (accuracy)
0.07s = Training runtime
0.36s = Validation runtime
Fitting model: LightGBMXT ...
0.6556 = Validation score (accuracy)
1.15s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBM ...
0.6493 = Validation score (accuracy)
1.14s = Training runtime
0.01s = Validation runtime
Fitting model: RandomForestGini ...
0.6379 = Validation score (accuracy)
3.71s = Training runtime
0.04s = Validation runtime
Fitting model: RandomForestEntr ...
0.6415 = Validation score (accuracy)
5.18s = Training runtime
0.04s = Validation runtime
Fitting model: CatBoost ...
0.6545 = Validation score (accuracy)
2.32s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesGini ...
0.6332 = Validation score (accuracy)
0.71s = Training runtime
0.04s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.6498 = Validation score (accuracy)
0.73s = Training runtime
0.04s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 4: early stopping
0.6597 = Validation score (accuracy)
9.97s = Training runtime
0.02s = Validation runtime
Fitting model: XGBoost ...
0.629 = Validation score (accuracy)
1.53s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.6514 = Validation score (accuracy)
5.69s = Training runtime
0.08s = Validation runtime
Fitting model: LightGBMLarge ...
0.6472 = Validation score (accuracy)
3.91s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.6769 = Validation score (accuracy)
0.75s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 39.24s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_004947/")
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fccf0308310>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
0 | 0.508637 | 0.470588 | 0.00339 | 0.006731 | 0.499854 |
0.5 / 0.4
# Experiment "0.5 / 0.4": throw() is called with fraud rate 0.5 and
# split_dataframe() with test fraud rate 0.4.
#
# NOTE(review): this cell was restored from an export that displaced every
# assignment target onto the following line and dropped several tokens.
# Values marked "assumed" below were not visible in the damaged text and
# must be confirmed against the original notebook.

# Load the raw transactions and resample them with throw().
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.5)

# Split into train/test frames, then stack them back together with masks.
df_tr, df_tst = split_dataframe(df, 0.4)
df_, mask = concat(df_tr, df_tst)

# Build the graph whose edges carry the "label" attribute used below.
G_down = build_graph_tripartite(df_)

# Split edge *indices* (not edges) so labels stay aligned with `edgs`.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,   # assumed -- value missing from the damaged export
    random_state=42,  # assumed -- only `random_state=` survived
)

edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
# Re-add nodes that lost all their edges to the test split, so that every
# node of G_down is present in the training graph.
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)  # assumed -- right-hand side was lost

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
# NOTE(review): the recorded output shows a single AutoGluon run per cell, so
# the code below is presumably outside the loop. Each iteration overwrites
# the previous embeddings, which means only the last embedder in `classes`
# feeds the classifier -- confirm this is intended.

# Build the training DataFrame: one X_i column per embedding dimension.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])
df = pd.concat([df_data, df_labels], axis=1)
# NOTE(review): the export also contained `<name> = np.array(train_labels)`
# here with its target lost; nothing visible reads it, so it is omitted.

predictr = TabularPredictor(label='label')
predictr.fit(df)  # assumed -- call was lost, but the recorded log shows a fit

# Build the test DataFrame with the same column naming scheme.
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

# Presumably the cell ended by displaying the metrics table; as a bare
# expression this only renders when run as a notebook cell.
evaluation(y, yhat)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00, 2.57s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_004313/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_004313/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 617.99 GB / 982.82 GB (62.9%)
Train Data Rows: 19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [0, 1]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 24485.97 MB
Train Data (Original) Memory Usage: 9.84 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
0.7s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 9.84 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.68s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 17297, Val Rows: 1922
User-specified model hyperparameters to be fit:
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fccf87d4790>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5375 = Validation score (accuracy)
0.05s = Training runtime
0.3s = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fccf87d4e50>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5375 = Validation score (accuracy)
0.05s = Training runtime
0.38s = Validation runtime
Fitting model: LightGBMXT ...
0.6774 = Validation score (accuracy)
0.98s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBM ...
0.667 = Validation score (accuracy)
1.03s = Training runtime
0.0s = Validation runtime
Fitting model: RandomForestGini ...
0.6592 = Validation score (accuracy)
3.68s = Training runtime
0.04s = Validation runtime
Fitting model: RandomForestEntr ...
0.6623 = Validation score (accuracy)
5.2s = Training runtime
0.04s = Validation runtime
Fitting model: CatBoost ...
0.6727 = Validation score (accuracy)
3.06s = Training runtime
0.01s = Validation runtime
Fitting model: ExtraTreesGini ...
0.6649 = Validation score (accuracy)
0.7s = Training runtime
0.04s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.6686 = Validation score (accuracy)
0.74s = Training runtime
0.04s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 5: early stopping
0.6649 = Validation score (accuracy)
10.27s = Training runtime
0.02s = Validation runtime
Fitting model: XGBoost ...
0.6582 = Validation score (accuracy)
1.74s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.6727 = Validation score (accuracy)
5.93s = Training runtime
0.08s = Validation runtime
Fitting model: LightGBMLarge ...
0.6681 = Validation score (accuracy)
3.79s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.6915 = Validation score (accuracy)
0.74s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 40.32s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_004313/")
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcd8f2579d0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/sklearn/metrics/ UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
0 | 0.501977 | 0.0 | 0.0 | 0.0 | 0.5 |
0.5 / 0.2
# Experiment "0.5 / 0.2": throw() is called with fraud rate 0.5 and
# split_dataframe() with test fraud rate 0.2.
#
# NOTE(review): this cell was restored from an export that displaced every
# assignment target onto the following line and dropped several tokens.
# Values marked "assumed" below were not visible in the damaged text and
# must be confirmed against the original notebook.

# Load the raw transactions and resample them with throw().
df = pd.read_csv("~/Desktop/fraudTrain.csv")
df = throw(df, 0.5)

# Split into train/test frames, then stack them back together with masks.
df_tr, df_tst = split_dataframe(df, 0.2)
df_, mask = concat(df_tr, df_tst)

# Build the graph whose edges carry the "label" attribute used below.
G_down = build_graph_tripartite(df_)

# Split edge *indices* (not edges) so labels stay aligned with `edgs`.
train_edges, test_edges, train_labels, test_labels = train_test_split(
    list(range(len(G_down.edges))),
    list(nx.get_edge_attributes(G_down, "label").values()),
    test_size=0.20,   # assumed -- value missing from the damaged export
    random_state=42,  # assumed -- only `random_state=` survived
)

edgs = list(G_down.edges)
train_graph = G_down.edge_subgraph([edgs[x] for x in train_edges]).copy()
# Re-add nodes that lost all their edges to the test split, so that every
# node of G_down is present in the training graph.
train_graph.add_nodes_from(list(set(G_down.nodes) - set(train_graph.nodes)))

node2vec_train = Node2Vec(train_graph, weight_key='weight')
model_train = node2vec_train.fit(window=10)  # assumed -- right-hand side was lost

classes = [HadamardEmbedder, AverageEmbedder, WeightedL1Embedder, WeightedL2Embedder]
for cl in classes:
    embeddings_train = cl(keyed_vectors=model_train.wv)
    train_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in train_edges]
    test_embeddings = [embeddings_train[str(edgs[x][0]), str(edgs[x][1])] for x in test_edges]
# NOTE(review): the recorded output shows a single AutoGluon run per cell, so
# the code below is presumably outside the loop. Each iteration overwrites
# the previous embeddings, which means only the last embedder in `classes`
# feeds the classifier -- confirm this is intended.

# Build the training DataFrame: one X_i column per embedding dimension.
columns = [f'X_{i}' for i in range(np.array(train_embeddings).shape[1])]
df_data = pd.DataFrame(data=train_embeddings, columns=columns)
df_labels = pd.DataFrame(data=train_labels, columns=['label'])
df = pd.concat([df_data, df_labels], axis=1)
# NOTE(review): the export also contained `<name> = np.array(train_labels)`
# here with its target lost; nothing visible reads it, so it is omitted.

predictr = TabularPredictor(label='label')
predictr.fit(df)  # assumed -- call was lost, but the recorded log shows a fit

# Build the test DataFrame with the same column naming scheme.
test = np.array(test_embeddings)
columns = [f'X_{i}' for i in range(test.shape[1])]
test_df = pd.DataFrame(data=test, columns=columns)

y = np.array(test_labels)
yhat = predictr.predict(test_df)

# Presumably the cell ended by displaying the metrics table; as a bare
# expression this only renders when run as a notebook cell.
evaluation(y, yhat)
Generating walks (CPU: 1): 100%|██████████| 10/10 [00:25<00:00, 2.54s/it]
No path specified. Models will be saved in: "AutogluonModels/ag-20240125_005617/"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20240125_005617/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #38~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Nov 2 18:01:13 UTC 2
Disk Space Avail: 616.86 GB / 982.82 GB (62.8%)
Train Data Rows: 19219
Train Data Columns: 128
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [0, 1]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 24455.01 MB
Train Data (Original) Memory Usage: 9.84 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 128 | ['X_0', 'X_1', 'X_2', 'X_3', 'X_4', ...]
0.6s = Fit runtime
128 features in original data used to generate 128 features in processed data.
Train Data (Processed) Memory Usage: 9.84 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.65s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 17297, Val Rows: 1922
User-specified model hyperparameters to be fit:
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcb208fbca0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5182 = Validation score (accuracy)
0.05s = Training runtime
0.3s = Validation runtime
Fitting model: KNeighborsDist ...
Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7fcb208fbca0>
Traceback (most recent call last):
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 400, in match_module_callback
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 606, in __init__
self.version = self.get_version()
File "/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
0.5193 = Validation score (accuracy)
0.07s = Training runtime
0.38s = Validation runtime
Fitting model: LightGBMXT ...
0.6348 = Validation score (accuracy)
0.97s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBM ...
0.6348 = Validation score (accuracy)
1.37s = Training runtime
0.01s = Validation runtime
Fitting model: RandomForestGini ...
0.6254 = Validation score (accuracy)
3.74s = Training runtime
0.04s = Validation runtime
Fitting model: RandomForestEntr ...
0.6337 = Validation score (accuracy)
5.22s = Training runtime
0.04s = Validation runtime
Fitting model: CatBoost ...
0.6462 = Validation score (accuracy)
3.83s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesGini ...
0.6384 = Validation score (accuracy)
0.69s = Training runtime
0.04s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.6446 = Validation score (accuracy)
0.74s = Training runtime
0.04s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 8: early stopping
0.6426 = Validation score (accuracy)
9.51s = Training runtime
0.02s = Validation runtime
Fitting model: XGBoost ...
0.6301 = Validation score (accuracy)
1.54s = Training runtime
0.01s = Validation runtime
Fitting model: NeuralNetTorch ...
0.6415 = Validation score (accuracy)
5.31s = Training runtime
0.08s = Validation runtime
Fitting model: LightGBMLarge ...
0.6379 = Validation score (accuracy)
2.72s = Training runtime
0.01s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.6634 = Validation score (accuracy)
0.73s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 38.72s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20240125_005617/")
accuracy_score | precision_score | recall_score | f1_score | roc_auc_score | |
0 | 0.510926 | 0.0 | 0.0 | 0.0 | 0.499796 |