imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import sklearn
import xgboost as xgb
# sklearn
from sklearn import model_selection # train_test_split
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv
# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
def down_sample_textbook(df):
    # undersample the majority (non-fraud) class down to the number of fraud rows
    df_majority = df[df.is_fraud == 0].copy()
    df_minority = df[df.is_fraud == 1].copy()
    df_maj_downsampled = sklearn.utils.resample(df_majority, n_samples=len(df_minority), replace=False, random_state=42)
    df_downsampled = pd.concat([df_minority, df_maj_downsampled])
    return df_downsampled
def compute_time_difference(group):
    # all-pairs absolute time differences (in nanoseconds) within one group of transactions
    n = len(group)
    result = []
    for i in range(n):
        for j in range(n):
            time_difference = abs(group.iloc[i].trans_date_trans_time.value - group.iloc[j].trans_date_trans_time.value)
            result.append([group.iloc[i].name, group.iloc[j].name, time_difference])
    return result
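compute_time_difference is defined here but never called in this section; it looks intended to build pairwise time-gap edges per card. A minimal sketch of one assumed call pattern (grouping by cc_num is our guess, and trans_date_trans_time must already be parsed to datetime):

```python
# Hypothetical usage sketch: collect all-pairs time gaps within each card's transactions.
# Assumes a dataframe (e.g. df50 built below) that still has the cc_num column.
edge_list = []
for _, group in df50.groupby("cc_num"):
    edge_list.extend(compute_time_difference(group))
```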
class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)   # 1 input feature -> 16 hidden channels
        self.conv2 = GCNConv(16, 2)   # 16 hidden channels -> 2 classes
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)
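The GCN class is likewise defined but never trained in this section. A minimal training-loop sketch under the usual torch_geometric conventions; the data object (node features x of shape [N, 1], edge_index, labels y, and a boolean train_mask) is assumed, not constructed above:

```python
# Sketch only: `data` is an assumed torch_geometric.data.Data object.
model = GCN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
model.train()
for epoch in range(200):
    optimizer.zero_grad()
    out = model(data)                 # log-probabilities, shape [N, 2]
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
```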
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
fraudTrain = fraudTrain.assign(trans_date_trans_time=pd.to_datetime(fraudTrain.trans_date_trans_time))
fraudTrain
 | trans_date_trans_time | cc_num | merchant | category | amt | first | last | gender | street | city | ... | lat | long | city_pop | job | dob | trans_num | unix_time | merch_lat | merch_long | is_fraud |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-01-01 00:00:00 | 2.703190e+15 | fraud_Rippin, Kub and Mann | misc_net | 4.97 | Jennifer | Banks | F | 561 Perry Cove | Moravian Falls | ... | 36.0788 | -81.1781 | 3495 | Psychologist, counselling | 1988-03-09 | 0b242abb623afc578575680df30655b9 | 1325376018 | 36.011293 | -82.048315 | 0 |
1 | 2019-01-01 00:00:00 | 6.304230e+11 | fraud_Heller, Gutmann and Zieme | grocery_pos | 107.23 | Stephanie | Gill | F | 43039 Riley Greens Suite 393 | Orient | ... | 48.8878 | -118.2105 | 149 | Special educational needs teacher | 1978-06-21 | 1f76529f8574734946361c461b024d99 | 1325376044 | 49.159047 | -118.186462 | 0 |
2 | 2019-01-01 00:00:00 | 3.885950e+13 | fraud_Lind-Buckridge | entertainment | 220.11 | Edward | Sanchez | M | 594 White Dale Suite 530 | Malad City | ... | 42.1808 | -112.2620 | 4154 | Nature conservation officer | 1962-01-19 | a1a22d70485983eac12b5b88dad1cf95 | 1325376051 | 43.150704 | -112.154481 | 0 |
3 | 2019-01-01 00:01:00 | 3.534090e+15 | fraud_Kutch, Hermiston and Farrell | gas_transport | 45.00 | Jeremy | White | M | 9443 Cynthia Court Apt. 038 | Boulder | ... | 46.2306 | -112.1138 | 1939 | Patent attorney | 1967-01-12 | 6b849c168bdad6f867558c3793159a81 | 1325376076 | 47.034331 | -112.561071 | 0 |
4 | 2019-01-01 00:03:00 | 3.755340e+14 | fraud_Keeling-Crist | misc_pos | 41.96 | Tyler | Garcia | M | 408 Bradley Rest | Doe Hill | ... | 38.4207 | -79.4629 | 99 | Dance movement psychotherapist | 1986-03-28 | a41d7549acf90789359a9aa5346dcb46 | 1325376186 | 38.674999 | -78.632459 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1048570 | 2020-03-10 16:07:00 | 6.011980e+15 | fraud_Fadel Inc | health_fitness | 77.00 | Haley | Wagner | F | 05561 Farrell Crescent | Annapolis | ... | 39.0305 | -76.5515 | 92106 | Accountant, chartered certified | 1943-05-28 | 45ecd198c65e81e597db22e8d2ef7361 | 1362931649 | 38.779464 | -76.317042 | 0 |
1048571 | 2020-03-10 16:07:00 | 4.839040e+15 | fraud_Cremin, Hamill and Reichel | misc_pos | 116.94 | Meredith | Campbell | F | 043 Hanson Turnpike | Hedrick | ... | 41.1826 | -92.3097 | 1583 | Geochemist | 1999-06-28 | c00ce51c6ebb7657474a77b9e0b51f34 | 1362931670 | 41.400318 | -92.726724 | 0 |
1048572 | 2020-03-10 16:08:00 | 5.718440e+11 | fraud_O'Connell, Botsford and Hand | home | 21.27 | Susan | Mills | F | 005 Cody Estates | Louisville | ... | 38.2507 | -85.7476 | 736284 | Engineering geologist | 1952-04-02 | 17c9dc8b2a6449ca2473726346e58e6c | 1362931711 | 37.293339 | -84.798122 | 0 |
1048573 | 2020-03-10 16:08:00 | 4.646850e+18 | fraud_Thompson-Gleason | health_fitness | 9.52 | Julia | Bell | F | 576 House Crossroad | West Sayville | ... | 40.7320 | -73.1000 | 4056 | Film/video editor | 1990-06-25 | 5ca650881b48a6a38754f841c23b77ab | 1362931718 | 39.773077 | -72.213209 | 0 |
1048574 | 2020-03-10 16:08:00 | 2.283740e+15 | fraud_Buckridge PLC | misc_pos | 6.81 | Shannon | Williams | F | 9345 Spencer Junctions Suite 183 | Alpharetta | ... | 34.0770 | -84.3033 | 165556 | Prison officer | 1997-12-27 | 8d0a575fe635bbde12f1a2bffc126731 | 1362931730 | 33.601468 | -83.891921 | 0 |
1048575 rows × 22 columns
Data cleanup
_df1 = fraudTrain[fraudTrain["is_fraud"] == 0].sample(frac=0.20, random_state=42)
_df2 = fraudTrain[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
df50 = df50.reset_index()
N = len(df50)
autogluon1: amt
= df50[["amt","is_fraud"]]
df50 = sklearn.model_selection.train_test_split(df50, random_state=42) df50_tr,df50_test
A. Data
tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)
B. Create the predictor
= TabularPredictor("is_fraud") predictr
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_124800/"
C. Fit
predictr.fit(tr)
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_124800/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail: 746.82 GB / 982.82 GB (76.0%)
Train Data Rows: 9009
Train Data Columns: 1
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [1, 0]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 14427.76 MB
Train Data (Original) Memory Usage: 0.07 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 1 | ['amt']
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 1 | ['amt']
0.0s = Fit runtime
1 features in original data used to generate 1 features in processed data.
Train Data (Processed) Memory Usage: 0.07 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.05s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8108, Val Rows: 901
User-specified model hyperparameters to be fit:
{
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
0.8779 = Validation score (accuracy)
0.01s = Training runtime
0.01s = Validation runtime
Fitting model: KNeighborsDist ...
0.8635 = Validation score (accuracy)
0.0s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBMXT ...
0.8768 = Validation score (accuracy)
0.15s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBM ...
0.8923 = Validation score (accuracy)
0.23s = Training runtime
0.0s = Validation runtime
Fitting model: RandomForestGini ...
0.8513 = Validation score (accuracy)
0.31s = Training runtime
0.03s = Validation runtime
Fitting model: RandomForestEntr ...
0.8513 = Validation score (accuracy)
0.32s = Training runtime
0.03s = Validation runtime
Fitting model: CatBoost ...
0.8946 = Validation score (accuracy)
0.64s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesGini ...
0.8602 = Validation score (accuracy)
0.28s = Training runtime
0.03s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.8579 = Validation score (accuracy)
0.28s = Training runtime
0.03s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 1: early stopping
0.8635 = Validation score (accuracy)
2.93s = Training runtime
0.01s = Validation runtime
Fitting model: XGBoost ...
0.8935 = Validation score (accuracy)
0.11s = Training runtime
0.0s = Validation runtime
Fitting model: NeuralNetTorch ...
0.8857 = Validation score (accuracy)
4.91s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBMLarge ...
0.8946 = Validation score (accuracy)
0.35s = Training runtime
0.0s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.9023 = Validation score (accuracy)
0.48s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 11.38s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_124800/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f1ff7efc670>
predictr.leaderboard()
 | model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order |
---|---|---|---|---|---|---|---|---|---|
0 | WeightedEnsemble_L2 | 0.902331 | 0.051067 | 7.458454 | 0.001309 | 0.513563 | 2 | True | 14 |
1 | LightGBMLarge | 0.894562 | 0.001894 | 0.414943 | 0.001894 | 0.414943 | 1 | True | 13 |
2 | CatBoost | 0.894562 | 0.001981 | 0.653966 | 0.001981 | 0.653966 | 1 | True | 7 |
3 | XGBoost | 0.893452 | 0.003316 | 0.114061 | 0.003316 | 0.114061 | 1 | True | 11 |
4 | LightGBM | 0.892342 | 0.003488 | 0.343734 | 0.003488 | 0.343734 | 1 | True | 4 |
5 | NeuralNetTorch | 0.885683 | 0.005610 | 5.186066 | 0.005610 | 5.186066 | 1 | True | 12 |
6 | KNeighborsUnif | 0.877913 | 0.006206 | 0.028794 | 0.006206 | 0.028794 | 1 | True | 1 |
7 | LightGBMXT | 0.876804 | 0.002243 | 0.245948 | 0.002243 | 0.245948 | 1 | True | 3 |
8 | KNeighborsDist | 0.863485 | 0.005649 | 0.024392 | 0.005649 | 0.024392 | 1 | True | 2 |
9 | NeuralNetFastAI | 0.863485 | 0.008246 | 2.861539 | 0.008246 | 2.861539 | 1 | True | 10 |
10 | ExtraTreesGini | 0.860155 | 0.029064 | 0.305516 | 0.029064 | 0.305516 | 1 | True | 8 |
11 | ExtraTreesEntr | 0.857936 | 0.029304 | 0.306215 | 0.029304 | 0.306215 | 1 | True | 9 |
12 | RandomForestEntr | 0.851276 | 0.028594 | 0.344763 | 0.028594 | 0.344763 | 1 | True | 6 |
13 | RandomForestGini | 0.851276 | 0.028716 | 0.321282 | 0.028716 | 0.321282 | 1 | True | 5 |
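The held-out tst split is created in step A but never scored above. A minimal sketch of how it could be evaluated with the metrics imported at the top (our addition, not part of the original run):

```python
yhat = predictr.predict(tst)          # hard class predictions on the held-out split
y = tst["is_fraud"]
precision_score(y, yhat), recall_score(y, yhat), f1_score(y, yhat)
```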
autogluon2: amt, distance
(The distance_km column used below is created by the haversine helper shown under autogluon3.)
= df50[["amt","distance_km", "is_fraud"]] df50
= sklearn.model_selection.train_test_split(df50, random_state=42) df50_tr,df50_test
A. Data
tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)
B. Create the predictor
= TabularPredictor("is_fraud") predictr
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_125208/"
C. Fit
predictr.fit(tr)
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_125208/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail: 746.64 GB / 982.82 GB (76.0%)
Train Data Rows: 9009
Train Data Columns: 2
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [1, 0]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 14301.5 MB
Train Data (Original) Memory Usage: 0.14 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 2 | ['amt', 'distance_km']
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 2 | ['amt', 'distance_km']
0.0s = Fit runtime
2 features in original data used to generate 2 features in processed data.
Train Data (Processed) Memory Usage: 0.14 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.04s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8108, Val Rows: 901
User-specified model hyperparameters to be fit:
{
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
0.8646 = Validation score (accuracy)
0.01s = Training runtime
0.01s = Validation runtime
Fitting model: KNeighborsDist ...
0.8535 = Validation score (accuracy)
0.0s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBMXT ...
0.8879 = Validation score (accuracy)
0.33s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBM ...
0.8912 = Validation score (accuracy)
0.22s = Training runtime
0.0s = Validation runtime
Fitting model: RandomForestGini ...
0.8701 = Validation score (accuracy)
0.32s = Training runtime
0.03s = Validation runtime
Fitting model: RandomForestEntr ...
0.8735 = Validation score (accuracy)
0.35s = Training runtime
0.03s = Validation runtime
Fitting model: CatBoost ...
0.899 = Validation score (accuracy)
0.51s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesGini ...
0.8613 = Validation score (accuracy)
0.28s = Training runtime
0.03s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.8646 = Validation score (accuracy)
0.28s = Training runtime
0.03s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 1: early stopping
0.8624 = Validation score (accuracy)
2.94s = Training runtime
0.01s = Validation runtime
Fitting model: XGBoost ...
0.889 = Validation score (accuracy)
0.18s = Training runtime
0.0s = Validation runtime
Fitting model: NeuralNetTorch ...
0.8857 = Validation score (accuracy)
4.49s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBMLarge ...
0.8779 = Validation score (accuracy)
0.38s = Training runtime
0.0s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.899 = Validation score (accuracy)
0.5s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 11.21s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_125208/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f201b90b790>
predictr.leaderboard()
 | model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order |
---|---|---|---|---|---|---|---|---|---|
0 | CatBoost | 0.899001 | 0.001843 | 0.508956 | 0.001843 | 0.508956 | 1 | True | 7 |
1 | WeightedEnsemble_L2 | 0.899001 | 0.003138 | 1.012600 | 0.001295 | 0.503644 | 2 | True | 14 |
2 | LightGBM | 0.891232 | 0.002272 | 0.223588 | 0.002272 | 0.223588 | 1 | True | 4 |
3 | XGBoost | 0.889012 | 0.003450 | 0.180574 | 0.003450 | 0.180574 | 1 | True | 11 |
4 | LightGBMXT | 0.887902 | 0.006675 | 0.331872 | 0.006675 | 0.331872 | 1 | True | 3 |
5 | NeuralNetTorch | 0.885683 | 0.005594 | 4.485880 | 0.005594 | 4.485880 | 1 | True | 12 |
6 | LightGBMLarge | 0.877913 | 0.001962 | 0.382782 | 0.001962 | 0.382782 | 1 | True | 13 |
7 | RandomForestEntr | 0.873474 | 0.030227 | 0.347987 | 0.030227 | 0.347987 | 1 | True | 6 |
8 | RandomForestGini | 0.870144 | 0.028919 | 0.315052 | 0.028919 | 0.315052 | 1 | True | 5 |
9 | KNeighborsUnif | 0.864595 | 0.006738 | 0.009886 | 0.006738 | 0.009886 | 1 | True | 1 |
10 | ExtraTreesEntr | 0.864595 | 0.029063 | 0.283869 | 0.029063 | 0.283869 | 1 | True | 9 |
11 | NeuralNetFastAI | 0.862375 | 0.011110 | 2.937038 | 0.011110 | 2.937038 | 1 | True | 10 |
12 | ExtraTreesGini | 0.861265 | 0.028793 | 0.284426 | 0.028793 | 0.284426 | 1 | True | 8 |
13 | KNeighborsDist | 0.853496 | 0.004887 | 0.004544 | 0.004887 | 0.004544 | 1 | True | 2 |
autogluon3: amt, time, distance
df50['trans_date_trans_time'] = pd.to_datetime(df50['trans_date_trans_time'])
df50['trans_date_trans_time'] = (df50['trans_date_trans_time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
category_map = {category: index for index, category in enumerate(df50['category'].unique())}
df50['category'] = df50['category'].map(category_map)
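As a quick illustration of the epoch-seconds conversion above:

```python
# 2019-01-01 00:00:00 lies 1546300800 seconds after the Unix epoch
(pd.Timestamp("2019-01-01") - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")  # 1546300800
```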
def haversine(lat1, lon1, lat2, lon2):
    # Earth's radius (km)
    radius = 6371.0
    # convert degrees to radians
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = radius * c
    return distance
customer_lat = df50['lat']
customer_lon = df50['long']
store_lat = df50['merch_lat']
store_lon = df50['merch_long']
distances = haversine(customer_lat, customer_lon, store_lat, store_lon)
df50['distance_km'] = distances
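A small sanity check of the haversine helper (values approximate):

```python
haversine(0.0, 0.0, 0.0, 0.0)     # 0.0 km: identical points
haversine(0.0, 0.0, 0.0, 180.0)   # ~20015.1 km: half of Earth's circumference
```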
= df50[["amt",'trans_date_trans_time', 'distance_km', "is_fraud"]] df50
= sklearn.model_selection.train_test_split(df50, random_state=42) df50_tr,df50_test
A. Data
tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)
B. Create the predictor
= TabularPredictor("is_fraud") predictr
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_124455/"
C. Fit
predictr.fit(tr)
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_124455/"
AutoGluon Version: 0.8.2
Python Version: 3.8.18
Operating System: Linux
Platform Machine: x86_64
Platform Version: #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail: 747.06 GB / 982.82 GB (76.0%)
Train Data Rows: 9009
Train Data Columns: 3
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
2 unique label values: [1, 0]
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping: class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
Available Memory: 14638.47 MB
Train Data (Original) Memory Usage: 0.22 MB (0.0% of available memory)
Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
Stage 1 Generators:
Fitting AsTypeFeatureGenerator...
Stage 2 Generators:
Fitting FillNaFeatureGenerator...
Stage 3 Generators:
Fitting IdentityFeatureGenerator...
Stage 4 Generators:
Fitting DropUniqueFeatureGenerator...
Stage 5 Generators:
Fitting DropDuplicatesFeatureGenerator...
Types of features in original data (raw dtype, special dtypes):
('float', []) : 2 | ['amt', 'distance_km']
('int', []) : 1 | ['trans_date_trans_time']
Types of features in processed data (raw dtype, special dtypes):
('float', []) : 2 | ['amt', 'distance_km']
('int', []) : 1 | ['trans_date_trans_time']
0.0s = Fit runtime
3 features in original data used to generate 3 features in processed data.
Train Data (Processed) Memory Usage: 0.22 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.04s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 8108, Val Rows: 901
User-specified model hyperparameters to be fit:
{
'NN_TORCH': {},
'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
'CAT': {},
'XGB': {},
'FASTAI': {},
'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif ...
0.727 = Validation score (accuracy)
0.02s = Training runtime
0.01s = Validation runtime
Fitting model: KNeighborsDist ...
0.7236 = Validation score (accuracy)
0.01s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBMXT ...
0.8812 = Validation score (accuracy)
0.27s = Training runtime
0.0s = Validation runtime
Fitting model: LightGBM ...
0.8912 = Validation score (accuracy)
0.19s = Training runtime
0.0s = Validation runtime
Fitting model: RandomForestGini ...
0.8757 = Validation score (accuracy)
0.33s = Training runtime
0.03s = Validation runtime
Fitting model: RandomForestEntr ...
0.8835 = Validation score (accuracy)
0.36s = Training runtime
0.03s = Validation runtime
Fitting model: CatBoost ...
0.8923 = Validation score (accuracy)
0.89s = Training runtime
0.0s = Validation runtime
Fitting model: ExtraTreesGini ...
0.8701 = Validation score (accuracy)
0.29s = Training runtime
0.03s = Validation runtime
Fitting model: ExtraTreesEntr ...
0.8724 = Validation score (accuracy)
0.3s = Training runtime
0.03s = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 4: early stopping
0.8602 = Validation score (accuracy)
3.48s = Training runtime
0.01s = Validation runtime
Fitting model: XGBoost ...
0.8923 = Validation score (accuracy)
0.14s = Training runtime
0.0s = Validation runtime
Fitting model: NeuralNetTorch ...
0.8746 = Validation score (accuracy)
3.71s = Training runtime
0.01s = Validation runtime
Fitting model: LightGBMLarge ...
0.8768 = Validation score (accuracy)
0.35s = Training runtime
0.0s = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
0.9279 = Validation score (accuracy)
0.5s = Training runtime
0.0s = Validation runtime
AutoGluon training complete, total runtime = 11.23s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_124455/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f205bd3bc70>
predictr.leaderboard()
 | model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order |
---|---|---|---|---|---|---|---|---|---|
0 | WeightedEnsemble_L2 | 0.927858 | 0.034292 | 9.131091 | 0.001279 | 0.502710 | 2 | True | 14 |
1 | CatBoost | 0.892342 | 0.002311 | 0.885833 | 0.002311 | 0.885833 | 1 | True | 7 |
2 | XGBoost | 0.892342 | 0.003765 | 0.143192 | 0.003765 | 0.143192 | 1 | True | 11 |
3 | LightGBM | 0.891232 | 0.002339 | 0.189846 | 0.002339 | 0.189846 | 1 | True | 4 |
4 | RandomForestEntr | 0.883463 | 0.029521 | 0.356271 | 0.029521 | 0.356271 | 1 | True | 6 |
5 | LightGBMXT | 0.881243 | 0.003287 | 0.271952 | 0.003287 | 0.271952 | 1 | True | 3 |
6 | LightGBMLarge | 0.876804 | 0.001867 | 0.351746 | 0.001867 | 0.351746 | 1 | True | 13 |
7 | RandomForestGini | 0.875694 | 0.030285 | 0.330684 | 0.030285 | 0.330684 | 1 | True | 5 |
8 | NeuralNetTorch | 0.874584 | 0.005663 | 3.705228 | 0.005663 | 3.705228 | 1 | True | 12 |
9 | ExtraTreesEntr | 0.872364 | 0.030204 | 0.300827 | 0.030204 | 0.300827 | 1 | True | 9 |
10 | ExtraTreesGini | 0.870144 | 0.029252 | 0.293159 | 0.029252 | 0.293159 | 1 | True | 8 |
11 | NeuralNetFastAI | 0.860155 | 0.008793 | 3.475210 | 0.008793 | 3.475210 | 1 | True | 10 |
12 | KNeighborsUnif | 0.726970 | 0.007053 | 0.015347 | 0.007053 | 0.015347 | 1 | True | 1 |
13 | KNeighborsDist | 0.723640 | 0.004987 | 0.005171 | 0.004987 | 0.005171 | 1 | True | 2 |
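With three features in play it is natural to ask which one carries the signal; a minimal sketch using AutoGluon's built-in permutation importance (our addition, not run in the original):

```python
# Permutation feature importance on the held-out split:
# larger values mean a bigger score drop when that column is shuffled.
predictr.feature_importance(tst)
```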