[Autogluon] df50 X범주_auto_best

Author

김보람

Published

October 11, 2023

imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import networkx as nx
import sklearn
import xgboost as xgb

# sklearn
from sklearn import model_selection # split함수이용
from sklearn import ensemble # RF,GBM
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# gnn
import torch
import torch.nn.functional as F
import torch_geometric
from torch_geometric.nn import GCNConv


# autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:18: UserWarning: An issue occurred while importing 'pyg-lib'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/libpyg.so: undefined symbol: _ZN2at4_ops12split_Tensor4callERKNS_6TensorEN3c106SymIntEl
  warnings.warn(f"An issue occurred while importing 'pyg-lib'. "
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:31: UserWarning: An issue occurred while importing 'torch-scatter'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_scatter/_scatter_cuda.so: undefined symbol: _ZNK3c107SymBool10guard_boolEPKcl
  warnings.warn(f"An issue occurred while importing 'torch-scatter'. "
/home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_geometric/typing.py:42: UserWarning: An issue occurred while importing 'torch-sparse'. Disabling its usage. Stacktrace: /home/coco/anaconda3/envs/py38/lib/python3.8/site-packages/torch_sparse/_diag_cuda.so: undefined symbol: _ZN3c106detail19maybe_wrap_dim_slowIlEET_S2_S2_b
  warnings.warn(f"An issue occurred while importing 'torch-sparse'. "
def down_sample_textbook(df):
    """Balance ``df`` by undersampling the ``is_fraud == 0`` rows.

    Every ``is_fraud == 1`` row is kept. The same number of ``is_fraud == 0``
    rows is drawn without replacement using a fixed seed (42).

    Args:
        df: DataFrame with an ``is_fraud`` column.

    Returns:
        A new DataFrame with the ``is_fraud == 1`` rows first, followed by
        the sampled ``is_fraud == 0`` rows. Original row labels are kept.
    """
    fraud_rows = df[df["is_fraud"] == 1].copy()
    normal_rows = df[df["is_fraud"] == 0].copy()
    normal_subset = sklearn.utils.resample(
        normal_rows,
        replace=False,
        n_samples=len(fraud_rows),
        random_state=42,
    )
    return pd.concat([fraud_rows, normal_subset])

def compute_time_difference(group):
    """Return the absolute time gap for every ordered pair of rows.

    Args:
        group: DataFrame whose ``trans_date_trans_time`` column holds pandas
            Timestamps.

    Returns:
        A list of ``[label_i, label_j, gap]`` entries, one per ordered pair of
        rows (self-pairs included), in row-major order. ``label_*`` are the
        index labels of the rows and ``gap`` is the absolute difference of the
        two ``Timestamp.value`` integers (nanoseconds). An empty ``group``
        gives an empty list.
    """
    # Pull labels and timestamp values out once; the pair loop below then
    # works on plain Python lists instead of building a row Series per access.
    # Indexing by position keeps the same label scalar type as ``row.name``.
    labels = [group.index[pos] for pos in range(len(group))]
    stamps = [ts.value for ts in group["trans_date_trans_time"]]
    return [
        [label_i, label_j, abs(stamp_i - stamp_j)]
        for label_i, stamp_i in zip(labels, stamps)
        for label_j, stamp_j in zip(labels, stamps)
    ]


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network.

    The layers are ``GCNConv(1, 16)`` followed by ``GCNConv(16, 2)``; the
    forward pass returns row-wise log-softmax scores.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(1, 16)
        self.conv2 = GCNConv(16, 2)

    def forward(self, data):
        """Run both convolutions over ``data.x`` using ``data.edge_index``.

        Args:
            data: Object exposing ``x`` and ``edge_index`` attributes.

        Returns:
            Log-softmax of the second layer's output, taken over ``dim=1``.
        """
        edges = data.edge_index
        hidden = F.relu(self.conv1(data.x, edges))
        # Dropout is only active while the module is in training mode.
        hidden = F.dropout(hidden, training=self.training)
        logits = self.conv2(hidden, edges)
        return F.log_softmax(logits, dim=1)
# Load the raw transactions and drop the CSV's first column by position
# (presumably a saved row index; confirm against the file).
fraudTrain = pd.read_csv("~/Desktop/fraudTrain.csv").iloc[:, 1:]
# Parse the timestamp strings with one vectorized call rather than one
# pd.to_datetime call per row.
fraudTrain = fraudTrain.assign(
    trans_date_trans_time=pd.to_datetime(fraudTrain["trans_date_trans_time"])
)
fraudTrain
trans_date_trans_time cc_num merchant category amt first last gender street city ... lat long city_pop job dob trans_num unix_time merch_lat merch_long is_fraud
0 2019-01-01 00:00:00 2.703190e+15 fraud_Rippin, Kub and Mann misc_net 4.97 Jennifer Banks F 561 Perry Cove Moravian Falls ... 36.0788 -81.1781 3495 Psychologist, counselling 1988-03-09 0b242abb623afc578575680df30655b9 1325376018 36.011293 -82.048315 0
1 2019-01-01 00:00:00 6.304230e+11 fraud_Heller, Gutmann and Zieme grocery_pos 107.23 Stephanie Gill F 43039 Riley Greens Suite 393 Orient ... 48.8878 -118.2105 149 Special educational needs teacher 1978-06-21 1f76529f8574734946361c461b024d99 1325376044 49.159047 -118.186462 0
2 2019-01-01 00:00:00 3.885950e+13 fraud_Lind-Buckridge entertainment 220.11 Edward Sanchez M 594 White Dale Suite 530 Malad City ... 42.1808 -112.2620 4154 Nature conservation officer 1962-01-19 a1a22d70485983eac12b5b88dad1cf95 1325376051 43.150704 -112.154481 0
3 2019-01-01 00:01:00 3.534090e+15 fraud_Kutch, Hermiston and Farrell gas_transport 45.00 Jeremy White M 9443 Cynthia Court Apt. 038 Boulder ... 46.2306 -112.1138 1939 Patent attorney 1967-01-12 6b849c168bdad6f867558c3793159a81 1325376076 47.034331 -112.561071 0
4 2019-01-01 00:03:00 3.755340e+14 fraud_Keeling-Crist misc_pos 41.96 Tyler Garcia M 408 Bradley Rest Doe Hill ... 38.4207 -79.4629 99 Dance movement psychotherapist 1986-03-28 a41d7549acf90789359a9aa5346dcb46 1325376186 38.674999 -78.632459 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1048570 2020-03-10 16:07:00 6.011980e+15 fraud_Fadel Inc health_fitness 77.00 Haley Wagner F 05561 Farrell Crescent Annapolis ... 39.0305 -76.5515 92106 Accountant, chartered certified 1943-05-28 45ecd198c65e81e597db22e8d2ef7361 1362931649 38.779464 -76.317042 0
1048571 2020-03-10 16:07:00 4.839040e+15 fraud_Cremin, Hamill and Reichel misc_pos 116.94 Meredith Campbell F 043 Hanson Turnpike Hedrick ... 41.1826 -92.3097 1583 Geochemist 1999-06-28 c00ce51c6ebb7657474a77b9e0b51f34 1362931670 41.400318 -92.726724 0
1048572 2020-03-10 16:08:00 5.718440e+11 fraud_O'Connell, Botsford and Hand home 21.27 Susan Mills F 005 Cody Estates Louisville ... 38.2507 -85.7476 736284 Engineering geologist 1952-04-02 17c9dc8b2a6449ca2473726346e58e6c 1362931711 37.293339 -84.798122 0
1048573 2020-03-10 16:08:00 4.646850e+18 fraud_Thompson-Gleason health_fitness 9.52 Julia Bell F 576 House Crossroad West Sayville ... 40.7320 -73.1000 4056 Film/video editor 1990-06-25 5ca650881b48a6a38754f841c23b77ab 1362931718 39.773077 -72.213209 0
1048574 2020-03-10 16:08:00 2.283740e+15 fraud_Buckridge PLC misc_pos 6.81 Shannon Williams F 9345 Spencer Junctions Suite 183 Alpharetta ... 34.0770 -84.3033 165556 Prison officer 1997-12-27 8d0a575fe635bbde12f1a2bffc126731 1362931730 33.601468 -83.891921 0

1048575 rows × 22 columns

데이터정리

# Working set: a seeded 20% sample of the is_fraud == 0 rows plus every
# is_fraud == 1 row.
_df1 = (
    fraudTrain.loc[fraudTrain["is_fraud"] == 0]
    .sample(frac=0.20, random_state=42)
)
_df2 = fraudTrain.loc[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
# Undersample the is_fraud == 0 rows so both classes have the same row count.
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
# Renumber rows from 0; the previous row labels move into an "index" column.
df50 = df50.reset_index()
# Number of rows in the balanced sample.
N = len(df50)

autogluon1: amt

# Keep the transaction amount as the only feature, plus the label.
df50 = df50[["amt","is_fraud"]]
# Seeded split with the default proportions (no test_size is given).
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)

A. 데이터

# Wrap the train and test splits as AutoGluon TabularDataset objects.
tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)

B. predictor 생성

# Predictor for the "is_fraud" label; no save path is passed.
predictr = TabularPredictor("is_fraud")
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_125519/"

C. 적합(fit)

# Train on the training split with the 'best_quality' preset.
predictr.fit(tr, presets='best_quality')
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_125519/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail:   746.18 GB / 982.82 GB (75.9%)
Train Data Rows:    9009
Train Data Columns: 1
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    12696.17 MB
    Train Data (Original)  Memory Usage: 0.07 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 1 | ['amt']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 1 | ['amt']
    0.0s = Fit runtime
    1 features in original data used to generate 1 features in processed data.
    Train Data (Processed) Memory Usage: 0.07 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.02s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif_BAG_L1 ...
    0.8782   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: KNeighborsDist_BAG_L1 ...
    0.8641   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBMXT_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.885    = Validation score   (accuracy)
    0.5s     = Training   runtime
    0.03s    = Validation runtime
Fitting model: LightGBM_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.894    = Validation score   (accuracy)
    0.64s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: RandomForestGini_BAG_L1 ...
    0.856    = Validation score   (accuracy)
    0.35s    = Training   runtime
    0.19s    = Validation runtime
Fitting model: RandomForestEntr_BAG_L1 ...
    0.856    = Validation score   (accuracy)
    0.6s     = Training   runtime
    0.19s    = Validation runtime
Fitting model: CatBoost_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8947   = Validation score   (accuracy)
    1.56s    = Training   runtime
    0.0s     = Validation runtime
Fitting model: ExtraTreesGini_BAG_L1 ...
    0.8622   = Validation score   (accuracy)
    0.33s    = Training   runtime
    0.21s    = Validation runtime
Fitting model: ExtraTreesEntr_BAG_L1 ...
    0.8626   = Validation score   (accuracy)
    0.31s    = Training   runtime
    0.25s    = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.867    = Validation score   (accuracy)
    9.1s     = Training   runtime
    0.17s    = Validation runtime
Fitting model: XGBoost_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8944   = Validation score   (accuracy)
    0.57s    = Training   runtime
    0.02s    = Validation runtime
Fitting model: NeuralNetTorch_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8888   = Validation score   (accuracy)
    18.41s   = Training   runtime
    0.09s    = Validation runtime
Fitting model: LightGBMLarge_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8941   = Validation score   (accuracy)
    0.85s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.8948   = Validation score   (accuracy)
    2.26s    = Training   runtime
    0.01s    = Validation runtime
AutoGluon training complete, total runtime = 45.37s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_125519/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcc0a6335b0>
# Called without data, so only validation scores (score_val) are listed.
# NOTE(review): `tst` is not passed here; consider predictr.leaderboard(tst)
# to also report held-out test scores — confirm intent.
predictr.leaderboard()
                      model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.894772       0.020864   4.666441                0.009758           2.256641            2       True         14
1           CatBoost_BAG_L1   0.894661       0.004361   1.564344                0.004361           1.564344            1       True          7
2            XGBoost_BAG_L1   0.894439       0.022174   0.569362                0.022174           0.569362            1       True         11
3      LightGBMLarge_BAG_L1   0.894106       0.006745   0.845457                0.006745           0.845457            1       True         13
4           LightGBM_BAG_L1   0.893995       0.016963   0.637937                0.016963           0.637937            1       True          4
5     NeuralNetTorch_BAG_L1   0.888778       0.089444  18.408010                0.089444          18.408010            1       True         12
6         LightGBMXT_BAG_L1   0.885004       0.033841   0.504977                0.033841           0.504977            1       True          3
7     KNeighborsUnif_BAG_L1   0.878233       0.009831   0.002898                0.009831           0.002898            1       True          1
8    NeuralNetFastAI_BAG_L1   0.867022       0.171568   9.103081                0.171568           9.103081            1       True         10
9     KNeighborsDist_BAG_L1   0.864136       0.008431   0.003336                0.008431           0.003336            1       True          2
10    ExtraTreesEntr_BAG_L1   0.862582       0.253638   0.310474                0.253638           0.310474            1       True          9
11    ExtraTreesGini_BAG_L1   0.862249       0.210405   0.327562                0.210405           0.327562            1       True          8
12  RandomForestEntr_BAG_L1   0.856033       0.189045   0.599637                0.189045           0.599637            1       True          6
13  RandomForestGini_BAG_L1   0.856033       0.191019   0.349150                0.191019           0.349150            1       True          5
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 WeightedEnsemble_L2 0.894772 0.020864 4.666441 0.009758 2.256641 2 True 14
1 CatBoost_BAG_L1 0.894661 0.004361 1.564344 0.004361 1.564344 1 True 7
2 XGBoost_BAG_L1 0.894439 0.022174 0.569362 0.022174 0.569362 1 True 11
3 LightGBMLarge_BAG_L1 0.894106 0.006745 0.845457 0.006745 0.845457 1 True 13
4 LightGBM_BAG_L1 0.893995 0.016963 0.637937 0.016963 0.637937 1 True 4
5 NeuralNetTorch_BAG_L1 0.888778 0.089444 18.408010 0.089444 18.408010 1 True 12
6 LightGBMXT_BAG_L1 0.885004 0.033841 0.504977 0.033841 0.504977 1 True 3
7 KNeighborsUnif_BAG_L1 0.878233 0.009831 0.002898 0.009831 0.002898 1 True 1
8 NeuralNetFastAI_BAG_L1 0.867022 0.171568 9.103081 0.171568 9.103081 1 True 10
9 KNeighborsDist_BAG_L1 0.864136 0.008431 0.003336 0.008431 0.003336 1 True 2
10 ExtraTreesEntr_BAG_L1 0.862582 0.253638 0.310474 0.253638 0.310474 1 True 9
11 ExtraTreesGini_BAG_L1 0.862249 0.210405 0.327562 0.210405 0.327562 1 True 8
12 RandomForestEntr_BAG_L1 0.856033 0.189045 0.599637 0.189045 0.599637 1 True 6
13 RandomForestGini_BAG_L1 0.856033 0.191019 0.349150 0.191019 0.349150 1 True 5

autogluon2: amt, distance

# Working set: a seeded 20% sample of the is_fraud == 0 rows plus every
# is_fraud == 1 row.
_df1 = (
    fraudTrain.loc[fraudTrain["is_fraud"] == 0]
    .sample(frac=0.20, random_state=42)
)
_df2 = fraudTrain.loc[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
# Undersample the is_fraud == 0 rows so both classes have the same row count.
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
# Renumber rows from 0; the previous row labels move into an "index" column.
df50 = df50.reset_index()
# Convert the timestamps to whole seconds since 1970-01-01.
df50['trans_date_trans_time'] = pd.to_datetime(df50['trans_date_trans_time'])
df50['trans_date_trans_time'] = (df50['trans_date_trans_time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# Encode each category as an integer, numbered in order of first appearance.
# NOTE(review): the column selection later in this section keeps only
# amt, distance_km and is_fraud, so the converted timestamp and this
# encoding look unused here — confirm before removing.
category_map = {category: index for index, category in enumerate(df50['category'].unique())}
df50['category'] = df50['category'].map(category_map)

def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two points on a sphere.

    Inputs may be scalars or array-likes (NumPy arrays, pandas Series); the
    computation is element-wise.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.
        radius: Sphere radius. The result is in the same unit. The default
            6371.0 is the Earth's mean radius in kilometres, so by default
            the result is in kilometres.

    Returns:
        The haversine distance, with the same shape as the inputs.
    """
    # Convert degrees to radians.
    lat1, lon1, lat2, lon2 = (np.radians(v) for v in (lat1, lon1, lat2, lon2))

    # Haversine formula.
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1]; clip it so that
    # sqrt(1 - a) cannot become NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

# Coordinates for both ends of each transaction (presumably the cardholder's
# location and the merchant's location — confirm against the dataset docs).
customer_lat = df50['lat']
customer_lon = df50['long']
store_lat = df50['merch_lat']
store_lon = df50['merch_long']
# Element-wise haversine distance for every row.
distances = haversine(customer_lat, customer_lon, store_lat, store_lon)
df50['distance_km'] = distances
# Features for this run: amount and distance; the label is is_fraud.
df50 = df50[["amt","distance_km", "is_fraud"]]
# Seeded split with the default proportions (no test_size is given).
df50_tr,df50_test = sklearn.model_selection.train_test_split(df50, random_state=42)

A. 데이터

# Wrap the train and test splits as AutoGluon TabularDataset objects.
tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)

B. predictor 생성

# Predictor for the "is_fraud" label; no save path is passed.
predictr = TabularPredictor("is_fraud")
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_125738/"

C. 적합(fit)

# Train on the training split with the 'best_quality' preset.
predictr.fit(tr, presets='best_quality')
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_125738/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail:   738.25 GB / 982.82 GB (75.1%)
Train Data Rows:    9009
Train Data Columns: 2
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    10891.21 MB
    Train Data (Original)  Memory Usage: 0.14 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 2 | ['amt', 'distance_km']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 2 | ['amt', 'distance_km']
    0.0s = Fit runtime
    2 features in original data used to generate 2 features in processed data.
    Train Data (Processed) Memory Usage: 0.14 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.03s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif_BAG_L1 ...
    0.8779   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.02s    = Validation runtime
Fitting model: KNeighborsDist_BAG_L1 ...
    0.8716   = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBMXT_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8898   = Validation score   (accuracy)
    0.77s    = Training   runtime
    0.15s    = Validation runtime
Fitting model: LightGBM_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.894    = Validation score   (accuracy)
    0.66s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: RandomForestGini_BAG_L1 ...
    0.8788   = Validation score   (accuracy)
    0.41s    = Training   runtime
    0.21s    = Validation runtime
Fitting model: RandomForestEntr_BAG_L1 ...
    0.8793   = Validation score   (accuracy)
    0.58s    = Training   runtime
    0.23s    = Validation runtime
Fitting model: CatBoost_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8948   = Validation score   (accuracy)
    2.02s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini_BAG_L1 ...
    0.8771   = Validation score   (accuracy)
    0.31s    = Training   runtime
    0.23s    = Validation runtime
Fitting model: ExtraTreesEntr_BAG_L1 ...
    0.8766   = Validation score   (accuracy)
    0.38s    = Training   runtime
    0.23s    = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8671   = Validation score   (accuracy)
    10.03s   = Training   runtime
    0.11s    = Validation runtime
Fitting model: XGBoost_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8955   = Validation score   (accuracy)
    0.72s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: NeuralNetTorch_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8869   = Validation score   (accuracy)
    14.71s   = Training   runtime
    0.07s    = Validation runtime
Fitting model: LightGBMLarge_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8866   = Validation score   (accuracy)
    0.94s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.8955   = Validation score   (accuracy)
    2.32s    = Training   runtime
    0.01s    = Validation runtime
AutoGluon training complete, total runtime = 42.82s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_125738/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcb15583dc0>
# Called without data, so only validation scores (score_val) are listed.
# NOTE(review): `tst` is not passed here; consider predictr.leaderboard(tst)
# to also report held-out test scores — confirm intent.
predictr.leaderboard()
                      model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0            XGBoost_BAG_L1   0.895549       0.026780   0.716487                0.026780           0.716487            1       True         11
1       WeightedEnsemble_L2   0.895549       0.036584   3.040059                0.009804           2.323572            2       True         14
2           CatBoost_BAG_L1   0.894772       0.007168   2.017300                0.007168           2.017300            1       True          7
3           LightGBM_BAG_L1   0.893995       0.009826   0.656309                0.009826           0.656309            1       True          4
4         LightGBMXT_BAG_L1   0.889777       0.153301   0.774388                0.153301           0.774388            1       True          3
5     NeuralNetTorch_BAG_L1   0.886891       0.067880  14.708995                0.067880          14.708995            1       True         12
6      LightGBMLarge_BAG_L1   0.886558       0.011474   0.938887                0.011474           0.938887            1       True         13
7   RandomForestEntr_BAG_L1   0.879343       0.227656   0.576854                0.227656           0.576854            1       True          6
8   RandomForestGini_BAG_L1   0.878788       0.210062   0.409630                0.210062           0.409630            1       True          5
9     KNeighborsUnif_BAG_L1   0.877900       0.019640   0.004492                0.019640           0.004492            1       True          1
10    ExtraTreesGini_BAG_L1   0.877123       0.230533   0.311612                0.230533           0.311612            1       True          8
11    ExtraTreesEntr_BAG_L1   0.876568       0.228856   0.381866                0.228856           0.381866            1       True          9
12    KNeighborsDist_BAG_L1   0.871573       0.010582   0.003571                0.010582           0.003571            1       True          2
13   NeuralNetFastAI_BAG_L1   0.867133       0.106724  10.029284                0.106724          10.029284            1       True         10
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 XGBoost_BAG_L1 0.895549 0.026780 0.716487 0.026780 0.716487 1 True 11
1 WeightedEnsemble_L2 0.895549 0.036584 3.040059 0.009804 2.323572 2 True 14
2 CatBoost_BAG_L1 0.894772 0.007168 2.017300 0.007168 2.017300 1 True 7
3 LightGBM_BAG_L1 0.893995 0.009826 0.656309 0.009826 0.656309 1 True 4
4 LightGBMXT_BAG_L1 0.889777 0.153301 0.774388 0.153301 0.774388 1 True 3
5 NeuralNetTorch_BAG_L1 0.886891 0.067880 14.708995 0.067880 14.708995 1 True 12
6 LightGBMLarge_BAG_L1 0.886558 0.011474 0.938887 0.011474 0.938887 1 True 13
7 RandomForestEntr_BAG_L1 0.879343 0.227656 0.576854 0.227656 0.576854 1 True 6
8 RandomForestGini_BAG_L1 0.878788 0.210062 0.409630 0.210062 0.409630 1 True 5
9 KNeighborsUnif_BAG_L1 0.877900 0.019640 0.004492 0.019640 0.004492 1 True 1
10 ExtraTreesGini_BAG_L1 0.877123 0.230533 0.311612 0.230533 0.311612 1 True 8
11 ExtraTreesEntr_BAG_L1 0.876568 0.228856 0.381866 0.228856 0.381866 1 True 9
12 KNeighborsDist_BAG_L1 0.871573 0.010582 0.003571 0.010582 0.003571 1 True 2
13 NeuralNetFastAI_BAG_L1 0.867133 0.106724 10.029284 0.106724 10.029284 1 True 10

autogluon3: amt, time, distance

# Working set: a seeded 20% sample of the is_fraud == 0 rows plus every
# is_fraud == 1 row.
_df1 = (
    fraudTrain.loc[fraudTrain["is_fraud"] == 0]
    .sample(frac=0.20, random_state=42)
)
_df2 = fraudTrain.loc[fraudTrain["is_fraud"] == 1]
df02 = pd.concat([_df1, _df2])
df02.shape
(214520, 22)
# Undersample the is_fraud == 0 rows so both classes have the same row count.
df50 = down_sample_textbook(df02)
df50.shape
(12012, 22)
# Renumber rows from 0; the previous row labels move into an "index" column.
df50 = df50.reset_index()
# Convert the timestamps to whole seconds since 1970-01-01 so they can be
# used as a numeric feature.
df50['trans_date_trans_time'] = pd.to_datetime(df50['trans_date_trans_time'])
df50['trans_date_trans_time'] = (df50['trans_date_trans_time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# Encode each category as an integer, numbered in order of first appearance.
# NOTE(review): the column selection later in this section does not keep
# 'category', so this encoding looks unused here — confirm before removing.
category_map = {category: index for index, category in enumerate(df50['category'].unique())}
df50['category'] = df50['category'].map(category_map)

def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two points on a sphere.

    The coordinates are given in decimal degrees. Scalars, NumPy arrays and
    pandas Series are all accepted and are combined element-wise.

    Parameters
    ----------
    lat1, lon1 : latitude / longitude of the first point(s), in degrees.
    lat2, lon2 : latitude / longitude of the second point(s), in degrees.
    radius : radius of the sphere. The default, 6371.0, is the Earth's mean
        radius in kilometres, so the result is in kilometres by default.

    Returns
    -------
    The distance(s), expressed in the same unit as ``radius``.
    """
    # Convert degrees to radians.
    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt(1 - a) NaN; clamp it.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius * c

# Distance between each cardholder location and the merchant location.
customer_lat, customer_lon = df50['lat'], df50['long']
store_lat, store_lon = df50['merch_lat'], df50['merch_long']
distances = haversine(
    lat1=customer_lat,
    lon1=customer_lon,
    lat2=store_lat,
    lon2=store_lon,
)
df50['distance_km'] = distances

# Keep only the three model features plus the label, then split train/test
# with a fixed seed.
model_columns = ["amt", 'trans_date_trans_time', 'distance_km', "is_fraud"]
df50 = df50[model_columns]
df50_tr, df50_test = sklearn.model_selection.train_test_split(
    df50,
    random_state=42,
)

A. 데이터

# Wrap the train and test frames as AutoGluon TabularDataset objects.
tr = TabularDataset(df50_tr)
tst = TabularDataset(df50_test)

B. predictor 생성

# Create a predictor whose label (target) column is "is_fraud"; no save path
# is passed, so AutoGluon picks one (see the message printed below).
predictr = TabularPredictor("is_fraud")
No path specified. Models will be saved in: "AutogluonModels/ag-20231011_125833/"

C. 적합(fit)

# Fit on the training split using the 'best_quality' preset.
predictr.fit(tr, presets='best_quality')
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=0, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231011_125833/"
AutoGluon Version:  0.8.2
Python Version:     3.8.18
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #26~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Thu Jul 13 16:27:29 UTC 2
Disk Space Avail:   738.82 GB / 982.82 GB (75.2%)
Train Data Rows:    9009
Train Data Columns: 3
Label Column: is_fraud
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
    2 unique label values:  [1, 0]
    If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
    Available Memory:                    12201.29 MB
    Train Data (Original)  Memory Usage: 0.22 MB (0.0% of available memory)
    Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
    Stage 1 Generators:
        Fitting AsTypeFeatureGenerator...
    Stage 2 Generators:
        Fitting FillNaFeatureGenerator...
    Stage 3 Generators:
        Fitting IdentityFeatureGenerator...
    Stage 4 Generators:
        Fitting DropUniqueFeatureGenerator...
    Stage 5 Generators:
        Fitting DropDuplicatesFeatureGenerator...
    Types of features in original data (raw dtype, special dtypes):
        ('float', []) : 2 | ['amt', 'distance_km']
        ('int', [])   : 1 | ['trans_date_trans_time']
    Types of features in processed data (raw dtype, special dtypes):
        ('float', []) : 2 | ['amt', 'distance_km']
        ('int', [])   : 1 | ['trans_date_trans_time']
    0.0s = Fit runtime
    3 features in original data used to generate 3 features in processed data.
    Train Data (Processed) Memory Usage: 0.22 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.04s ...
AutoGluon will gauge predictive performance using evaluation metric: 'accuracy'
    To change this, specify the eval_metric parameter of Predictor()
User-specified model hyperparameters to be fit:
{
    'NN_TORCH': {},
    'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, 'GBMLarge'],
    'CAT': {},
    'XGB': {},
    'FASTAI': {},
    'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'XT': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
    'KNN': [{'weights': 'uniform', 'ag_args': {'name_suffix': 'Unif'}}, {'weights': 'distance', 'ag_args': {'name_suffix': 'Dist'}}],
}
Fitting 13 L1 models ...
Fitting model: KNeighborsUnif_BAG_L1 ...
    0.7325   = Validation score   (accuracy)
    0.01s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: KNeighborsDist_BAG_L1 ...
    0.737    = Validation score   (accuracy)
    0.0s     = Training   runtime
    0.01s    = Validation runtime
Fitting model: LightGBMXT_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8918   = Validation score   (accuracy)
    1.14s    = Training   runtime
    0.17s    = Validation runtime
Fitting model: LightGBM_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.9003   = Validation score   (accuracy)
    0.79s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: RandomForestGini_BAG_L1 ...
    0.887    = Validation score   (accuracy)
    0.39s    = Training   runtime
    0.2s     = Validation runtime
Fitting model: RandomForestEntr_BAG_L1 ...
    0.8876   = Validation score   (accuracy)
    0.53s    = Training   runtime
    0.21s    = Validation runtime
Fitting model: CatBoost_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8993   = Validation score   (accuracy)
    2.63s    = Training   runtime
    0.01s    = Validation runtime
Fitting model: ExtraTreesGini_BAG_L1 ...
    0.8818   = Validation score   (accuracy)
    0.32s    = Training   runtime
    0.23s    = Validation runtime
Fitting model: ExtraTreesEntr_BAG_L1 ...
    0.8813   = Validation score   (accuracy)
    0.32s    = Training   runtime
    0.24s    = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8664   = Validation score   (accuracy)
    8.79s    = Training   runtime
    0.13s    = Validation runtime
Fitting model: XGBoost_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8993   = Validation score   (accuracy)
    0.79s    = Training   runtime
    0.04s    = Validation runtime
Fitting model: NeuralNetTorch_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8862   = Validation score   (accuracy)
    15.89s   = Training   runtime
    0.07s    = Validation runtime
Fitting model: LightGBMLarge_BAG_L1 ...
    Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
    0.8928   = Validation score   (accuracy)
    1.36s    = Training   runtime
    0.03s    = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
    0.922    = Validation score   (accuracy)
    2.2s     = Training   runtime
    0.01s    = Validation runtime
AutoGluon training complete, total runtime = 43.5s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20231011_125833/")
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7fcb14ccb700>
# Show the table of fitted models. No dataset is passed, so the scores listed
# are the validation scores (score_val); the held-out `tst` split is not used here.
predictr.leaderboard()
                      model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       WeightedEnsemble_L2   0.921967       1.031116  26.060251                0.009939           2.203469            2       True         14
1           LightGBM_BAG_L1   0.900322       0.031546   0.788781                0.031546           0.788781            1       True          4
2           CatBoost_BAG_L1   0.899323       0.005555   2.631501                0.005555           2.631501            1       True          7
3            XGBoost_BAG_L1   0.899323       0.037676   0.793630                0.037676           0.793630            1       True         11
4      LightGBMLarge_BAG_L1   0.892774       0.034059   1.360121                0.034059           1.360121            1       True         13
5         LightGBMXT_BAG_L1   0.891775       0.173170   1.136529                0.173170           1.136529            1       True          3
6   RandomForestEntr_BAG_L1   0.887557       0.205703   0.534585                0.205703           0.534585            1       True          6
7   RandomForestGini_BAG_L1   0.887002       0.204052   0.386727                0.204052           0.386727            1       True          5
8     NeuralNetTorch_BAG_L1   0.886225       0.067507  15.891491                0.067507          15.891491            1       True         12
9     ExtraTreesGini_BAG_L1   0.881785       0.233411   0.318739                0.233411           0.318739            1       True          8
10    ExtraTreesEntr_BAG_L1   0.881341       0.244429   0.322460                0.244429           0.322460            1       True          9
11   NeuralNetFastAI_BAG_L1   0.866356       0.129635   8.785419                0.129635           8.785419            1       True         10
12    KNeighborsDist_BAG_L1   0.737041       0.008475   0.004331                0.008475           0.004331            1       True          2
13    KNeighborsUnif_BAG_L1   0.732490       0.009005   0.006626                0.009005           0.006626            1       True          1
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 WeightedEnsemble_L2 0.921967 1.031116 26.060251 0.009939 2.203469 2 True 14
1 LightGBM_BAG_L1 0.900322 0.031546 0.788781 0.031546 0.788781 1 True 4
2 CatBoost_BAG_L1 0.899323 0.005555 2.631501 0.005555 2.631501 1 True 7
3 XGBoost_BAG_L1 0.899323 0.037676 0.793630 0.037676 0.793630 1 True 11
4 LightGBMLarge_BAG_L1 0.892774 0.034059 1.360121 0.034059 1.360121 1 True 13
5 LightGBMXT_BAG_L1 0.891775 0.173170 1.136529 0.173170 1.136529 1 True 3
6 RandomForestEntr_BAG_L1 0.887557 0.205703 0.534585 0.205703 0.534585 1 True 6
7 RandomForestGini_BAG_L1 0.887002 0.204052 0.386727 0.204052 0.386727 1 True 5
8 NeuralNetTorch_BAG_L1 0.886225 0.067507 15.891491 0.067507 15.891491 1 True 12
9 ExtraTreesGini_BAG_L1 0.881785 0.233411 0.318739 0.233411 0.318739 1 True 8
10 ExtraTreesEntr_BAG_L1 0.881341 0.244429 0.322460 0.244429 0.322460 1 True 9
11 NeuralNetFastAI_BAG_L1 0.866356 0.129635 8.785419 0.129635 8.785419 1 True 10
12 KNeighborsDist_BAG_L1 0.737041 0.008475 0.004331 0.008475 0.004331 1 True 2
13 KNeighborsUnif_BAG_L1 0.732490 0.009005 0.006626 0.009005 0.006626 1 True 1