[STBDA2023] 14wk-60: 자전거대여 / 하이퍼파라메터 튜닝

Author

김보람

Published

December 10, 2023

14wk-60: 자전거대여 / 하이퍼파라메터 튜닝

최규빈
2023-12-01

1. 강의영상

???

2. Imports

#!pip install autogluon.multimodal 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
#---#
from autogluon.tabular import TabularPredictor
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from autogluon.common import space
#---#
import IPython
import os
import warnings
warnings.filterwarnings('ignore')
2023-12-10 17:19:01.469099: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-10 17:19:02.107172: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT

3. Data

- 자료 다운로드

# Fetch the bike-sharing-demand competition archive with the Kaggle CLI.
!kaggle competitions download -c bike-sharing-demand
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
Downloading bike-sharing-demand.zip to /home/coco/Dropbox/Class/STBDA23/posts
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 822kB/s]
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 821kB/s]
# Extract the downloaded archive into the data/ directory.
!unzip bike-sharing-demand.zip -d data
Archive:  bike-sharing-demand.zip
  inflating: data/sampleSubmission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          
# Load the three competition CSVs that were extracted into data/.
sampleSubmission = pd.read_csv('data/sampleSubmission.csv')
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv') 
# The frames are held in memory from here on, so the extracted folder and the
# downloaded archive are removed again.
!rm -rf data
!rm bike-sharing-demand.zip

4. 기본전처리 및 분석 프로세스

- 전처리

def preprocessing(df_train,df_test):
    """Build the model-ready feature frames from the raw train/test frames.

    Steps applied (the inputs themselves are never modified):
      * train only: drop the 'casual' and 'registered' columns;
      * both: add 'hour' and 'weekday' columns derived from 'datetime';
      * both: drop the 'datetime' and 'atemp' columns.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain 'datetime', 'atemp', 'casual' and 'registered'.
    df_test : pandas.DataFrame
        Must contain 'datetime' and 'atemp'.

    Returns
    -------
    (df_train_featured, df_test_featured) : tuple of pandas.DataFrame
        New frames; 'hour' and 'weekday' are appended after the remaining
        original columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    def _with_time_features(frame):
        # Parse the timestamp column once, vectorized, instead of running an
        # element-wise apply(pd.to_datetime) separately for every derived
        # feature.
        # NOTE: vectorized parsing expects one consistent timestamp format
        # within the column (pandas >= 2 may reject mixed formats).
        stamps = pd.to_datetime(frame['datetime'])
        # assign() returns a new frame, so the caller's frame is left intact.
        enriched = frame.assign(hour=stamps.dt.hour, weekday=stamps.dt.weekday)
        return enriched.drop(columns=['datetime', 'atemp'])

    # drop() also returns a new frame, so df_train is not mutated here either.
    df_train_featured = _with_time_features(
        df_train.drop(columns=['casual', 'registered'])
    )
    df_test_featured = _with_time_features(df_test)
    return df_train_featured, df_test_featured

- 함수들

def plot(yhat,yyhat):
    """Overlay observed and predicted counts for the earliest rows of the data.

    Reads the module-level ``df_train`` and ``df_test`` frames, attaches the
    given predictions as ``count_hat``, tags each row with its origin
    ('train' / 'test'), and draws two line plots on the current figure:
    the observed ``count`` (thin dashed) and ``count_hat`` (thick,
    semi-transparent), both coloured by ``dataset_type``.

    Parameters
    ----------
    yhat : predictions aligned with ``df_train``.
    yyhat : predictions aligned with ``df_test``.
    """
    train_part = df_train.assign(count_hat = yhat, dataset_type = 'train')
    test_part = df_test.assign(count_hat = yyhat, dataset_type = 'test')
    combined = pd.concat([train_part, test_part])
    combined['datetime'] = pd.to_datetime(combined['datetime'])
    # Only the first 24*28 rows in time order are drawn; both curves share
    # exactly this window.
    window = combined.sort_values('datetime')[:(24*28)]
    # Observed values.
    sns.lineplot(
        window,
        x='datetime',y='count',
        hue='dataset_type',
        linestyle='--',
        lw=0.8
    )
    # Predicted values.
    sns.lineplot(
        window,
        x='datetime',y='count_hat',
        hue='dataset_type',
        alpha=0.5,
        lw=3
    )
    figure = plt.gcf()
    figure.set_size_inches(8,2)
    plt.xticks(rotation=15)
    figure.show()
def submit(yyhat):
    sampleSubmission['count'] = yyhat 
    sampleSubmission['count'] = sampleSubmission['count'].apply(lambda x: x if x>0 else 0)
    sampleSubmission.to_csv("submission.csv",index=False)
    !kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "Message"
    !rm submission.csv
def auto(df_train, df_test):
    """Run the whole pipeline: preprocess, fit/predict, plot, and submit.

    The target ``count`` is modelled on the log1p scale; the predictions
    returned by ``fit_predict`` are mapped back with expm1 before they are
    plotted and submitted.

    Parameters
    ----------
    df_train, df_test : raw train / test frames handed to ``preprocessing``.
    """
    # Feature engineering, then move the target to the log1p scale.
    train_feats, test_feats = preprocessing(df_train, df_test)
    train_feats['count'] = np.log1p(train_feats['count'])
    # Fit and predict with whichever fit_predict is currently defined.
    fitted_log, predicted_log = fit_predict(train_feats, test_feats)
    # Undo the log1p transform.
    yhat, yyhat = np.expm1(fitted_log), np.expm1(predicted_log)
    # Visualise, then submit.
    plot(yhat, yyhat)
    submit(yyhat)

5. 하이퍼파라메터 튜닝

- 기본 HP

# Default hyperparameter dictionary, shown for reference only (it is not
# assigned to a name). Each key is a model-type code; a list value holds
# several configurations for that model type, and "ag_args" carries the
# model-name suffix and, where given, the problem types an entry applies to.
# NOTE(review): presumably AutoGluon Tabular's default preset — confirm
# against the installed AutoGluon version.
{
    "NN_TORCH": {},
    "GBM": [
        {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        {},
        "GBMLarge"
    ],
    "CAT": {},
    "XGB": {},
    "FASTAI": {},
    "RF": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "XT": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "KNN": [
        {"weights": "uniform", "ag_args": {"name_suffix": "Unif"}},
        {"weights": "distance", "ag_args": {"name_suffix": "Dist"}}
    ]
}

- fit_predict 함수 수정

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor limited to one random-forest configuration.

    Only the "RF" entry with the squared-error criterion is passed as
    hyperparameters. After fitting, the leaderboard is displayed.

    Parameters
    ----------
    df_train_featured : training frame containing the label column 'count'.
    df_test_featured : test frame to predict on.

    Returns
    -------
    (yhat, yyhat) : predictions for the training frame and the test frame.
    """
    # The single random-forest configuration to train.
    rf_mse = {
        "criterion": "squared_error",
        "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]},
    }
    # NOTE(review): verbosity=False is kept as in the original; it behaves
    # like level 0 — consider passing 0 explicitly.
    model = TabularPredictor(label='count',verbosity=False)
    model.fit(df_train_featured, hyperparameters={"RF": [rf_mse]})
    # Predict on both frames before showing the leaderboard.
    train_pred = model.predict(df_train_featured)
    test_pred = model.predict(df_test_featured)
    display(model.leaderboard())
    return train_pred, test_pred
# Run the full pipeline (preprocess -> fit/predict -> plot -> Kaggle submit)
# with the fit_predict defined just above.
auto(df_train,df_test)
                 model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      RandomForestMSE  -0.401983       0.032302  1.019892                0.032302           1.019892            1       True          1
1  WeightedEnsemble_L2  -0.401983       0.032602  1.022945                0.000299           0.003053            2       True          2
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 102kB/s]
Successfully submitted to Bike Sharing Demand
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForestMSE -0.401983 0.032302 1.019892 0.032302 1.019892 1 True 1
1 WeightedEnsemble_L2 -0.401983 0.032602 1.022945 0.000299 0.003053 2 True 2

ref: https://auto.gluon.ai/0.8.1/api/autogluon.tabular.models.html

- 방금 돌린 것은 아래와 결과가 동일함. (RF의 n_estimators 기본값이 300이므로, 이를 명시해도 같은 모형이 적합된다.)

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor with one random forest, n_estimators set to 300.

    Identical to the single-RF setup except that ``n_estimators`` is written
    out explicitly in the configuration. After fitting, the leaderboard is
    displayed.

    Parameters
    ----------
    df_train_featured : training frame containing the label column 'count'.
    df_test_featured : test frame to predict on.

    Returns
    -------
    (yhat, yyhat) : predictions for the training frame and the test frame.
    """
    # Same forest as before, with the tree count spelled out.
    rf_mse_300 = {
        "n_estimators":300,
        "criterion": "squared_error",
        "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]},
    }
    model = TabularPredictor(label='count',verbosity=False)
    model.fit(df_train_featured, hyperparameters={"RF": [rf_mse_300]})
    # Predict on both frames before showing the leaderboard.
    train_pred = model.predict(df_train_featured)
    test_pred = model.predict(df_test_featured)
    display(model.leaderboard())
    return train_pred, test_pred
# Re-run the full pipeline with the fit_predict that sets n_estimators=300
# explicitly.
auto(df_train,df_test)
                 model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      RandomForestMSE  -0.401983       0.031664  0.914731                0.031664           0.914731            1       True          1
1  WeightedEnsemble_L2  -0.401983       0.031927  0.917581                0.000263           0.002850            2       True          2
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 123kB/s]
Successfully submitted to Bike Sharing Demand
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForestMSE -0.401983 0.031664 0.914731 0.031664 0.914731 1 True 1
1 WeightedEnsemble_L2 -0.401983 0.031927 0.917581 0.000263 0.002850 2 True 2

- 알아낸 방법?

# Fit the single-RF predictor at module level (outside fit_predict) so the
# fitted object stays available for inspection afterwards.
df_train_featured, df_test_featured = preprocessing(df_train,df_test)
predictr= TabularPredictor(label='count',verbosity=False)
# step3 
# No n_estimators is given here, so the model falls back to its own setting.
hp = {
    "RF": [
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ]
}
predictr.fit(
    df_train_featured,
    hyperparameters = hp
)
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f33b3eedbe0>
# Look up the hyperparameters recorded for the fitted 'RandomForestMSE' model.
predictr.info()['model_info']['RandomForestMSE']['hyperparameters']
{'n_estimators': 300,
 'max_leaf_nodes': 15000,
 'n_jobs': -1,
 'random_state': 0,
 'bootstrap': True,
 'criterion': 'squared_error'}

- RF에서 더 다양한 파라메터를 실험해보자.

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor over a small grid of random-forest settings.

    One "RF" configuration is created for every combination of
    n_estimators in [300, 400, 500] and max_leaf_nodes in [10000, 15000];
    each model name gets the suffix "(n_estimators,max_leaf_nodes)". After
    fitting, the leaderboard is displayed.

    Parameters
    ----------
    df_train_featured : training frame containing the label column 'count'.
    df_test_featured : test frame to predict on.

    Returns
    -------
    (yhat, yyhat) : predictions for the training frame and the test frame.
    """
    # Build the grid with n_estimators as the outer loop and max_leaf_nodes
    # as the inner loop, so the configurations keep their original order.
    rf_grid = []
    for n_trees in [300,400,500]:
        for leaf_cap in [10000,15000]:
            rf_grid.append({
                "criterion": "squared_error",
                "n_estimators":n_trees,
                "max_leaf_nodes":leaf_cap,
                "ag_args": {"name_suffix": f"({n_trees},{leaf_cap})"},
            })
    model = TabularPredictor(label='count',verbosity=False)
    model.fit(df_train_featured, hyperparameters={"RF": rf_grid})
    # Predict on both frames before showing the leaderboard.
    train_pred = model.predict(df_train_featured)
    test_pred = model.predict(df_test_featured)
    display(model.leaderboard())
    return train_pred, test_pred
# Run the full pipeline with the random-forest grid version of fit_predict.
auto(df_train,df_test)
                     model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  RandomForest(500,10000)  -0.401733       0.058357  2.050068                0.058357           2.050068            1       True          5
1      WeightedEnsemble_L2  -0.401733       0.058645  2.149383                0.000288           0.099315            2       True          7
2  RandomForest(500,15000)  -0.401733       0.061309  2.016577                0.061309           2.016577            1       True          6
3  RandomForest(300,10000)  -0.401983       0.031878  1.039235                0.031878           1.039235            1       True          1
4  RandomForest(300,15000)  -0.401983       0.032004  1.069615                0.032004           1.069615            1       True          2
5  RandomForest(400,15000)  -0.402192       0.040670  1.483031                0.040670           1.483031            1       True          4
6  RandomForest(400,10000)  -0.402192       0.041378  1.327342                0.041378           1.327342            1       True          3
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 242k/242k [00:01<00:00, 152kB/s]
Successfully submitted to Bike Sharing Demand
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForest(500,10000) -0.401733 0.058357 2.050068 0.058357 2.050068 1 True 5
1 WeightedEnsemble_L2 -0.401733 0.058645 2.149383 0.000288 0.099315 2 True 7
2 RandomForest(500,15000) -0.401733 0.061309 2.016577 0.061309 2.016577 1 True 6
3 RandomForest(300,10000) -0.401983 0.031878 1.039235 0.031878 1.039235 1 True 1
4 RandomForest(300,15000) -0.401983 0.032004 1.069615 0.032004 1.069615 1 True 2
5 RandomForest(400,15000) -0.402192 0.040670 1.483031 0.040670 1.483031 1 True 4
6 RandomForest(400,10000) -0.402192 0.041378 1.327342 0.041378 1.327342 1 True 3