14wk-60: 자전거대여 / 하이퍼파라메터 튜닝

최규빈
2023-12-01

1. 강의영상

2. Imports

#!pip install autogluon

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
#---#
from autogluon.tabular import TabularPredictor
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from autogluon.common import space
#---#
import IPython
import os
import warnings
warnings.filterwarnings('ignore')

2023-12-10 17:19:01.469099: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-10 17:19:02.107172: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT

3. Data

- 자료 다운로드

!kaggle competitions download -c bike-sharing-demand

Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
Downloading bike-sharing-demand.zip to /home/coco/Dropbox/Class/STBDA23/posts
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 822kB/s]
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 821kB/s]

!unzip bike-sharing-demand.zip -d data

Archive:  bike-sharing-demand.zip
  inflating: data/sampleSubmission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv

# Load the three competition tables extracted into ./data by the unzip step.
# The reads are independent of one another, so they are done in one unpacking.
df_train, df_test, sampleSubmission = map(
    pd.read_csv,
    ('data/train.csv', 'data/test.csv', 'data/sampleSubmission.csv'),
)

!rm -rf data
!rm bike-sharing-demand.zip

4. 기본전처리 및 분석 프로세스

- 전처리

def preprocessing(df_train, df_test):
    """Build model-ready feature frames from the raw train/test tables.

    For both frames the ``datetime`` column is parsed and replaced by two
    derived integer columns, ``hour`` and ``weekday`` (appended at the end),
    and the ``atemp`` column is dropped.  The training frame additionally
    loses ``casual`` and ``registered``.

    The timestamps are parsed once per frame with the vectorized
    ``pd.to_datetime`` instead of an element-wise ``Series.apply``; this also
    makes the function work on empty frames, where the element-wise variant
    yields an object Series without a ``.dt`` accessor.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain ``datetime``, ``atemp``, ``casual`` and ``registered``.
    df_test : pandas.DataFrame
        Must contain ``datetime`` and ``atemp``.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(df_train_featured, df_test_featured)``; the inputs are not modified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Parse each datetime column exactly once and reuse the result.
    train_stamps = pd.to_datetime(df_train['datetime'])
    test_stamps = pd.to_datetime(df_test['datetime'])

    # drop() returns a new frame, so the caller's frames stay untouched.
    df_train_featured = (
        df_train
        .drop(['casual', 'registered', 'datetime', 'atemp'], axis=1)
        .assign(hour=train_stamps.dt.hour, weekday=train_stamps.dt.weekday)
    )
    df_test_featured = (
        df_test
        .drop(['datetime', 'atemp'], axis=1)
        .assign(hour=test_stamps.dt.hour, weekday=test_stamps.dt.weekday)
    )
    return df_train_featured, df_test_featured

- 함수들

def plot(yhat,yyhat):
    """Overlay observed and predicted counts for the earliest part of the data.

    Reads the module-level ``df_train`` and ``df_test`` frames, attaches
    ``yhat`` / ``yyhat`` to them as ``count_hat``, tags each row with its
    ``dataset_type`` ('train' or 'test'), and draws the first ``24*28`` rows
    (after sorting by ``datetime``) as two line plots on the current figure:
    thin dashed lines for ``count`` and thick translucent lines for
    ``count_hat``.

    Parameters
    ----------
    yhat : predictions to attach to ``df_train``.
    yyhat : predictions to attach to ``df_test``.
    """
    labelled_train = df_train.assign(count_hat = yhat, dataset_type = 'train')
    labelled_test = df_test.assign(count_hat = yyhat, dataset_type = 'test')
    combined = pd.concat([labelled_train, labelled_test])
    combined['datetime'] = pd.to_datetime(combined['datetime'])

    # Both layers use the same chronological window, so build it once.
    # NOTE(review): 24*28 presumably means 28 days of hourly rows -- confirm.
    window = combined.sort_values('datetime')[:(24*28)]

    # Layer 1: observed values.
    sns.lineplot(
        data=window,
        x='datetime',
        y='count',
        hue='dataset_type',
        linestyle='--',
        lw=0.8,
    )
    # Layer 2: model output.
    sns.lineplot(
        data=window,
        x='datetime',
        y='count_hat',
        hue='dataset_type',
        alpha=0.5,
        lw=3,
    )

    figure = plt.gcf()
    figure.set_size_inches(8,2)
    plt.xticks(rotation=15)
    figure.show()

def submit(yyhat):
    sampleSubmission['count'] = yyhat 
    sampleSubmission['count'] = sampleSubmission['count'].apply(lambda x: x if x>0 else 0)
    sampleSubmission.to_csv("submission.csv",index=False)
    !kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "Message"
    !rm submission.csv

def auto(df_train, df_test):
    """Run the whole pipeline: preprocess, fit, predict, plot and submit.

    The target ``count`` is modelled on the ``log1p`` scale and the
    predictions are mapped back with ``expm1`` before plotting and submitting.
    ``fit_predict`` is looked up when this function is called, so whichever
    definition of it was executed most recently is the one that runs.

    Parameters
    ----------
    df_train : raw training frame passed to ``preprocessing``.
    df_test : raw test frame passed to ``preprocessing``.
    """
    # Feature engineering, then move the target to the log1p scale.
    train_features, test_features = preprocessing(df_train, df_test)
    train_features['count'] = np.log1p(train_features['count'])

    # Fit and predict on the transformed scale.
    fitted_log, predicted_log = fit_predict(train_features, test_features)

    # Undo the log1p transform.
    fitted = np.expm1(fitted_log)
    predicted = np.expm1(predicted_log)

    # Visualize, then submit the test predictions.
    plot(fitted, predicted)
    submit(predicted)

5. 하이퍼파라메터 튜닝

- 기본 HP

# Hyperparameter dictionary shown under the "default HP" heading.
# Keys are model-family codes; each value is either a single option dict or a
# list of configurations for that family.  An empty dict supplies no overrides.
# Inside "ag_args", "name_suffix" labels the configuration and "problem_types"
# lists the problem types the entry is tagged with.
{
    "NN_TORCH": {},
    "GBM": [
        {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        {},
        "GBMLarge"
    ],
    "CAT": {},
    "XGB": {},
    "FASTAI": {},
    "RF": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "XT": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "KNN": [
        {"weights": "uniform", "ag_args": {"name_suffix": "Unif"}},
        {"weights": "distance", "ag_args": {"name_suffix": "Dist"}}
    ]
}

- fit_predict 함수 수정

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor that is given only one RF configuration.

    The ``hyperparameters`` argument lists a single "RF" entry using the
    ``squared_error`` criterion.  After fitting, the leaderboard is displayed.

    Parameters
    ----------
    df_train_featured : training frame containing the ``count`` label.
    df_test_featured : test frame to predict on.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the training frame and for the
        test frame, in that order.
    """
    # Model specification: one random-forest entry, nothing else.
    rf_config = {
        "criterion": "squared_error",
        "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]},
    }
    model_spec = {"RF": [rf_config]}

    # Create the predictor and learn from the training frame.
    predictor = TabularPredictor(label='count', verbosity=False)
    predictor.fit(df_train_featured, hyperparameters=model_spec)

    # Predict on both frames.
    train_pred = predictor.predict(df_train_featured)
    test_pred = predictor.predict(df_test_featured)

    # Show how the fitted models scored.
    display(predictor.leaderboard())
    return train_pred, test_pred

# Run the full pipeline (preprocess -> fit -> plot -> Kaggle submit); auto()
# picks up the fit_predict definition that was executed most recently.
auto(df_train,df_test)

                 model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      RandomForestMSE  -0.401983       0.032302  1.019892                0.032302           1.019892            1       True          1
1  WeightedEnsemble_L2  -0.401983       0.032602  1.022945                0.000299           0.003053            2       True          2
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 102kB/s]
Successfully submitted to Bike Sharing Demand

	model	score_val	pred_time_val	fit_time	pred_time_val_marginal	fit_time_marginal	stack_level	can_infer	fit_order
0	RandomForestMSE	-0.401983	0.032302	1.019892	0.032302	1.019892	1	True	1
1	WeightedEnsemble_L2	-0.401983	0.032602	1.022945	0.000299	0.003053	2	True	2

ref: https://auto.gluon.ai/0.8.1/api/autogluon.tabular.models.html

LightGBM model: https://lightgbm.readthedocs.io/en/latest/
CatBoost model: https://catboost.ai/
XGBoost model: https://xgboost.readthedocs.io/en/latest/
Random Forest model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
Extra Trees model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
Linear model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

- 방금 돌린 것은 아래와 결과가 동일함.

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor with one RF entry that sets ``n_estimators``.

    Same single-"RF" specification with the ``squared_error`` criterion, but
    ``n_estimators`` is written out explicitly as 300.  After fitting, the
    leaderboard is displayed.

    Parameters
    ----------
    df_train_featured : training frame containing the ``count`` label.
    df_test_featured : test frame to predict on.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the training frame and for the
        test frame, in that order.
    """
    # Model specification: one random-forest entry with an explicit tree count.
    rf_config = {
        "n_estimators": 300,
        "criterion": "squared_error",
        "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]},
    }
    model_spec = {"RF": [rf_config]}

    # Create the predictor and learn from the training frame.
    predictor = TabularPredictor(label='count', verbosity=False)
    predictor.fit(df_train_featured, hyperparameters=model_spec)

    # Predict on both frames.
    train_pred = predictor.predict(df_train_featured)
    test_pred = predictor.predict(df_test_featured)

    # Show how the fitted models scored.
    display(predictor.leaderboard())
    return train_pred, test_pred

# Re-run the pipeline; auto() now uses the fit_predict redefined just above
# (the variant that sets n_estimators=300 explicitly).
auto(df_train,df_test)

                 model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      RandomForestMSE  -0.401983       0.031664  0.914731                0.031664           0.914731            1       True          1
1  WeightedEnsemble_L2  -0.401983       0.031927  0.917581                0.000263           0.002850            2       True          2
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 123kB/s]
Successfully submitted to Bike Sharing Demand

	model	score_val	pred_time_val	fit_time	pred_time_val_marginal	fit_time_marginal	stack_level	can_infer	fit_order
0	RandomForestMSE	-0.401983	0.031664	0.914731	0.031664	0.914731	1	True	1
1	WeightedEnsemble_L2	-0.401983	0.031927	0.917581	0.000263	0.002850	2	True	2

- 기본값(`n_estimators=300`)을 알아낸 방법?

# Fit once at module level (outside auto()) so the predictor object stays
# available for inspection in the following cells.
df_train_featured, df_test_featured = preprocessing(df_train,df_test)

# Predictor for the 'count' label.
predictr= TabularPredictor(label='count',verbosity=False)
# Fit with a single RF configuration that does not set n_estimators.
hp = {
    "RF": [
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ]
}
predictr.fit(
    df_train_featured,
    hyperparameters = hp
)

<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f33b3eedbe0>

# Look up the hyperparameters recorded for the 'RandomForestMSE' model in the
# predictor's info dictionary.
predictr.info()['model_info']['RandomForestMSE']['hyperparameters']

{'n_estimators': 300,
 'max_leaf_nodes': 15000,
 'n_jobs': -1,
 'random_state': 0,
 'bootstrap': True,
 'criterion': 'squared_error'}

- RF에서 더 다양한 파라메터를 실험해보자.

def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor over a small grid of RF settings.

    One "RF" configuration is created for every combination of
    ``n_estimators`` in [300, 400, 500] and ``max_leaf_nodes`` in
    [10000, 15000]; each gets the name suffix ``(n_estimators,max_leaf_nodes)``.
    After fitting, the leaderboard is displayed.

    Parameters
    ----------
    df_train_featured : training frame containing the ``count`` label.
    df_test_featured : test frame to predict on.

    Returns
    -------
    tuple
        ``(yhat, yyhat)``: predictions for the training frame and for the
        test frame, in that order.
    """
    # Enumerate the grid with n_estimators as the outer loop so the
    # configurations are listed in the same order as before.
    rf_grid = []
    for i in [300,400,500]:
        for j in [10000,15000]:
            rf_grid.append({
                "criterion": "squared_error",
                "n_estimators": i,
                "max_leaf_nodes": j,
                "ag_args": {"name_suffix": f"({i},{j})"},
            })
    model_spec = {"RF": rf_grid}

    # Create the predictor and learn from the training frame.
    predictor = TabularPredictor(label='count', verbosity=False)
    predictor.fit(df_train_featured, hyperparameters=model_spec)

    # Predict on both frames.
    train_pred = predictor.predict(df_train_featured)
    test_pred = predictor.predict(df_test_featured)

    # Show how the fitted models scored.
    display(predictor.leaderboard())
    return train_pred, test_pred

# Re-run the pipeline with the grid-search fit_predict defined just above.
auto(df_train,df_test)

                     model  score_val  pred_time_val  fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0  RandomForest(500,10000)  -0.401733       0.058357  2.050068                0.058357           2.050068            1       True          5
1      WeightedEnsemble_L2  -0.401733       0.058645  2.149383                0.000288           0.099315            2       True          7
2  RandomForest(500,15000)  -0.401733       0.061309  2.016577                0.061309           2.016577            1       True          6
3  RandomForest(300,10000)  -0.401983       0.031878  1.039235                0.031878           1.039235            1       True          1
4  RandomForest(300,15000)  -0.401983       0.032004  1.069615                0.032004           1.069615            1       True          2
5  RandomForest(400,15000)  -0.402192       0.040670  1.483031                0.040670           1.483031            1       True          4
6  RandomForest(400,10000)  -0.402192       0.041378  1.327342                0.041378           1.327342            1       True          3
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 242k/242k [00:01<00:00, 152kB/s]
Successfully submitted to Bike Sharing Demand

	model	score_val	pred_time_val	fit_time	pred_time_val_marginal	fit_time_marginal	stack_level	can_infer	fit_order
0	RandomForest(500,10000)	-0.401733	0.058357	2.050068	0.058357	2.050068	1	True	5
1	WeightedEnsemble_L2	-0.401733	0.058645	2.149383	0.000288	0.099315	2	True	7
2	RandomForest(500,15000)	-0.401733	0.061309	2.016577	0.061309	2.016577	1	True	6
3	RandomForest(300,10000)	-0.401983	0.031878	1.039235	0.031878	1.039235	1	True	1
4	RandomForest(300,15000)	-0.401983	0.032004	1.069615	0.032004	1.069615	1	True	2
5	RandomForest(400,15000)	-0.402192	0.040670	1.483031	0.040670	1.483031	1	True	4
6	RandomForest(400,10000)	-0.402192	0.041378	1.327342	0.041378	1.327342	1	True	3