#!pip install autogluon.multimodal
14wk-60: 자전거대여 / 하이퍼파라메터 튜닝
최규빈
2023-12-01
1. 강의영상
2. Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing
#---#
from autogluon.tabular import TabularPredictor
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor
from autogluon.common import space
#---#
import IPython
import os
import warnings
# Silence all Python warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')
2023-12-10 17:19:01.469099: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-10 17:19:02.107172: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
3. Data
-
자료 다운로드
!kaggle competitions download -c bike-sharing-demand
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
Downloading bike-sharing-demand.zip to /home/coco/Dropbox/Class/STBDA23/posts
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 822kB/s]
100%|█████████████████████████████████████████| 189k/189k [00:00<00:00, 821kB/s]
!unzip bike-sharing-demand.zip -d data
Archive: bike-sharing-demand.zip
inflating: data/sampleSubmission.csv
inflating: data/test.csv
inflating: data/train.csv
# Load the three competition files extracted into ./data by the unzip step.
sampleSubmission = pd.read_csv('data/sampleSubmission.csv')
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
!rm -rf data
!rm bike-sharing-demand.zip
4. 기본전처리 및 분석 프로세스
-
전처리
def preprocessing(df_train, df_test):
    """Build model-ready feature frames from the raw train/test frames.

    Both inputs are copied, so the caller's frames are not modified.

    Steps:
      * drop ``casual`` and ``registered`` from the training frame only;
      * derive ``hour`` and ``weekday`` from the ``datetime`` column;
      * drop ``datetime`` and ``atemp`` from both frames.

    Args:
        df_train: training frame containing at least ``datetime``,
            ``atemp``, ``casual`` and ``registered``.
        df_test: test frame containing at least ``datetime`` and ``atemp``.

    Returns:
        Tuple ``(df_train_featured, df_test_featured)``.
    """
    df_train_featured = df_train.copy()
    df_test_featured = df_test.copy()
    #----#
    df_train_featured = df_train_featured.drop(['casual', 'registered'], axis=1)
    #--#
    # Parse the timestamps once per frame and reuse the result for both
    # derived features (vectorised; no per-element apply).
    train_dt = pd.to_datetime(df_train_featured['datetime'])
    test_dt = pd.to_datetime(df_test_featured['datetime'])
    df_train_featured['hour'] = train_dt.dt.hour
    df_test_featured['hour'] = test_dt.dt.hour
    df_train_featured['weekday'] = train_dt.dt.weekday
    df_test_featured['weekday'] = test_dt.dt.weekday
    #--#
    df_train_featured = df_train_featured.drop(['datetime', 'atemp'], axis=1)
    df_test_featured = df_test_featured.drop(['datetime', 'atemp'], axis=1)
    return df_train_featured, df_test_featured
-
함수들
def plot(yhat, yyhat):
    """Overlay observed and predicted counts for the earliest 24*28 rows.

    Reads the module-level ``df_train`` and ``df_test`` frames, attaches the
    predictions as ``count_hat`` and a ``dataset_type`` label
    (``'train'`` / ``'test'``), and draws two seaborn line plots on the
    current figure: ``count`` (thin dashed) and ``count_hat`` (thick,
    semi-transparent), both coloured by ``dataset_type``.

    Args:
        yhat: predictions aligned with the rows of ``df_train``.
        yyhat: predictions aligned with the rows of ``df_test``.
    """
    df = pd.concat([
        df_train.assign(count_hat=yhat, dataset_type='train'),
        df_test.assign(count_hat=yyhat, dataset_type='test'),
    ])
    df['datetime'] = pd.to_datetime(df['datetime'])
    # Sort and slice once; both curves use the same window of rows.
    window = df.sort_values('datetime')[:(24 * 28)]
    sns.lineplot(
        data=window,
        x='datetime', y='count',
        hue='dataset_type',
        linestyle='--',
        lw=0.8,
    )
    sns.lineplot(
        data=window,
        x='datetime', y='count_hat',
        hue='dataset_type',
        alpha=0.5,
        lw=3,
    )
    fig = plt.gcf()
    fig.set_size_inches(8, 2)
    plt.xticks(rotation=15)
    fig.show()
def submit(yyhat):
'count'] = yyhat
sampleSubmission['count'] = sampleSubmission['count'].apply(lambda x: x if x>0 else 0)
sampleSubmission["submission.csv",index=False)
sampleSubmission.to_csv(!kaggle competitions submit -c bike-sharing-demand -f submission.csv -m "Message"
!rm submission.csv
def auto(df_train, df_test):
    """Run the whole pipeline: preprocess, fit/predict, plot, submit.

    The target ``count`` is modelled on the ``log1p`` scale; predictions
    returned by ``fit_predict`` are mapped back with ``expm1`` before they
    are plotted and submitted. Uses whichever ``fit_predict`` is currently
    defined at module level.

    Args:
        df_train: raw training frame.
        df_test: raw test frame.
    """
    # step1
    df_train_featured, df_test_featured = preprocessing(df_train, df_test)  # preprocessing
    df_train_featured['count'] = np.log1p(df_train_featured['count'])  # transform
    # step2~4
    yhat, yyhat = fit_predict(df_train_featured, df_test_featured)
    yhat = np.expm1(yhat)  # inverse_trans
    yyhat = np.expm1(yyhat)  # inverse_trans
    # visualization
    plot(yhat, yyhat)
    # submission
    submit(yyhat)
5. 하이퍼파라메터 튜닝
-
기본 HP
# Default hyperparameter configuration, shown for reference only
# (the cell evaluates the literal; nothing is assigned).
{
    "NN_TORCH": {},
    "GBM": [
        {"extra_trees": True, "ag_args": {"name_suffix": "XT"}},
        {},
        "GBMLarge"
    ],
    "CAT": {},
    "XGB": {},
    "FASTAI": {},
    "RF": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "XT": [
        {"criterion": "gini", "ag_args": {"name_suffix": "Gini", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "entropy", "ag_args": {"name_suffix": "Entr", "problem_types": ["binary", "multiclass"]}},
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ],
    "KNN": [
        {"weights": "uniform", "ag_args": {"name_suffix": "Unif"}},
        {"weights": "distance", "ag_args": {"name_suffix": "Dist"}}
    ]
}
-
fit_predict 함수 수정
def fit_predict(df_train_featured, df_test_featured):
    """Fit a TabularPredictor restricted to one random-forest config and predict.

    Only the ``"RF"`` entry with ``criterion="squared_error"`` is passed as
    ``hyperparameters``. The leaderboard is displayed as a side effect.

    Args:
        df_train_featured: preprocessed training frame including the
            ``count`` label column.
        df_test_featured: preprocessed test frame.

    Returns:
        Tuple ``(yhat, yyhat)``: predictions on the training frame and on
        the test frame.
    """
    # step1
    # step2
    predictr = TabularPredictor(label='count',verbosity=False)
    # step3
    hp = {
        "RF": [
            {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
        ]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters = hp
    )
    # step4
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat
# Run the full pipeline (preprocess, fit/predict, plot, submit) with the
# fit_predict defined immediately above.
auto(df_train,df_test)
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForestMSE -0.401983 0.032302 1.019892 0.032302 1.019892 1 True 1
1 WeightedEnsemble_L2 -0.401983 0.032602 1.022945 0.000299 0.003053 2 True 2
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 102kB/s]
Successfully submitted to Bike Sharing Demand
model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order | |
---|---|---|---|---|---|---|---|---|---|
0 | RandomForestMSE | -0.401983 | 0.032302 | 1.019892 | 0.032302 | 1.019892 | 1 | True | 1 |
1 | WeightedEnsemble_L2 | -0.401983 | 0.032602 | 1.022945 | 0.000299 | 0.003053 | 2 | True | 2 |
ref: https://auto.gluon.ai/0.8.1/api/autogluon.tabular.models.html
- LightGBM model: https://lightgbm.readthedocs.io/en/latest/
- CatBoost model: https://catboost.ai/
- XGBoost model: https://xgboost.readthedocs.io/en/latest/
- Random Forest model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
- Extra Trees model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
- Linear model (scikit-learn): https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
-
방금 돌린것은 아래와 결과가 동일함.
def fit_predict(df_train_featured, df_test_featured):
    """Fit a single random-forest config with ``n_estimators=300`` and predict.

    Same as the single-RF variant, but ``n_estimators`` is spelled out
    explicitly in the ``"RF"`` hyperparameter entry. The leaderboard is
    displayed as a side effect.

    Args:
        df_train_featured: preprocessed training frame including the
            ``count`` label column.
        df_test_featured: preprocessed test frame.

    Returns:
        Tuple ``(yhat, yyhat)``: predictions on the training frame and on
        the test frame.
    """
    # step1
    # step2
    predictr = TabularPredictor(label='count',verbosity=False)
    # step3
    hp = {
        "RF": [
            {"n_estimators":300, "criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
        ]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters = hp
    )
    # step4
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat
# Re-run the full pipeline with the fit_predict redefined immediately above
# (n_estimators given explicitly).
auto(df_train,df_test)
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForestMSE -0.401983 0.031664 0.914731 0.031664 0.914731 1 True 1
1 WeightedEnsemble_L2 -0.401983 0.031927 0.917581 0.000263 0.002850 2 True 2
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 243k/243k [00:02<00:00, 123kB/s]
Successfully submitted to Bike Sharing Demand
model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order | |
---|---|---|---|---|---|---|---|---|---|
0 | RandomForestMSE | -0.401983 | 0.031664 | 0.914731 | 0.031664 | 0.914731 | 1 | True | 1 |
1 | WeightedEnsemble_L2 | -0.401983 | 0.031927 | 0.917581 | 0.000263 | 0.002850 | 2 | True | 2 |
-
알아낸 방법?
# Fit the single-RF configuration outside of fit_predict so the fitted
# predictor stays available at module level for inspection.
df_train_featured, df_test_featured = preprocessing(df_train,df_test)
predictr = TabularPredictor(label='count',verbosity=False)
# step3
hp = {
    "RF": [
        {"criterion": "squared_error", "ag_args": {"name_suffix": "MSE", "problem_types": ["regression"]}}
    ]
}
predictr.fit(
    df_train_featured,
    hyperparameters = hp
)
<autogluon.tabular.predictor.predictor.TabularPredictor at 0x7f33b3eedbe0>
# Look up the hyperparameters recorded for the fitted RandomForestMSE model.
predictr.info()['model_info']['RandomForestMSE']['hyperparameters']
{'n_estimators': 300,
'max_leaf_nodes': 15000,
'n_jobs': -1,
'random_state': 0,
'bootstrap': True,
'criterion': 'squared_error'}
-
RF에서 더 다양한 파라메터를 실험해보자.
def fit_predict(df_train_featured, df_test_featured):
    """Fit a small grid of random-forest configs and predict.

    One ``"RF"`` entry is created per combination of ``n_estimators`` in
    ``[300, 400, 500]`` and ``max_leaf_nodes`` in ``[10000, 15000]``; each
    gets the name suffix ``"(n_estimators,max_leaf_nodes)"``. The
    leaderboard is displayed as a side effect.

    Args:
        df_train_featured: preprocessed training frame including the
            ``count`` label column.
        df_test_featured: preprocessed test frame.

    Returns:
        Tuple ``(yhat, yyhat)``: predictions on the training frame and on
        the test frame.
    """
    # step1
    # step2
    predictr = TabularPredictor(label='count',verbosity=False)
    # step3
    hp = {
        "RF": [
            {"criterion": "squared_error", "n_estimators":i, "max_leaf_nodes":j, "ag_args": {"name_suffix": f"({i},{j})"}}
            for i in [300,400,500]
            for j in [10000,15000]
        ]
    }
    predictr.fit(
        df_train_featured,
        hyperparameters = hp
    )
    # step4
    yhat = predictr.predict(df_train_featured)
    yyhat = predictr.predict(df_test_featured)
    # display
    display(predictr.leaderboard())
    return yhat, yyhat
# Re-run the full pipeline with the random-forest grid version of
# fit_predict defined immediately above.
auto(df_train,df_test)
model score_val pred_time_val fit_time pred_time_val_marginal fit_time_marginal stack_level can_infer fit_order
0 RandomForest(500,10000) -0.401733 0.058357 2.050068 0.058357 2.050068 1 True 5
1 WeightedEnsemble_L2 -0.401733 0.058645 2.149383 0.000288 0.099315 2 True 7
2 RandomForest(500,15000) -0.401733 0.061309 2.016577 0.061309 2.016577 1 True 6
3 RandomForest(300,10000) -0.401983 0.031878 1.039235 0.031878 1.039235 1 True 1
4 RandomForest(300,15000) -0.401983 0.032004 1.069615 0.032004 1.069615 1 True 2
5 RandomForest(400,15000) -0.402192 0.040670 1.483031 0.040670 1.483031 1 True 4
6 RandomForest(400,10000) -0.402192 0.041378 1.327342 0.041378 1.327342 1 True 3
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
100%|█████████████████████████████████████████| 242k/242k [00:01<00:00, 152kB/s]
Successfully submitted to Bike Sharing Demand
model | score_val | pred_time_val | fit_time | pred_time_val_marginal | fit_time_marginal | stack_level | can_infer | fit_order | |
---|---|---|---|---|---|---|---|---|---|
0 | RandomForest(500,10000) | -0.401733 | 0.058357 | 2.050068 | 0.058357 | 2.050068 | 1 | True | 5 |
1 | WeightedEnsemble_L2 | -0.401733 | 0.058645 | 2.149383 | 0.000288 | 0.099315 | 2 | True | 7 |
2 | RandomForest(500,15000) | -0.401733 | 0.061309 | 2.016577 | 0.061309 | 2.016577 | 1 | True | 6 |
3 | RandomForest(300,10000) | -0.401983 | 0.031878 | 1.039235 | 0.031878 | 1.039235 | 1 | True | 1 |
4 | RandomForest(300,15000) | -0.401983 | 0.032004 | 1.069615 | 0.032004 | 1.069615 | 1 | True | 2 |
5 | RandomForest(400,15000) | -0.402192 | 0.040670 | 1.483031 | 0.040670 | 1.483031 | 1 | True | 4 |
6 | RandomForest(400,10000) | -0.402192 | 0.041378 | 1.327342 | 0.041378 | 1.327342 | 1 | True | 3 |