import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import pickle
#---#
from autogluon.multimodal import MultiModalPredictor # from autogluon.tabular import TabularPredictor
#---#
import warnings
warnings.filterwarnings('ignore')
ref
import
data
!kaggle competitions download -c santander-product-recommendation
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
santander-product-recommendation.zip: Skipping, found more recently modified local copy (use --force to force download)
!unzip santander-product-recommendation.zip -d data
Archive: santander-product-recommendation.zip
inflating: data/sample_submission.csv.zip
inflating: data/test_ver2.csv.zip
inflating: data/train_ver2.csv.zip
df_train = pd.read_csv('data/train_ver2.csv.zip')
df_test = pd.read_csv('data/test_ver2.csv.zip')
sample_submission = pd.read_csv('data/sample_submission.csv.zip')
!rm -rf data
df_train.head()
 | fecha_dato | ncodpers | ind_empleado | pais_residencia | sexo | age | fecha_alta | ind_nuevo | antiguedad | indrel | ... | ind_hip_fin_ult1 | ind_plan_fin_ult1 | ind_pres_fin_ult1 | ind_reca_fin_ult1 | ind_tjcr_fin_ult1 | ind_valo_fin_ult1 | ind_viv_fin_ult1 | ind_nomina_ult1 | ind_nom_pens_ult1 | ind_recibo_ult1 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-28 | 1375586 | N | ES | H | 35 | 2015-01-12 | 0.0 | 6 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
1 | 2015-01-28 | 1050611 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
2 | 2015-01-28 | 1050612 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
3 | 2015-01-28 | 1050613 | N | ES | H | 22 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
4 | 2015-01-28 | 1050614 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
5 rows × 48 columns
Data preprocessing
prods = df_train.columns[24:].tolist()
df_train[prods] = df_train[prods].fillna(0.0).astype(np.int8)
- Remove customers who own none of the 24 products.
no_product = df_train[prods].sum(axis=1) == 0
df_train = df_train[~no_product]
Indexing with the negated mask, df_train[~mask], drops those rows.
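For illustration, the same boolean-mask filtering on a tiny hypothetical frame:
demo = pd.DataFrame({'a': [1, 0], 'b': [0, 0]})   # hypothetical toy data
owns_nothing = demo[['a', 'b']].sum(axis=1) == 0  # True only for the second row
print(demo[~owns_nothing])                        # keeps only the first row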
- Merge train and test (tr/ts).
for col in df_train.columns[24:]:
    df_test[col] = 0
df = pd.concat([df_train, df_test], axis=0)
features = []
categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'nomprov', 'segmento']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols
df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

df['antiguedad'].replace(' NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)

df['renta'].replace(' NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)

df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)

features += ['age', 'antiguedad', 'renta', 'ind_nuevo', 'indrel', 'indrel_1mes', 'ind_actividad_cliente']
- Extract month/year from the date variables.
df['fecha_alta_month'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month', 'fecha_alta_year']

df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']
- Fill the remaining missing values with -99.
df.fillna(-99, inplace=True)
- Convert dates to integers: 2015-01-28 → 1, 2015-02-28 → 2, …, 2016-06-28 → 18.
def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")]
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date
df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)
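A quick sanity check of the mapping described above:
assert date_to_int('2015-01-28') == 1   # January 2015 maps to 1
assert date_to_int('2016-06-28') == 18  # June 2016 maps to 18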
df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers', 'int_date'] else col for col in df.columns]
df_lag['int_date'] += 1
df_trn = df.merge(df_lag, on=['ncodpers', 'int_date'], how='left')
- If last month's product information is missing, fill it with 0.
for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
df_trn.fillna(-99, inplace=True)
- Add the lag-1 variables to the feature list.
features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]
Train/validation/test split
use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']
del df_trn
- Extract the new-purchase examples from the training data.
X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y
vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]
Training
XGBoost hyperparameters:
- max_depth: maximum depth of the tree model
- eta: learning rate
- colsample_bytree: fraction of features sampled per tree, usually 0.6~0.9
- colsample_bylevel: fraction of features sampled at each level of the tree

- Parameter settings
param = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
}
X_trn = XY_trn[features].to_numpy()
Y_trn = XY_trn['y'].to_numpy()
dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)
X_vld = XY_vld[features].to_numpy()
Y_vld = XY_vld['y'].to_numpy()
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)
[17:18:26] WARNING: ../src/learner.cc:767:
Parameters: { "silent" } are not used.
[0] train-mlogloss:2.68947 eval-mlogloss:2.70092
[1] train-mlogloss:2.45325 eval-mlogloss:2.46499
[2] train-mlogloss:2.27879 eval-mlogloss:2.29092
[3] train-mlogloss:2.14556 eval-mlogloss:2.16003
[4] train-mlogloss:2.03233 eval-mlogloss:2.04872
[5] train-mlogloss:1.93794 eval-mlogloss:1.95545
[6] train-mlogloss:1.85559 eval-mlogloss:1.87363
[7] train-mlogloss:1.78461 eval-mlogloss:1.80326
[8] train-mlogloss:1.72369 eval-mlogloss:1.74273
[9] train-mlogloss:1.67043 eval-mlogloss:1.69015
[10] train-mlogloss:1.62181 eval-mlogloss:1.64162
[11] train-mlogloss:1.57870 eval-mlogloss:1.59880
[12] train-mlogloss:1.54061 eval-mlogloss:1.56100
[13] train-mlogloss:1.50500 eval-mlogloss:1.52598
[14] train-mlogloss:1.47335 eval-mlogloss:1.49488
[15] train-mlogloss:1.44403 eval-mlogloss:1.46575
[16] train-mlogloss:1.41845 eval-mlogloss:1.44053
[17] train-mlogloss:1.39342 eval-mlogloss:1.41575
[18] train-mlogloss:1.37211 eval-mlogloss:1.39459
[19] train-mlogloss:1.35270 eval-mlogloss:1.37554
[20] train-mlogloss:1.33333 eval-mlogloss:1.35648
[21] train-mlogloss:1.31591 eval-mlogloss:1.33916
[22] train-mlogloss:1.30018 eval-mlogloss:1.32380
[23] train-mlogloss:1.28495 eval-mlogloss:1.30887
[24] train-mlogloss:1.27107 eval-mlogloss:1.29518
[25] train-mlogloss:1.25847 eval-mlogloss:1.28290
[26] train-mlogloss:1.24638 eval-mlogloss:1.27103
[27] train-mlogloss:1.23520 eval-mlogloss:1.26012
[28] train-mlogloss:1.22499 eval-mlogloss:1.25028
[29] train-mlogloss:1.21556 eval-mlogloss:1.24139
[30] train-mlogloss:1.20649 eval-mlogloss:1.23269
[31] train-mlogloss:1.19831 eval-mlogloss:1.22480
[32] train-mlogloss:1.19049 eval-mlogloss:1.21743
[33] train-mlogloss:1.18344 eval-mlogloss:1.21077
[34] train-mlogloss:1.17619 eval-mlogloss:1.20392
[35] train-mlogloss:1.16950 eval-mlogloss:1.19754
[36] train-mlogloss:1.16325 eval-mlogloss:1.19175
[37] train-mlogloss:1.15747 eval-mlogloss:1.18643
[38] train-mlogloss:1.15168 eval-mlogloss:1.18111
[39] train-mlogloss:1.14627 eval-mlogloss:1.17612
[40] train-mlogloss:1.14141 eval-mlogloss:1.17162
[41] train-mlogloss:1.13655 eval-mlogloss:1.16708
[42] train-mlogloss:1.13203 eval-mlogloss:1.16290
[43] train-mlogloss:1.12802 eval-mlogloss:1.15938
[44] train-mlogloss:1.12406 eval-mlogloss:1.15580
[45] train-mlogloss:1.12017 eval-mlogloss:1.15236
[46] train-mlogloss:1.11651 eval-mlogloss:1.14907
[47] train-mlogloss:1.11301 eval-mlogloss:1.14614
[48] train-mlogloss:1.10974 eval-mlogloss:1.14323
[49] train-mlogloss:1.10664 eval-mlogloss:1.14053
[50] train-mlogloss:1.10371 eval-mlogloss:1.13796
[51] train-mlogloss:1.10103 eval-mlogloss:1.13574
[52] train-mlogloss:1.09843 eval-mlogloss:1.13355
[53] train-mlogloss:1.09584 eval-mlogloss:1.13136
[54] train-mlogloss:1.09342 eval-mlogloss:1.12943
[55] train-mlogloss:1.09101 eval-mlogloss:1.12737
[56] train-mlogloss:1.08873 eval-mlogloss:1.12562
[57] train-mlogloss:1.08636 eval-mlogloss:1.12386
[58] train-mlogloss:1.08435 eval-mlogloss:1.12223
[59] train-mlogloss:1.08241 eval-mlogloss:1.12074
[60] train-mlogloss:1.08043 eval-mlogloss:1.11916
[61] train-mlogloss:1.07846 eval-mlogloss:1.11763
[62] train-mlogloss:1.07669 eval-mlogloss:1.11620
[63] train-mlogloss:1.07490 eval-mlogloss:1.11491
[64] train-mlogloss:1.07326 eval-mlogloss:1.11365
[65] train-mlogloss:1.07174 eval-mlogloss:1.11257
[66] train-mlogloss:1.07017 eval-mlogloss:1.11150
[67] train-mlogloss:1.06875 eval-mlogloss:1.11046
[68] train-mlogloss:1.06726 eval-mlogloss:1.10943
[69] train-mlogloss:1.06587 eval-mlogloss:1.10842
[70] train-mlogloss:1.06443 eval-mlogloss:1.10761
[71] train-mlogloss:1.06320 eval-mlogloss:1.10675
[72] train-mlogloss:1.06210 eval-mlogloss:1.10597
[73] train-mlogloss:1.06084 eval-mlogloss:1.10515
[74] train-mlogloss:1.05970 eval-mlogloss:1.10451
[75] train-mlogloss:1.05853 eval-mlogloss:1.10374
[76] train-mlogloss:1.05740 eval-mlogloss:1.10305
[77] train-mlogloss:1.05628 eval-mlogloss:1.10243
[78] train-mlogloss:1.05520 eval-mlogloss:1.10186
[79] train-mlogloss:1.05409 eval-mlogloss:1.10132
[80] train-mlogloss:1.05311 eval-mlogloss:1.10074
[81] train-mlogloss:1.05214 eval-mlogloss:1.10026
[82] train-mlogloss:1.05108 eval-mlogloss:1.09970
[83] train-mlogloss:1.05016 eval-mlogloss:1.09912
[84] train-mlogloss:1.04908 eval-mlogloss:1.09859
[85] train-mlogloss:1.04827 eval-mlogloss:1.09812
[86] train-mlogloss:1.04739 eval-mlogloss:1.09769
[87] train-mlogloss:1.04654 eval-mlogloss:1.09727
[88] train-mlogloss:1.04570 eval-mlogloss:1.09689
[89] train-mlogloss:1.04478 eval-mlogloss:1.09649
[90] train-mlogloss:1.04391 eval-mlogloss:1.09613
[91] train-mlogloss:1.04309 eval-mlogloss:1.09579
[92] train-mlogloss:1.04230 eval-mlogloss:1.09548
[93] train-mlogloss:1.04152 eval-mlogloss:1.09519
[94] train-mlogloss:1.04072 eval-mlogloss:1.09484
[95] train-mlogloss:1.04003 eval-mlogloss:1.09456
[96] train-mlogloss:1.03926 eval-mlogloss:1.09429
[97] train-mlogloss:1.03849 eval-mlogloss:1.09404
[98] train-mlogloss:1.03778 eval-mlogloss:1.09376
[99] train-mlogloss:1.03714 eval-mlogloss:1.09352
[100] train-mlogloss:1.03645 eval-mlogloss:1.09329
[101] train-mlogloss:1.03578 eval-mlogloss:1.09307
[102] train-mlogloss:1.03522 eval-mlogloss:1.09283
[103] train-mlogloss:1.03463 eval-mlogloss:1.09256
[104] train-mlogloss:1.03393 eval-mlogloss:1.09233
[105] train-mlogloss:1.03315 eval-mlogloss:1.09210
[106] train-mlogloss:1.03244 eval-mlogloss:1.09192
[107] train-mlogloss:1.03189 eval-mlogloss:1.09173
[108] train-mlogloss:1.03125 eval-mlogloss:1.09155
[109] train-mlogloss:1.03053 eval-mlogloss:1.09139
[110] train-mlogloss:1.02985 eval-mlogloss:1.09124
[111] train-mlogloss:1.02925 eval-mlogloss:1.09107
[112] train-mlogloss:1.02862 eval-mlogloss:1.09089
[113] train-mlogloss:1.02805 eval-mlogloss:1.09078
[114] train-mlogloss:1.02734 eval-mlogloss:1.09064
[115] train-mlogloss:1.02655 eval-mlogloss:1.09049
[116] train-mlogloss:1.02593 eval-mlogloss:1.09034
[117] train-mlogloss:1.02528 eval-mlogloss:1.09021
[118] train-mlogloss:1.02473 eval-mlogloss:1.09011
[119] train-mlogloss:1.02409 eval-mlogloss:1.09003
[120] train-mlogloss:1.02350 eval-mlogloss:1.08995
[121] train-mlogloss:1.02297 eval-mlogloss:1.08986
[122] train-mlogloss:1.02245 eval-mlogloss:1.08976
[123] train-mlogloss:1.02182 eval-mlogloss:1.08969
[124] train-mlogloss:1.02116 eval-mlogloss:1.08958
[125] train-mlogloss:1.02049 eval-mlogloss:1.08951
[126] train-mlogloss:1.01983 eval-mlogloss:1.08939
[127] train-mlogloss:1.01928 eval-mlogloss:1.08928
[128] train-mlogloss:1.01866 eval-mlogloss:1.08921
[129] train-mlogloss:1.01810 eval-mlogloss:1.08906
[130] train-mlogloss:1.01736 eval-mlogloss:1.08893
[131] train-mlogloss:1.01683 eval-mlogloss:1.08883
[132] train-mlogloss:1.01634 eval-mlogloss:1.08878
[133] train-mlogloss:1.01579 eval-mlogloss:1.08873
[134] train-mlogloss:1.01526 eval-mlogloss:1.08865
[135] train-mlogloss:1.01460 eval-mlogloss:1.08852
[136] train-mlogloss:1.01403 eval-mlogloss:1.08848
[137] train-mlogloss:1.01352 eval-mlogloss:1.08843
[138] train-mlogloss:1.01279 eval-mlogloss:1.08834
[139] train-mlogloss:1.01231 eval-mlogloss:1.08829
[140] train-mlogloss:1.01172 eval-mlogloss:1.08824
[141] train-mlogloss:1.01118 eval-mlogloss:1.08820
[142] train-mlogloss:1.01062 eval-mlogloss:1.08812
[143] train-mlogloss:1.01010 eval-mlogloss:1.08803
[144] train-mlogloss:1.00950 eval-mlogloss:1.08799
[145] train-mlogloss:1.00900 eval-mlogloss:1.08794
[146] train-mlogloss:1.00853 eval-mlogloss:1.08789
[147] train-mlogloss:1.00807 eval-mlogloss:1.08784
[148] train-mlogloss:1.00755 eval-mlogloss:1.08781
[149] train-mlogloss:1.00691 eval-mlogloss:1.08776
[150] train-mlogloss:1.00629 eval-mlogloss:1.08774
[151] train-mlogloss:1.00562 eval-mlogloss:1.08766
[152] train-mlogloss:1.00517 eval-mlogloss:1.08763
[153] train-mlogloss:1.00449 eval-mlogloss:1.08754
[154] train-mlogloss:1.00404 eval-mlogloss:1.08752
[155] train-mlogloss:1.00352 eval-mlogloss:1.08747
[156] train-mlogloss:1.00294 eval-mlogloss:1.08742
[157] train-mlogloss:1.00235 eval-mlogloss:1.08735
[158] train-mlogloss:1.00190 eval-mlogloss:1.08737
[159] train-mlogloss:1.00144 eval-mlogloss:1.08731
[160] train-mlogloss:1.00084 eval-mlogloss:1.08728
[161] train-mlogloss:1.00024 eval-mlogloss:1.08726
[162] train-mlogloss:0.99963 eval-mlogloss:1.08723
[163] train-mlogloss:0.99917 eval-mlogloss:1.08726
[164] train-mlogloss:0.99848 eval-mlogloss:1.08727
[165] train-mlogloss:0.99792 eval-mlogloss:1.08723
[166] train-mlogloss:0.99743 eval-mlogloss:1.08721
[167] train-mlogloss:0.99683 eval-mlogloss:1.08718
[168] train-mlogloss:0.99622 eval-mlogloss:1.08719
[169] train-mlogloss:0.99559 eval-mlogloss:1.08715
[170] train-mlogloss:0.99502 eval-mlogloss:1.08717
[171] train-mlogloss:0.99459 eval-mlogloss:1.08714
[172] train-mlogloss:0.99398 eval-mlogloss:1.08707
[173] train-mlogloss:0.99335 eval-mlogloss:1.08699
[174] train-mlogloss:0.99277 eval-mlogloss:1.08697
[175] train-mlogloss:0.99213 eval-mlogloss:1.08695
[176] train-mlogloss:0.99156 eval-mlogloss:1.08695
[177] train-mlogloss:0.99105 eval-mlogloss:1.08690
[178] train-mlogloss:0.99045 eval-mlogloss:1.08688
[179] train-mlogloss:0.98992 eval-mlogloss:1.08686
[180] train-mlogloss:0.98923 eval-mlogloss:1.08683
[181] train-mlogloss:0.98876 eval-mlogloss:1.08680
[182] train-mlogloss:0.98814 eval-mlogloss:1.08679
[183] train-mlogloss:0.98757 eval-mlogloss:1.08675
[184] train-mlogloss:0.98707 eval-mlogloss:1.08678
[185] train-mlogloss:0.98656 eval-mlogloss:1.08680
[186] train-mlogloss:0.98600 eval-mlogloss:1.08678
[187] train-mlogloss:0.98555 eval-mlogloss:1.08679
[188] train-mlogloss:0.98491 eval-mlogloss:1.08678
[189] train-mlogloss:0.98429 eval-mlogloss:1.08675
[190] train-mlogloss:0.98373 eval-mlogloss:1.08672
[191] train-mlogloss:0.98317 eval-mlogloss:1.08676
[192] train-mlogloss:0.98268 eval-mlogloss:1.08670
[193] train-mlogloss:0.98213 eval-mlogloss:1.08667
[194] train-mlogloss:0.98147 eval-mlogloss:1.08663
[195] train-mlogloss:0.98090 eval-mlogloss:1.08659
[196] train-mlogloss:0.98040 eval-mlogloss:1.08655
[197] train-mlogloss:0.97987 eval-mlogloss:1.08656
[198] train-mlogloss:0.97937 eval-mlogloss:1.08653
[199] train-mlogloss:0.97892 eval-mlogloss:1.08649
[200] train-mlogloss:0.97848 eval-mlogloss:1.08648
[201] train-mlogloss:0.97792 eval-mlogloss:1.08652
[202] train-mlogloss:0.97742 eval-mlogloss:1.08652
[203] train-mlogloss:0.97700 eval-mlogloss:1.08652
[204] train-mlogloss:0.97654 eval-mlogloss:1.08652
[205] train-mlogloss:0.97612 eval-mlogloss:1.08653
[206] train-mlogloss:0.97572 eval-mlogloss:1.08654
[207] train-mlogloss:0.97514 eval-mlogloss:1.08648
[208] train-mlogloss:0.97472 eval-mlogloss:1.08645
[209] train-mlogloss:0.97415 eval-mlogloss:1.08638
[210] train-mlogloss:0.97366 eval-mlogloss:1.08635
[211] train-mlogloss:0.97321 eval-mlogloss:1.08637
[212] train-mlogloss:0.97274 eval-mlogloss:1.08634
[213] train-mlogloss:0.97227 eval-mlogloss:1.08636
[214] train-mlogloss:0.97182 eval-mlogloss:1.08637
[215] train-mlogloss:0.97142 eval-mlogloss:1.08640
[216] train-mlogloss:0.97105 eval-mlogloss:1.08637
[217] train-mlogloss:0.97060 eval-mlogloss:1.08630
[218] train-mlogloss:0.97028 eval-mlogloss:1.08634
[219] train-mlogloss:0.96980 eval-mlogloss:1.08627
[220] train-mlogloss:0.96934 eval-mlogloss:1.08625
[221] train-mlogloss:0.96883 eval-mlogloss:1.08628
[222] train-mlogloss:0.96837 eval-mlogloss:1.08630
[223] train-mlogloss:0.96800 eval-mlogloss:1.08635
[224] train-mlogloss:0.96752 eval-mlogloss:1.08638
[225] train-mlogloss:0.96716 eval-mlogloss:1.08639
[226] train-mlogloss:0.96686 eval-mlogloss:1.08639
[227] train-mlogloss:0.96648 eval-mlogloss:1.08641
[228] train-mlogloss:0.96591 eval-mlogloss:1.08638
[229] train-mlogloss:0.96536 eval-mlogloss:1.08637
[230] train-mlogloss:0.96486 eval-mlogloss:1.08638
[231] train-mlogloss:0.96440 eval-mlogloss:1.08636
[232] train-mlogloss:0.96395 eval-mlogloss:1.08633
[233] train-mlogloss:0.96350 eval-mlogloss:1.08635
[234] train-mlogloss:0.96280 eval-mlogloss:1.08639
[235] train-mlogloss:0.96212 eval-mlogloss:1.08630
[236] train-mlogloss:0.96175 eval-mlogloss:1.08636
[237] train-mlogloss:0.96135 eval-mlogloss:1.08637
[238] train-mlogloss:0.96081 eval-mlogloss:1.08640
[239] train-mlogloss:0.96025 eval-mlogloss:1.08643
[240] train-mlogloss:0.95976 eval-mlogloss:1.08641
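The cells below use best_ntree_limit, which is never assigned in this excerpt. Assuming an xgboost 1.x release (where the ntree_limit predict argument and the best_ntree_limit attribute still exist), it can be recovered from the early-stopped booster; a minimal sketch:
# Assumption: xgboost 1.x booster trained with early stopping.
# On xgboost >= 2.0, use model.best_iteration with iteration_range instead.
best_ntree_limit = model.best_ntree_limit
print('best_ntree_limit:', best_ntree_limit)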
# Preparation for the MAP@7 evaluation metric.
# Extract the customer identifiers.
vld = trn[trn['fecha_dato'] == vld_date]
ncodpers_vld = vld['ncodpers']
# Compute the new purchases in the validation data.
for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    vld[padd] = vld[prod] - vld[prev]
add_vld = vld[[prod + '_add' for prod in prods]].to_numpy()
add_vld_list = [list() for i in range(len(ncodpers_vld))]
# Store each customer's ground-truth new purchases in add_vld_list and the total count in count_vld.
count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[ncodper, prod] > 0:
            add_vld_list[ncodper].append(prod)
            count_vld += 1
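The mapk helper called below is neither defined nor imported in this notebook. A minimal MAP@K sketch matching the four-argument call used here (actual, predicted, k, and a default score for customers with no new purchases) could look like the following; the exact behavior of the original helper is an assumption:
def apk(actual, predicted, k=7, default=0.0):
    # Average precision at k for one customer; return `default` when the
    # customer has no actual new purchases (an assumption about the original helper).
    if len(actual) == 0:
        return default
    predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7, default=0.0):
    # Mean average precision at k over all customers.
    return np.mean([apk(a, p, k, default) for a, p in zip(actual, predicted)])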
# Compute in advance the maximum MAP@7 attainable on the validation data. (0.042663)
print(mapk(add_vld_list, add_vld_list, 7, 0.0))
# Compute predictions for the validation data.
# (vld has no 'y' column, and labels are not needed for prediction.)
X_vld = vld[features].to_numpy()
dvld = xgb.DMatrix(X_vld, feature_names=features)
preds_vld = model.predict(dvld, ntree_limit=best_ntree_limit)

# Products already owned last month cannot be new purchases, so subtract 1 from their probabilities up front.
preds_vld = preds_vld - vld[[prod + '_prev' for prod in prods]].to_numpy()
# Extract the top 7 predictions for the validation data.
result_vld = []
for ncodper, pred in zip(ncodpers_vld, preds_vld):
    y_prods = [(y, p, ip) for y, p, ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result_vld.append([ip for y, p, ip in y_prods])

# Compute the MAP@7 score on the validation data. (0.036466)
print(mapk(add_vld_list, result_vld, 7, 0.0))
# Retrain the XGBoost model on the full training data!
X_all = XY[features].to_numpy()
Y_all = XY['y'].to_numpy()
dall = xgb.DMatrix(X_all, label=Y_all, feature_names=features)
watch_list = [(dall, 'train')]

# Increase the number of trees in proportion to the larger amount of data.
best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))

# Retrain the XGBoost model!
model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list)
# Print the feature importances. Are the variables you expected near the top?
print("Feature importance:")
for kv in sorted([(k, v) for k, v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)
# Compute predictions on the test data for the Kaggle submission.
X_tst = tst[features].to_numpy()
dtst = xgb.DMatrix(X_tst, feature_names=features)
preds_tst = model.predict(dtst, ntree_limit=best_ntree_limit)
ncodpers_tst = tst['ncodpers'].to_numpy()
# As before, products already owned last month cannot be new purchases, so subtract 1 from their probabilities.
preds_tst = preds_tst - tst[[prod + '_prev' for prod in prods]].to_numpy()
# Write the submission file.
submit_file = open('../model/xgb.baseline.2015-06-28', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y, p, ip) for y, p, ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y, p, ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))
submit_file.close()