[Kaggle] 02. Santander Product Recommendation Competition

Author

김보람

Published

December 19, 2023

ref

[Kaggle] Santander Product Recommendation Competition

kaggle 우승작으로 배우는 머신러닝 탐구생활

import

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import pickle

import warnings
warnings.filterwarnings('ignore')

data

!kaggle competitions download -c santander-product-recommendation
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
santander-product-recommendation.zip: Skipping, found more recently modified local copy (use --force to force download)
!unzip santander-product-recommendation.zip -d data 
Archive:  santander-product-recommendation.zip
  inflating: data/sample_submission.csv.zip  
  inflating: data/test_ver2.csv.zip  
  inflating: data/train_ver2.csv.zip  
df_train = pd.read_csv('data/train_ver2.csv.zip')
df_test = pd.read_csv('data/test_ver2.csv.zip')
sample_submission = pd.read_csv('data/sample_submission.csv.zip')
!rm -rf data
df_train.head()
fecha_dato ncodpers ind_empleado pais_residencia sexo age fecha_alta ind_nuevo antiguedad indrel ... ind_hip_fin_ult1 ind_plan_fin_ult1 ind_pres_fin_ult1 ind_reca_fin_ult1 ind_tjcr_fin_ult1 ind_valo_fin_ult1 ind_viv_fin_ult1 ind_nomina_ult1 ind_nom_pens_ult1 ind_recibo_ult1
0 2015-01-28 1375586 N ES H 35 2015-01-12 0.0 6 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
1 2015-01-28 1050611 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
2 2015-01-28 1050612 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
3 2015-01-28 1050613 N ES H 22 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0
4 2015-01-28 1050614 N ES V 23 2012-08-10 0.0 35 1.0 ... 0 0 0 0 0 0 0 0.0 0.0 0

5 rows × 48 columns

Data preprocessing

prods = df_train.columns[24:].tolist()
df_train[prods] = df_train[prods].fillna(0.0).astype(np.int8)

- Drop customers who do not hold any of the 24 products

no_product = df_train[prods].sum(axis=1) == 0
df_train = df_train[~no_product]

Indexing with df_train[~mask] keeps only the rows where the mask is False, i.e. it drops the flagged rows!

- Concatenate train and test

for col in df_train.columns[24:]:
    df_test[col] = 0
df = pd.concat([df_train,df_test], axis=0)
features = []
categorical_cols = ['ind_empleado', 'pais_residencia', 'sexo', 'tiprel_1mes', 'indresi', 'indext', 'conyuemp', 'canal_entrada', 'indfall', 'tipodom', 'nomprov', 'segmento']
for col in categorical_cols:
    df[col], _ = df[col].factorize(na_sentinel=-99)
features += categorical_cols
df['age'].replace(' NA', -99, inplace=True)
df['age'] = df['age'].astype(np.int8)

df['antiguedad'].replace('     NA', -99, inplace=True)
df['antiguedad'] = df['antiguedad'].astype(np.int8)

df['renta'].replace('         NA', -99, inplace=True)
df['renta'].fillna(-99, inplace=True)
df['renta'] = df['renta'].astype(float).astype(np.int8)

df['indrel_1mes'].replace('P', 5, inplace=True)
df['indrel_1mes'].fillna(-99, inplace=True)
df['indrel_1mes'] = df['indrel_1mes'].astype(float).astype(np.int8)
features += ['age','antiguedad','renta','ind_nuevo','indrel','indrel_1mes','ind_actividad_cliente']

- Extract month and year from the date columns

df['fecha_alta_month'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['fecha_alta_year'] = df['fecha_alta'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['fecha_alta_month','fecha_alta_year']
df['ult_fec_cli_1t_month'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[1])).astype(np.int8)
df['ult_fec_cli_1t_year'] = df['ult_fec_cli_1t'].map(lambda x: 0.0 if x.__class__ is float else float(x.split('-')[0])).astype(np.int16)
features += ['ult_fec_cli_1t_month', 'ult_fec_cli_1t_year']

- Fill the remaining missing values with -99

df.fillna(-99, inplace=True)

- Convert dates to sequential integers

  • 2015-01-28:1, 2015-02-28:2, …, 2016-06-28:18
def date_to_int(str_date):
    Y, M, D = [int(a) for a in str_date.strip().split("-")]
    int_date = (int(Y) - 2015) * 12 + int(M)
    return int_date
df['int_date'] = df['fecha_dato'].map(date_to_int).astype(np.int8)
df_lag = df.copy()
df_lag.columns = [col + '_prev' if col not in ['ncodpers', 'int_date'] else col for col in df.columns]
df_lag['int_date'] += 1
df_trn = df.merge(df_lag, on=['ncodpers', 'int_date'], how='left')
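To see why adding 1 to int_date in the copied frame joins each row with the same customer's record from the previous month, here is a toy sketch (hypothetical data with a single made-up product column, not from the competition):

toy = pd.DataFrame({
    'ncodpers': [1, 1, 1],
    'int_date': [1, 2, 3],   # Jan, Feb, Mar 2015
    'prod_a':   [0, 1, 1],   # product-holding flag
})
toy_lag = toy.copy()
toy_lag.columns = [c + '_prev' if c not in ['ncodpers', 'int_date'] else c for c in toy.columns]
toy_lag['int_date'] += 1     # January's holdings are re-labelled with int_date 2, February's with 3, ...
print(toy.merge(toy_lag, on=['ncodpers', 'int_date'], how='left'))
#    ncodpers  int_date  prod_a  prod_a_prev
# 0         1         1       0          NaN   <- no previous-month record exists
# 1         1         2       1          0.0   <- February row sees January's holdings
# 2         1         3       1          1.0   <- March row sees February's holdings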

- If last month's product information is missing, fill it with 0

for prod in prods:
    prev = prod + '_prev'
    df_trn[prev].fillna(0, inplace=True)
df_trn.fillna(-99, inplace=True)

- Add the lag-1 features

features += [feature + '_prev' for feature in features]
features += [prod + '_prev' for prod in prods]

tr/test/val

use_dates = ['2016-01-28', '2016-02-28', '2016-03-28', '2016-04-28', '2016-05-28']
trn = df_trn[df_trn['fecha_dato'].isin(use_dates)]
tst = df_trn[df_trn['fecha_dato'] == '2016-06-28']
del df_trn

- Extract new-purchase records from the training data

X = []
Y = []
for i, prod in enumerate(prods):
    prev = prod + '_prev'
    prX = trn[(trn[prod] == 1) & (trn[prev] == 0)]
    prY = np.zeros(prX.shape[0], dtype=np.int8) + i
    X.append(prX)
    Y.append(prY)
XY = pd.concat(X)
Y = np.hstack(Y)
XY['y'] = Y
vld_date = '2016-05-28'
XY_trn = XY[XY['fecha_dato'] != vld_date]
XY_vld = XY[XY['fecha_dato'] == vld_date]

Training

XGBoost

max_depth: maximum depth of the trees

eta: learning rate

colsample_bytree: fraction of features sampled per tree, usually 0.6–0.9

colsample_bylevel: fraction of features sampled at each tree level

- Parameter settings

param = {
    'booster': 'gbtree',
    'max_depth': 8,
    'nthread': 4,
    'num_class': len(prods),
    'objective': 'multi:softprob',
    'silent': 1,
    'eval_metric': 'mlogloss',
    'eta': 0.1,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.9,
    'seed': 2018,
    }
X_trn = XY_trn[features].to_numpy()
Y_trn = XY_trn['y'].to_numpy()
dtrn = xgb.DMatrix(X_trn, label=Y_trn, feature_names=features)
X_vld = XY_vld[features].to_numpy()
Y_vld = XY_vld['y'].to_numpy()
dvld = xgb.DMatrix(X_vld, label=Y_vld, feature_names=features)
watch_list = [(dtrn, 'train'), (dvld, 'eval')]
model = xgb.train(param, dtrn, num_boost_round=1000, evals=watch_list, early_stopping_rounds=20)
[17:18:26] WARNING: ../src/learner.cc:767: 
Parameters: { "silent" } are not used.

[0] train-mlogloss:2.68947  eval-mlogloss:2.70092
[1] train-mlogloss:2.45325  eval-mlogloss:2.46499
[2] train-mlogloss:2.27879  eval-mlogloss:2.29092
[3] train-mlogloss:2.14556  eval-mlogloss:2.16003
[4] train-mlogloss:2.03233  eval-mlogloss:2.04872
[5] train-mlogloss:1.93794  eval-mlogloss:1.95545
[6] train-mlogloss:1.85559  eval-mlogloss:1.87363
[7] train-mlogloss:1.78461  eval-mlogloss:1.80326
[8] train-mlogloss:1.72369  eval-mlogloss:1.74273
[9] train-mlogloss:1.67043  eval-mlogloss:1.69015
[10]    train-mlogloss:1.62181  eval-mlogloss:1.64162
[11]    train-mlogloss:1.57870  eval-mlogloss:1.59880
[12]    train-mlogloss:1.54061  eval-mlogloss:1.56100
[13]    train-mlogloss:1.50500  eval-mlogloss:1.52598
[14]    train-mlogloss:1.47335  eval-mlogloss:1.49488
[15]    train-mlogloss:1.44403  eval-mlogloss:1.46575
[16]    train-mlogloss:1.41845  eval-mlogloss:1.44053
[17]    train-mlogloss:1.39342  eval-mlogloss:1.41575
[18]    train-mlogloss:1.37211  eval-mlogloss:1.39459
[19]    train-mlogloss:1.35270  eval-mlogloss:1.37554
[20]    train-mlogloss:1.33333  eval-mlogloss:1.35648
[21]    train-mlogloss:1.31591  eval-mlogloss:1.33916
[22]    train-mlogloss:1.30018  eval-mlogloss:1.32380
[23]    train-mlogloss:1.28495  eval-mlogloss:1.30887
[24]    train-mlogloss:1.27107  eval-mlogloss:1.29518
[25]    train-mlogloss:1.25847  eval-mlogloss:1.28290
[26]    train-mlogloss:1.24638  eval-mlogloss:1.27103
[27]    train-mlogloss:1.23520  eval-mlogloss:1.26012
[28]    train-mlogloss:1.22499  eval-mlogloss:1.25028
[29]    train-mlogloss:1.21556  eval-mlogloss:1.24139
[30]    train-mlogloss:1.20649  eval-mlogloss:1.23269
[31]    train-mlogloss:1.19831  eval-mlogloss:1.22480
[32]    train-mlogloss:1.19049  eval-mlogloss:1.21743
[33]    train-mlogloss:1.18344  eval-mlogloss:1.21077
[34]    train-mlogloss:1.17619  eval-mlogloss:1.20392
[35]    train-mlogloss:1.16950  eval-mlogloss:1.19754
[36]    train-mlogloss:1.16325  eval-mlogloss:1.19175
[37]    train-mlogloss:1.15747  eval-mlogloss:1.18643
[38]    train-mlogloss:1.15168  eval-mlogloss:1.18111
[39]    train-mlogloss:1.14627  eval-mlogloss:1.17612
[40]    train-mlogloss:1.14141  eval-mlogloss:1.17162
[41]    train-mlogloss:1.13655  eval-mlogloss:1.16708
[42]    train-mlogloss:1.13203  eval-mlogloss:1.16290
[43]    train-mlogloss:1.12802  eval-mlogloss:1.15938
[44]    train-mlogloss:1.12406  eval-mlogloss:1.15580
[45]    train-mlogloss:1.12017  eval-mlogloss:1.15236
[46]    train-mlogloss:1.11651  eval-mlogloss:1.14907
[47]    train-mlogloss:1.11301  eval-mlogloss:1.14614
[48]    train-mlogloss:1.10974  eval-mlogloss:1.14323
[49]    train-mlogloss:1.10664  eval-mlogloss:1.14053
[50]    train-mlogloss:1.10371  eval-mlogloss:1.13796
[51]    train-mlogloss:1.10103  eval-mlogloss:1.13574
[52]    train-mlogloss:1.09843  eval-mlogloss:1.13355
[53]    train-mlogloss:1.09584  eval-mlogloss:1.13136
[54]    train-mlogloss:1.09342  eval-mlogloss:1.12943
[55]    train-mlogloss:1.09101  eval-mlogloss:1.12737
[56]    train-mlogloss:1.08873  eval-mlogloss:1.12562
[57]    train-mlogloss:1.08636  eval-mlogloss:1.12386
[58]    train-mlogloss:1.08435  eval-mlogloss:1.12223
[59]    train-mlogloss:1.08241  eval-mlogloss:1.12074
[60]    train-mlogloss:1.08043  eval-mlogloss:1.11916
[61]    train-mlogloss:1.07846  eval-mlogloss:1.11763
[62]    train-mlogloss:1.07669  eval-mlogloss:1.11620
[63]    train-mlogloss:1.07490  eval-mlogloss:1.11491
[64]    train-mlogloss:1.07326  eval-mlogloss:1.11365
[65]    train-mlogloss:1.07174  eval-mlogloss:1.11257
[66]    train-mlogloss:1.07017  eval-mlogloss:1.11150
[67]    train-mlogloss:1.06875  eval-mlogloss:1.11046
[68]    train-mlogloss:1.06726  eval-mlogloss:1.10943
[69]    train-mlogloss:1.06587  eval-mlogloss:1.10842
[70]    train-mlogloss:1.06443  eval-mlogloss:1.10761
[71]    train-mlogloss:1.06320  eval-mlogloss:1.10675
[72]    train-mlogloss:1.06210  eval-mlogloss:1.10597
[73]    train-mlogloss:1.06084  eval-mlogloss:1.10515
[74]    train-mlogloss:1.05970  eval-mlogloss:1.10451
[75]    train-mlogloss:1.05853  eval-mlogloss:1.10374
[76]    train-mlogloss:1.05740  eval-mlogloss:1.10305
[77]    train-mlogloss:1.05628  eval-mlogloss:1.10243
[78]    train-mlogloss:1.05520  eval-mlogloss:1.10186
[79]    train-mlogloss:1.05409  eval-mlogloss:1.10132
[80]    train-mlogloss:1.05311  eval-mlogloss:1.10074
[81]    train-mlogloss:1.05214  eval-mlogloss:1.10026
[82]    train-mlogloss:1.05108  eval-mlogloss:1.09970
[83]    train-mlogloss:1.05016  eval-mlogloss:1.09912
[84]    train-mlogloss:1.04908  eval-mlogloss:1.09859
[85]    train-mlogloss:1.04827  eval-mlogloss:1.09812
[86]    train-mlogloss:1.04739  eval-mlogloss:1.09769
[87]    train-mlogloss:1.04654  eval-mlogloss:1.09727
[88]    train-mlogloss:1.04570  eval-mlogloss:1.09689
[89]    train-mlogloss:1.04478  eval-mlogloss:1.09649
[90]    train-mlogloss:1.04391  eval-mlogloss:1.09613
[91]    train-mlogloss:1.04309  eval-mlogloss:1.09579
[92]    train-mlogloss:1.04230  eval-mlogloss:1.09548
[93]    train-mlogloss:1.04152  eval-mlogloss:1.09519
[94]    train-mlogloss:1.04072  eval-mlogloss:1.09484
[95]    train-mlogloss:1.04003  eval-mlogloss:1.09456
[96]    train-mlogloss:1.03926  eval-mlogloss:1.09429
[97]    train-mlogloss:1.03849  eval-mlogloss:1.09404
[98]    train-mlogloss:1.03778  eval-mlogloss:1.09376
[99]    train-mlogloss:1.03714  eval-mlogloss:1.09352
[100]   train-mlogloss:1.03645  eval-mlogloss:1.09329
[101]   train-mlogloss:1.03578  eval-mlogloss:1.09307
[102]   train-mlogloss:1.03522  eval-mlogloss:1.09283
[103]   train-mlogloss:1.03463  eval-mlogloss:1.09256
[104]   train-mlogloss:1.03393  eval-mlogloss:1.09233
[105]   train-mlogloss:1.03315  eval-mlogloss:1.09210
[106]   train-mlogloss:1.03244  eval-mlogloss:1.09192
[107]   train-mlogloss:1.03189  eval-mlogloss:1.09173
[108]   train-mlogloss:1.03125  eval-mlogloss:1.09155
[109]   train-mlogloss:1.03053  eval-mlogloss:1.09139
[110]   train-mlogloss:1.02985  eval-mlogloss:1.09124
[111]   train-mlogloss:1.02925  eval-mlogloss:1.09107
[112]   train-mlogloss:1.02862  eval-mlogloss:1.09089
[113]   train-mlogloss:1.02805  eval-mlogloss:1.09078
[114]   train-mlogloss:1.02734  eval-mlogloss:1.09064
[115]   train-mlogloss:1.02655  eval-mlogloss:1.09049
[116]   train-mlogloss:1.02593  eval-mlogloss:1.09034
[117]   train-mlogloss:1.02528  eval-mlogloss:1.09021
[118]   train-mlogloss:1.02473  eval-mlogloss:1.09011
[119]   train-mlogloss:1.02409  eval-mlogloss:1.09003
[120]   train-mlogloss:1.02350  eval-mlogloss:1.08995
[121]   train-mlogloss:1.02297  eval-mlogloss:1.08986
[122]   train-mlogloss:1.02245  eval-mlogloss:1.08976
[123]   train-mlogloss:1.02182  eval-mlogloss:1.08969
[124]   train-mlogloss:1.02116  eval-mlogloss:1.08958
[125]   train-mlogloss:1.02049  eval-mlogloss:1.08951
[126]   train-mlogloss:1.01983  eval-mlogloss:1.08939
[127]   train-mlogloss:1.01928  eval-mlogloss:1.08928
[128]   train-mlogloss:1.01866  eval-mlogloss:1.08921
[129]   train-mlogloss:1.01810  eval-mlogloss:1.08906
[130]   train-mlogloss:1.01736  eval-mlogloss:1.08893
[131]   train-mlogloss:1.01683  eval-mlogloss:1.08883
[132]   train-mlogloss:1.01634  eval-mlogloss:1.08878
[133]   train-mlogloss:1.01579  eval-mlogloss:1.08873
[134]   train-mlogloss:1.01526  eval-mlogloss:1.08865
[135]   train-mlogloss:1.01460  eval-mlogloss:1.08852
[136]   train-mlogloss:1.01403  eval-mlogloss:1.08848
[137]   train-mlogloss:1.01352  eval-mlogloss:1.08843
[138]   train-mlogloss:1.01279  eval-mlogloss:1.08834
[139]   train-mlogloss:1.01231  eval-mlogloss:1.08829
[140]   train-mlogloss:1.01172  eval-mlogloss:1.08824
[141]   train-mlogloss:1.01118  eval-mlogloss:1.08820
[142]   train-mlogloss:1.01062  eval-mlogloss:1.08812
[143]   train-mlogloss:1.01010  eval-mlogloss:1.08803
[144]   train-mlogloss:1.00950  eval-mlogloss:1.08799
[145]   train-mlogloss:1.00900  eval-mlogloss:1.08794
[146]   train-mlogloss:1.00853  eval-mlogloss:1.08789
[147]   train-mlogloss:1.00807  eval-mlogloss:1.08784
[148]   train-mlogloss:1.00755  eval-mlogloss:1.08781
[149]   train-mlogloss:1.00691  eval-mlogloss:1.08776
[150]   train-mlogloss:1.00629  eval-mlogloss:1.08774
[151]   train-mlogloss:1.00562  eval-mlogloss:1.08766
[152]   train-mlogloss:1.00517  eval-mlogloss:1.08763
[153]   train-mlogloss:1.00449  eval-mlogloss:1.08754
[154]   train-mlogloss:1.00404  eval-mlogloss:1.08752
[155]   train-mlogloss:1.00352  eval-mlogloss:1.08747
[156]   train-mlogloss:1.00294  eval-mlogloss:1.08742
[157]   train-mlogloss:1.00235  eval-mlogloss:1.08735
[158]   train-mlogloss:1.00190  eval-mlogloss:1.08737
[159]   train-mlogloss:1.00144  eval-mlogloss:1.08731
[160]   train-mlogloss:1.00084  eval-mlogloss:1.08728
[161]   train-mlogloss:1.00024  eval-mlogloss:1.08726
[162]   train-mlogloss:0.99963  eval-mlogloss:1.08723
[163]   train-mlogloss:0.99917  eval-mlogloss:1.08726
[164]   train-mlogloss:0.99848  eval-mlogloss:1.08727
[165]   train-mlogloss:0.99792  eval-mlogloss:1.08723
[166]   train-mlogloss:0.99743  eval-mlogloss:1.08721
[167]   train-mlogloss:0.99683  eval-mlogloss:1.08718
[168]   train-mlogloss:0.99622  eval-mlogloss:1.08719
[169]   train-mlogloss:0.99559  eval-mlogloss:1.08715
[170]   train-mlogloss:0.99502  eval-mlogloss:1.08717
[171]   train-mlogloss:0.99459  eval-mlogloss:1.08714
[172]   train-mlogloss:0.99398  eval-mlogloss:1.08707
[173]   train-mlogloss:0.99335  eval-mlogloss:1.08699
[174]   train-mlogloss:0.99277  eval-mlogloss:1.08697
[175]   train-mlogloss:0.99213  eval-mlogloss:1.08695
[176]   train-mlogloss:0.99156  eval-mlogloss:1.08695
[177]   train-mlogloss:0.99105  eval-mlogloss:1.08690
[178]   train-mlogloss:0.99045  eval-mlogloss:1.08688
[179]   train-mlogloss:0.98992  eval-mlogloss:1.08686
[180]   train-mlogloss:0.98923  eval-mlogloss:1.08683
[181]   train-mlogloss:0.98876  eval-mlogloss:1.08680
[182]   train-mlogloss:0.98814  eval-mlogloss:1.08679
[183]   train-mlogloss:0.98757  eval-mlogloss:1.08675
[184]   train-mlogloss:0.98707  eval-mlogloss:1.08678
[185]   train-mlogloss:0.98656  eval-mlogloss:1.08680
[186]   train-mlogloss:0.98600  eval-mlogloss:1.08678
[187]   train-mlogloss:0.98555  eval-mlogloss:1.08679
[188]   train-mlogloss:0.98491  eval-mlogloss:1.08678
[189]   train-mlogloss:0.98429  eval-mlogloss:1.08675
[190]   train-mlogloss:0.98373  eval-mlogloss:1.08672
[191]   train-mlogloss:0.98317  eval-mlogloss:1.08676
[192]   train-mlogloss:0.98268  eval-mlogloss:1.08670
[193]   train-mlogloss:0.98213  eval-mlogloss:1.08667
[194]   train-mlogloss:0.98147  eval-mlogloss:1.08663
[195]   train-mlogloss:0.98090  eval-mlogloss:1.08659
[196]   train-mlogloss:0.98040  eval-mlogloss:1.08655
[197]   train-mlogloss:0.97987  eval-mlogloss:1.08656
[198]   train-mlogloss:0.97937  eval-mlogloss:1.08653
[199]   train-mlogloss:0.97892  eval-mlogloss:1.08649
[200]   train-mlogloss:0.97848  eval-mlogloss:1.08648
[201]   train-mlogloss:0.97792  eval-mlogloss:1.08652
[202]   train-mlogloss:0.97742  eval-mlogloss:1.08652
[203]   train-mlogloss:0.97700  eval-mlogloss:1.08652
[204]   train-mlogloss:0.97654  eval-mlogloss:1.08652
[205]   train-mlogloss:0.97612  eval-mlogloss:1.08653
[206]   train-mlogloss:0.97572  eval-mlogloss:1.08654
[207]   train-mlogloss:0.97514  eval-mlogloss:1.08648
[208]   train-mlogloss:0.97472  eval-mlogloss:1.08645
[209]   train-mlogloss:0.97415  eval-mlogloss:1.08638
[210]   train-mlogloss:0.97366  eval-mlogloss:1.08635
[211]   train-mlogloss:0.97321  eval-mlogloss:1.08637
[212]   train-mlogloss:0.97274  eval-mlogloss:1.08634
[213]   train-mlogloss:0.97227  eval-mlogloss:1.08636
[214]   train-mlogloss:0.97182  eval-mlogloss:1.08637
[215]   train-mlogloss:0.97142  eval-mlogloss:1.08640
[216]   train-mlogloss:0.97105  eval-mlogloss:1.08637
[217]   train-mlogloss:0.97060  eval-mlogloss:1.08630
[218]   train-mlogloss:0.97028  eval-mlogloss:1.08634
[219]   train-mlogloss:0.96980  eval-mlogloss:1.08627
[220]   train-mlogloss:0.96934  eval-mlogloss:1.08625
[221]   train-mlogloss:0.96883  eval-mlogloss:1.08628
[222]   train-mlogloss:0.96837  eval-mlogloss:1.08630
[223]   train-mlogloss:0.96800  eval-mlogloss:1.08635
[224]   train-mlogloss:0.96752  eval-mlogloss:1.08638
[225]   train-mlogloss:0.96716  eval-mlogloss:1.08639
[226]   train-mlogloss:0.96686  eval-mlogloss:1.08639
[227]   train-mlogloss:0.96648  eval-mlogloss:1.08641
[228]   train-mlogloss:0.96591  eval-mlogloss:1.08638
[229]   train-mlogloss:0.96536  eval-mlogloss:1.08637
[230]   train-mlogloss:0.96486  eval-mlogloss:1.08638
[231]   train-mlogloss:0.96440  eval-mlogloss:1.08636
[232]   train-mlogloss:0.96395  eval-mlogloss:1.08633
[233]   train-mlogloss:0.96350  eval-mlogloss:1.08635
[234]   train-mlogloss:0.96280  eval-mlogloss:1.08639
[235]   train-mlogloss:0.96212  eval-mlogloss:1.08630
[236]   train-mlogloss:0.96175  eval-mlogloss:1.08636
[237]   train-mlogloss:0.96135  eval-mlogloss:1.08637
[238]   train-mlogloss:0.96081  eval-mlogloss:1.08640
[239]   train-mlogloss:0.96025  eval-mlogloss:1.08643
[240]   train-mlogloss:0.95976  eval-mlogloss:1.08641
# Preparation for the MAP@7 evaluation metric.
# Extract the customer IDs.
vld = trn[trn['fecha_dato'] == vld_date]
ncodpers_vld = vld['ncodpers']
# Compute the new purchases in the validation data.
for prod in prods:
    prev = prod + '_prev'
    padd = prod + '_add'
    vld[padd] = vld[prod] - vld[prev]
# Convert to a NumPy array so it can be indexed as add_vld[row, col] below.
add_vld = vld[[prod + '_add' for prod in prods]].to_numpy()
add_vld_list = [list() for i in range(len(ncodpers_vld))]
# Store each customer's ground-truth new purchases in add_vld_list and the total count in count_vld.
count_vld = 0
for ncodper in range(len(ncodpers_vld)):
    for prod in range(len(prods)):
        if add_vld[ncodper, prod] > 0:
            add_vld_list[ncodper].append(prod)
            count_vld += 1
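Note that mapk is not defined anywhere in this notebook; the reference book provides it as a separate utility. A minimal MAP@k sketch matching the four-argument call mapk(actual, predicted, k, default) used below, where default is the score given to customers with no new purchases, could look like the following (my own implementation, not the book's exact code):

def apk(actual, predicted, k=7, default=0.0):
    # Average precision at k for one customer;
    # customers with no new purchases receive the default score.
    if len(actual) == 0:
        return default
    predicted = predicted[:k]
    score, num_hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(actual), k)

def mapk(actual, predicted, k=7, default=0.0):
    # Mean average precision at k over all customers.
    return np.mean([apk(a, p, k, default) for a, p in zip(actual, predicted)])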


# Compute the best MAP@7 attainable on the validation data. (0.042663)
# It is far below 1.0 because customers with no new purchases contribute the default score of 0.0.
print(mapk(add_vld_list, add_vld_list, 7, 0.0))


# Compute predictions on the validation data.
# best_iteration is available because early_stopping_rounds was used above.
best_ntree_limit = model.best_iteration + 1
X_vld = vld[features].to_numpy()
dvld = xgb.DMatrix(X_vld, feature_names=features)  # no label is needed for prediction (vld has no 'y' column)
preds_vld = model.predict(dvld, iteration_range=(0, best_ntree_limit))



# Products already held last month cannot be new purchases, so subtract 1 (the _prev flag) from their probabilities.
preds_vld = preds_vld - vld[[prod + '_prev' for prod in prods]].to_numpy()



# Extract the top-7 predictions for the validation data.
result_vld = []
for ncodper, pred in zip(ncodpers_vld, preds_vld):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    result_vld.append([ip for y,p,ip in y_prods])

# Compute the MAP@7 score on the validation data. (0.036466)
print(mapk(add_vld_list, result_vld, 7, 0.0))



# Retrain the XGBoost model on the full training data!
X_all = XY[features].to_numpy()
Y_all = XY['y'].to_numpy()
dall = xgb.DMatrix(X_all, label=Y_all, feature_names=features)
watch_list = [(dall, 'train')]

# Increase the number of trees in proportion to the larger amount of training data.
best_ntree_limit = int(best_ntree_limit * (len(XY_trn) + len(XY_vld)) / len(XY_trn))

# Retrain the XGBoost model!
model = xgb.train(param, dall, num_boost_round=best_ntree_limit, evals=watch_list)


# Print the feature importance. Are the variables you expected near the top?
print("Feature importance:")
for kv in sorted([(k,v) for k,v in model.get_fscore().items()], key=lambda kv: kv[1], reverse=True):
    print(kv)


# Compute predictions on the test data for the Kaggle submission.
X_tst = tst[features].to_numpy()
dtst = xgb.DMatrix(X_tst, feature_names=features)
preds_tst = model.predict(dtst, iteration_range=(0, best_ntree_limit))
ncodpers_tst = tst['ncodpers'].to_numpy()
preds_tst = preds_tst - tst[[prod + '_prev' for prod in prods]].to_numpy()


# Write the submission file.
submit_file = open('../model/xgb.baseline.2015-06-28', 'w')
submit_file.write('ncodpers,added_products\n')
for ncodper, pred in zip(ncodpers_tst, preds_tst):
    y_prods = [(y,p,ip) for y,p,ip in zip(pred, prods, range(len(prods)))]
    y_prods = sorted(y_prods, key=lambda a: a[0], reverse=True)[:7]
    y_prods = [p for y,p,ip in y_prods]
    submit_file.write('{},{}\n'.format(int(ncodper), ' '.join(y_prods)))
submit_file.close()