import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#---#
from autogluon.multimodal import MultiModalPredictor # from autogluon.tabular import TabularPredictor
#---#
import warnings
'ignore') warnings.filterwarnings(
ref
import
data
부동산 계약금 지불, 부동산 자산 활용 등 재정적 의사결정을 돕기 위한 고객 맞춤형 제품 추천
문제 유형: Multi-class Classification
평가 척도: Mean Average Precision
!kaggle competitions download -c santander-product-recommendation
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /home/coco/.kaggle/kaggle.json'
Downloading santander-product-recommendation.zip to /home/coco/Dropbox/Class/KS/posts
100%|████████████████████████████████████████| 228M/228M [00:11<00:00, 24.7MB/s]
100%|████████████████████████████████████████| 228M/228M [00:11<00:00, 21.6MB/s]
!unzip santander-product-recommendation.zip -d data
Archive: santander-product-recommendation.zip
inflating: data/sample_submission.csv.zip
inflating: data/test_ver2.csv.zip
inflating: data/train_ver2.csv.zip
= pd.read_csv('data/train_ver2.csv.zip')
df_train = pd.read_csv('data/test_ver2.csv.zip')
df_test = pd.read_csv('data/sample_submission.csv.zip') sample_submission
!rm -rf data
!rm nlp-getting-started.zip
rm: cannot remove 'nlp-getting-started.zip': No such file or directory
df_train.head()
fecha_dato | ncodpers | ind_empleado | pais_residencia | sexo | age | fecha_alta | ind_nuevo | antiguedad | indrel | ... | ind_hip_fin_ult1 | ind_plan_fin_ult1 | ind_pres_fin_ult1 | ind_reca_fin_ult1 | ind_tjcr_fin_ult1 | ind_valo_fin_ult1 | ind_viv_fin_ult1 | ind_nomina_ult1 | ind_nom_pens_ult1 | ind_recibo_ult1 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-28 | 1375586 | N | ES | H | 35 | 2015-01-12 | 0.0 | 6 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
1 | 2015-01-28 | 1050611 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
2 | 2015-01-28 | 1050612 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
3 | 2015-01-28 | 1050613 | N | ES | H | 22 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
4 | 2015-01-28 | 1050614 | N | ES | V | 23 | 2012-08-10 | 0.0 | 35 | 1.0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0 |
5 rows × 48 columns
df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13647309 entries, 0 to 13647308
Data columns (total 48 columns):
# Column Dtype
--- ------ -----
0 fecha_dato object
1 ncodpers int64
2 ind_empleado object
3 pais_residencia object
4 sexo object
5 age object
6 fecha_alta object
7 ind_nuevo float64
8 antiguedad object
9 indrel float64
10 ult_fec_cli_1t object
11 indrel_1mes object
12 tiprel_1mes object
13 indresi object
14 indext object
15 conyuemp object
16 canal_entrada object
17 indfall object
18 tipodom float64
19 cod_prov float64
20 nomprov object
21 ind_actividad_cliente float64
22 renta float64
23 segmento object
24 ind_ahor_fin_ult1 int64
25 ind_aval_fin_ult1 int64
26 ind_cco_fin_ult1 int64
27 ind_cder_fin_ult1 int64
28 ind_cno_fin_ult1 int64
29 ind_ctju_fin_ult1 int64
30 ind_ctma_fin_ult1 int64
31 ind_ctop_fin_ult1 int64
32 ind_ctpp_fin_ult1 int64
33 ind_deco_fin_ult1 int64
34 ind_deme_fin_ult1 int64
35 ind_dela_fin_ult1 int64
36 ind_ecue_fin_ult1 int64
37 ind_fond_fin_ult1 int64
38 ind_hip_fin_ult1 int64
39 ind_plan_fin_ult1 int64
40 ind_pres_fin_ult1 int64
41 ind_reca_fin_ult1 int64
42 ind_tjcr_fin_ult1 int64
43 ind_valo_fin_ult1 int64
44 ind_viv_fin_ult1 int64
45 ind_nomina_ult1 float64
46 ind_nom_pens_ult1 float64
47 ind_recibo_ult1 int64
dtypes: float64(8), int64(23), object(17)
memory usage: 4.9+ GB
-
for loop함수: 모든 변수의 첫 5줄 미리보기
for col in df_train.columns:
print('{}\n'.format(df_train[col].head()))
0 2015-01-28
1 2015-01-28
2 2015-01-28
3 2015-01-28
4 2015-01-28
Name: fecha_dato, dtype: object
0 1375586
1 1050611
2 1050612
3 1050613
4 1050614
Name: ncodpers, dtype: int64
0 N
1 N
2 N
3 N
4 N
Name: ind_empleado, dtype: object
0 ES
1 ES
2 ES
3 ES
4 ES
Name: pais_residencia, dtype: object
0 H
1 V
2 V
3 H
4 V
Name: sexo, dtype: object
0 35
1 23
2 23
3 22
4 23
Name: age, dtype: object
0 2015-01-12
1 2012-08-10
2 2012-08-10
3 2012-08-10
4 2012-08-10
Name: fecha_alta, dtype: object
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
Name: ind_nuevo, dtype: float64
0 6
1 35
2 35
3 35
4 35
Name: antiguedad, dtype: object
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
Name: indrel, dtype: float64
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
Name: ult_fec_cli_1t, dtype: object
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
Name: indrel_1mes, dtype: object
0 A
1 I
2 I
3 I
4 A
Name: tiprel_1mes, dtype: object
0 S
1 S
2 S
3 S
4 S
Name: indresi, dtype: object
0 N
1 S
2 N
3 N
4 N
Name: indext, dtype: object
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
Name: conyuemp, dtype: object
0 KHL
1 KHE
2 KHE
3 KHD
4 KHE
Name: canal_entrada, dtype: object
0 N
1 N
2 N
3 N
4 N
Name: indfall, dtype: object
0 1.0
1 1.0
2 1.0
3 1.0
4 1.0
Name: tipodom, dtype: float64
0 29.0
1 13.0
2 13.0
3 50.0
4 50.0
Name: cod_prov, dtype: float64
0 MALAGA
1 CIUDAD REAL
2 CIUDAD REAL
3 ZARAGOZA
4 ZARAGOZA
Name: nomprov, dtype: object
0 1.0
1 0.0
2 0.0
3 0.0
4 1.0
Name: ind_actividad_cliente, dtype: float64
0 87218.10
1 35548.74
2 122179.11
3 119775.54
4 NaN
Name: renta, dtype: float64
0 02 - PARTICULARES
1 03 - UNIVERSITARIO
2 03 - UNIVERSITARIO
3 03 - UNIVERSITARIO
4 03 - UNIVERSITARIO
Name: segmento, dtype: object
0 0
1 0
2 0
3 0
4 0
Name: ind_ahor_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_aval_fin_ult1, dtype: int64
0 1
1 1
2 1
3 0
4 1
Name: ind_cco_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_cder_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_cno_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_ctju_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_ctma_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_ctop_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_ctpp_fin_ult1, dtype: int64
0 0
1 0
2 0
3 1
4 0
Name: ind_deco_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_deme_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_dela_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_ecue_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_fond_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_hip_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_plan_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_pres_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_reca_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_tjcr_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_valo_fin_ult1, dtype: int64
0 0
1 0
2 0
3 0
4 0
Name: ind_viv_fin_ult1, dtype: int64
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
Name: ind_nomina_ult1, dtype: float64
0 0.0
1 0.0
2 0.0
3 0.0
4 0.0
Name: ind_nom_pens_ult1, dtype: float64
0 0
1 0
2 0
3 0
4 0
Name: ind_recibo_ult1, dtype: int64
fecha_alta: 날짜 변수가 datetime이 아닌 object
age: object이므로 정수형 변환 필요
renta: 가구 총수입 나타내는 변수. NaN값 -> 전처리 필요
df_train.shape
(13647309, 48)
df_test.shape
(929615, 24)
set(df_train) - set(df_test)
{'ind_ahor_fin_ult1',
'ind_aval_fin_ult1',
'ind_cco_fin_ult1',
'ind_cder_fin_ult1',
'ind_cno_fin_ult1',
'ind_ctju_fin_ult1',
'ind_ctma_fin_ult1',
'ind_ctop_fin_ult1',
'ind_ctpp_fin_ult1',
'ind_deco_fin_ult1',
'ind_dela_fin_ult1',
'ind_deme_fin_ult1',
'ind_ecue_fin_ult1',
'ind_fond_fin_ult1',
'ind_hip_fin_ult1',
'ind_nom_pens_ult1',
'ind_nomina_ult1',
'ind_plan_fin_ult1',
'ind_pres_fin_ult1',
'ind_reca_fin_ult1',
'ind_recibo_ult1',
'ind_tjcr_fin_ult1',
'ind_valo_fin_ult1',
'ind_viv_fin_ult1'}
- 고객이 신규로 구매할 제품을 예측하기
sample_submission.head()
ncodpers | added_products | |
---|---|---|
0 | 15889 | ind_tjcr_fin_ult1 |
1 | 15890 | ind_tjcr_fin_ult1 |
2 | 15892 | ind_tjcr_fin_ult1 |
3 | 15893 | ind_tjcr_fin_ult1 |
4 | 15894 | ind_tjcr_fin_ult1 |
EDA
수치형 변수
= [col for col in df_train.columns[:24] if df_train[col].dtype in ['int64','float64']]
num_cols df_train[num_cols].describe()
ncodpers | ind_nuevo | indrel | tipodom | cod_prov | ind_actividad_cliente | renta | |
---|---|---|---|---|---|---|---|
count | 1.364731e+07 | 1.361958e+07 | 1.361958e+07 | 13619574.0 | 1.355372e+07 | 1.361958e+07 | 1.085293e+07 |
mean | 8.349042e+05 | 5.956184e-02 | 1.178399e+00 | 1.0 | 2.657147e+01 | 4.578105e-01 | 1.342543e+05 |
std | 4.315650e+05 | 2.366733e-01 | 4.177469e+00 | 0.0 | 1.278402e+01 | 4.982169e-01 | 2.306202e+05 |
min | 1.588900e+04 | 0.000000e+00 | 1.000000e+00 | 1.0 | 1.000000e+00 | 0.000000e+00 | 1.202730e+03 |
25% | 4.528130e+05 | 0.000000e+00 | 1.000000e+00 | 1.0 | 1.500000e+01 | 0.000000e+00 | 6.871098e+04 |
50% | 9.318930e+05 | 0.000000e+00 | 1.000000e+00 | 1.0 | 2.800000e+01 | 0.000000e+00 | 1.018500e+05 |
75% | 1.199286e+06 | 0.000000e+00 | 1.000000e+00 | 1.0 | 3.500000e+01 | 1.000000e+00 | 1.559560e+05 |
max | 1.553689e+06 | 1.000000e+00 | 9.900000e+01 | 1.0 | 5.200000e+01 | 1.000000e+00 | 2.889440e+07 |
범주형 변수
= [col for col in df_train.columns[:24] if df_train[col].dtype in ['object']]
cat_cols df_train[cat_cols].describe()
fecha_dato | ind_empleado | pais_residencia | sexo | age | fecha_alta | antiguedad | ult_fec_cli_1t | indrel_1mes | tiprel_1mes | indresi | indext | conyuemp | canal_entrada | indfall | nomprov | segmento | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 13647309 | 13619575 | 13619575 | 13619505 | 13647309 | 13619575 | 13647309 | 24793 | 13497528.0 | 13497528 | 13619575 | 13619575 | 1808 | 13461183 | 13619575 | 13553718 | 13457941 |
unique | 17 | 5 | 118 | 2 | 235 | 6756 | 507 | 223 | 13.0 | 5 | 2 | 2 | 2 | 162 | 2 | 52 | 3 |
top | 2016-05-28 | N | ES | V | 23 | 2014-07-28 | 0 | 2015-12-24 | 1.0 | I | S | N | N | KHE | N | MADRID | 02 - PARTICULARES |
freq | 931453 | 13610977 | 13553710 | 7424252 | 542682 | 57389 | 134335 | 763 | 7277607.0 | 7304875 | 13553711 | 12974839 | 1791 | 4055270 | 13584813 | 4409600 | 7960220 |
-
고유값
for col in cat_cols:
= np.unique(df_train[col].astype(str))
uniq print('-' * 50)
print('# col {}, n_uniq {}, uniq {}'.format(col, len(uniq), uniq))
--------------------------------------------------
# col fecha_dato, n_uniq 17, uniq ['2015-01-28' '2015-02-28' '2015-03-28' '2015-04-28' '2015-05-28'
'2015-06-28' '2015-07-28' '2015-08-28' '2015-09-28' '2015-10-28'
'2015-11-28' '2015-12-28' '2016-01-28' '2016-02-28' '2016-03-28'
'2016-04-28' '2016-05-28']
--------------------------------------------------
# col ind_empleado, n_uniq 6, uniq ['A' 'B' 'F' 'N' 'S' 'nan']
--------------------------------------------------
# col pais_residencia, n_uniq 119, uniq ['AD' 'AE' 'AL' 'AO' 'AR' 'AT' 'AU' 'BA' 'BE' 'BG' 'BM' 'BO' 'BR' 'BY'
'BZ' 'CA' 'CD' 'CF' 'CG' 'CH' 'CI' 'CL' 'CM' 'CN' 'CO' 'CR' 'CU' 'CZ'
'DE' 'DJ' 'DK' 'DO' 'DZ' 'EC' 'EE' 'EG' 'ES' 'ET' 'FI' 'FR' 'GA' 'GB'
'GE' 'GH' 'GI' 'GM' 'GN' 'GQ' 'GR' 'GT' 'GW' 'HK' 'HN' 'HR' 'HU' 'IE'
'IL' 'IN' 'IS' 'IT' 'JM' 'JP' 'KE' 'KH' 'KR' 'KW' 'KZ' 'LB' 'LT' 'LU'
'LV' 'LY' 'MA' 'MD' 'MK' 'ML' 'MM' 'MR' 'MT' 'MX' 'MZ' 'NG' 'NI' 'NL'
'NO' 'NZ' 'OM' 'PA' 'PE' 'PH' 'PK' 'PL' 'PR' 'PT' 'PY' 'QA' 'RO' 'RS'
'RU' 'SA' 'SE' 'SG' 'SK' 'SL' 'SN' 'SV' 'TG' 'TH' 'TN' 'TR' 'TW' 'UA'
'US' 'UY' 'VE' 'VN' 'ZA' 'ZW' 'nan']
--------------------------------------------------
# col sexo, n_uniq 3, uniq ['H' 'V' 'nan']
--------------------------------------------------
# col age, n_uniq 219, uniq [' 2' ' 3' ' 4' ' 5' ' 6' ' 7' ' 8' ' 9' ' 10' ' 11' ' 12' ' 13'
' 14' ' 15' ' 16' ' 17' ' 18' ' 19' ' 20' ' 21' ' 22' ' 23' ' 24' ' 25'
' 26' ' 27' ' 28' ' 29' ' 30' ' 31' ' 32' ' 33' ' 34' ' 35' ' 36' ' 37'
' 38' ' 39' ' 40' ' 41' ' 42' ' 43' ' 44' ' 45' ' 46' ' 47' ' 48' ' 49'
' 50' ' 51' ' 52' ' 53' ' 54' ' 55' ' 56' ' 57' ' 58' ' 59' ' 60' ' 61'
' 62' ' 63' ' 64' ' 65' ' 66' ' 67' ' 68' ' 69' ' 70' ' 71' ' 72' ' 73'
' 74' ' 75' ' 76' ' 77' ' 78' ' 79' ' 80' ' 81' ' 82' ' 83' ' 84' ' 85'
' 86' ' 87' ' 88' ' 89' ' 90' ' 91' ' 92' ' 93' ' 94' ' 95' ' 96' ' 97'
' 98' ' 99' ' NA' '10' '100' '101' '102' '103' '104' '105' '106' '107'
'108' '109' '11' '110' '111' '112' '113' '114' '115' '116' '117' '12'
'126' '127' '13' '14' '15' '16' '163' '164' '17' '18' '19' '2' '20' '21'
'22' '23' '24' '25' '26' '27' '28' '29' '3' '30' '31' '32' '33' '34' '35'
'36' '37' '38' '39' '4' '40' '41' '42' '43' '44' '45' '46' '47' '48' '49'
'5' '50' '51' '52' '53' '54' '55' '56' '57' '58' '59' '6' '60' '61' '62'
'63' '64' '65' '66' '67' '68' '69' '7' '70' '71' '72' '73' '74' '75' '76'
'77' '78' '79' '8' '80' '81' '82' '83' '84' '85' '86' '87' '88' '89' '9'
'90' '91' '92' '93' '94' '95' '96' '97' '98' '99']
--------------------------------------------------
# col fecha_alta, n_uniq 6757, uniq ['1995-01-16' '1995-01-17' '1995-01-23' ... '2016-05-30' '2016-05-31'
'nan']
--------------------------------------------------
# col antiguedad, n_uniq 506, uniq [' 0' ' 1' ' 2' ' 3' ' 4' ' 5' ' 6'
' 7' ' 8' ' 9' ' 10' ' 11' ' 12' ' 13'
' 14' ' 15' ' 16' ' 17' ' 18' ' 19' ' 20'
' 21' ' 22' ' 23' ' 24' ' 25' ' 26' ' 27'
' 28' ' 29' ' 30' ' 31' ' 32' ' 33' ' 34'
' 35' ' 36' ' 37' ' 38' ' 39' ' 40' ' 41'
' 42' ' 43' ' 44' ' 45' ' 46' ' 47' ' 48'
' 49' ' 50' ' 51' ' 52' ' 53' ' 54' ' 55'
' 56' ' 57' ' 58' ' 59' ' 60' ' 61' ' 62'
' 63' ' 64' ' 65' ' 66' ' 67' ' 68' ' 69'
' 70' ' 71' ' 72' ' 73' ' 74' ' 75' ' 76'
' 77' ' 78' ' 79' ' 80' ' 81' ' 82' ' 83'
' 84' ' 85' ' 86' ' 87' ' 88' ' 89' ' 90'
' 91' ' 92' ' 93' ' 94' ' 95' ' 96' ' 97'
' 98' ' 99' ' NA' ' 100' ' 101' ' 102' ' 103'
' 104' ' 105' ' 106' ' 107' ' 108' ' 109' ' 110'
' 111' ' 112' ' 113' ' 114' ' 115' ' 116' ' 117'
' 118' ' 119' ' 120' ' 121' ' 122' ' 123' ' 124'
' 125' ' 126' ' 127' ' 128' ' 129' ' 130' ' 131'
' 132' ' 133' ' 134' ' 135' ' 136' ' 137' ' 138'
' 139' ' 140' ' 141' ' 142' ' 143' ' 144' ' 145'
' 146' ' 147' ' 148' ' 149' ' 150' ' 151' ' 152'
' 153' ' 154' ' 155' ' 156' ' 157' ' 158' ' 159'
' 160' ' 161' ' 162' ' 163' ' 164' ' 165' ' 166'
' 167' ' 168' ' 169' ' 170' ' 171' ' 172' ' 173'
' 174' ' 175' ' 176' ' 177' ' 178' ' 179' ' 180'
' 181' ' 182' ' 183' ' 184' ' 185' ' 186' ' 187'
' 188' ' 189' ' 190' ' 191' ' 192' ' 193' ' 194'
' 195' ' 196' ' 197' ' 198' ' 199' ' 200' ' 201'
' 202' ' 203' ' 204' ' 205' ' 206' ' 207' ' 208'
' 209' ' 210' ' 211' ' 212' ' 213' ' 214' ' 215'
' 216' ' 217' ' 218' ' 219' ' 220' ' 221' ' 222'
' 223' ' 224' ' 225' ' 226' ' 227' ' 228' ' 229'
' 230' ' 231' ' 232' ' 233' ' 234' ' 235' ' 236'
' 237' ' 238' ' 239' ' 240' ' 241' ' 242' ' 243'
' 244' ' 245' ' 246' '-999999' '0' '1' '10' '100' '101' '102'
'103' '104' '105' '106' '107' '108' '109' '11' '110' '111' '112' '113'
'114' '115' '116' '117' '118' '119' '12' '120' '121' '122' '123' '124'
'125' '126' '127' '128' '129' '13' '130' '131' '132' '133' '134' '135'
'136' '137' '138' '139' '14' '140' '141' '142' '143' '144' '145' '146'
'147' '148' '149' '15' '150' '151' '152' '153' '154' '155' '156' '157'
'158' '159' '16' '160' '161' '162' '163' '164' '165' '166' '167' '168'
'169' '17' '170' '171' '172' '173' '174' '175' '176' '177' '178' '179'
'18' '180' '181' '182' '183' '184' '185' '186' '187' '188' '189' '19'
'190' '191' '192' '193' '194' '195' '196' '197' '198' '199' '2' '20'
'200' '201' '202' '203' '204' '205' '206' '207' '208' '209' '21' '210'
'211' '212' '213' '214' '215' '216' '217' '218' '219' '22' '220' '221'
'222' '223' '224' '225' '226' '227' '228' '229' '23' '230' '231' '232'
'233' '234' '235' '236' '237' '238' '239' '24' '240' '241' '242' '243'
'244' '245' '246' '247' '248' '249' '25' '250' '251' '252' '253' '254'
'255' '256' '26' '27' '28' '29' '3' '30' '31' '32' '33' '34' '35' '36'
'37' '38' '39' '4' '40' '41' '42' '43' '44' '45' '46' '47' '48' '49' '5'
'50' '51' '52' '53' '54' '55' '56' '57' '58' '59' '6' '60' '61' '62' '63'
'64' '65' '66' '67' '68' '69' '7' '70' '71' '72' '73' '74' '75' '76' '77'
'78' '79' '8' '80' '81' '82' '83' '84' '85' '86' '87' '88' '89' '9' '90'
'91' '92' '93' '94' '95' '96' '97' '98' '99']
--------------------------------------------------
# col ult_fec_cli_1t, n_uniq 224, uniq ['2015-07-01' '2015-07-02' '2015-07-03' '2015-07-06' '2015-07-07'
'2015-07-08' '2015-07-09' '2015-07-10' '2015-07-13' '2015-07-14'
'2015-07-15' '2015-07-16' '2015-07-17' '2015-07-20' '2015-07-21'
'2015-07-22' '2015-07-23' '2015-07-24' '2015-07-27' '2015-07-28'
'2015-07-29' '2015-07-30' '2015-08-03' '2015-08-04' '2015-08-05'
'2015-08-06' '2015-08-07' '2015-08-10' '2015-08-11' '2015-08-12'
'2015-08-13' '2015-08-14' '2015-08-17' '2015-08-18' '2015-08-19'
'2015-08-20' '2015-08-21' '2015-08-24' '2015-08-25' '2015-08-26'
'2015-08-27' '2015-08-28' '2015-09-01' '2015-09-02' '2015-09-03'
'2015-09-04' '2015-09-07' '2015-09-08' '2015-09-09' '2015-09-10'
'2015-09-11' '2015-09-14' '2015-09-15' '2015-09-16' '2015-09-17'
'2015-09-18' '2015-09-21' '2015-09-22' '2015-09-23' '2015-09-24'
'2015-09-25' '2015-09-28' '2015-09-29' '2015-10-01' '2015-10-02'
'2015-10-05' '2015-10-06' '2015-10-07' '2015-10-08' '2015-10-09'
'2015-10-13' '2015-10-14' '2015-10-15' '2015-10-16' '2015-10-19'
'2015-10-20' '2015-10-21' '2015-10-22' '2015-10-23' '2015-10-26'
'2015-10-27' '2015-10-28' '2015-10-29' '2015-11-02' '2015-11-03'
'2015-11-04' '2015-11-05' '2015-11-06' '2015-11-09' '2015-11-10'
'2015-11-11' '2015-11-12' '2015-11-13' '2015-11-16' '2015-11-17'
'2015-11-18' '2015-11-19' '2015-11-20' '2015-11-23' '2015-11-24'
'2015-11-25' '2015-11-26' '2015-11-27' '2015-12-01' '2015-12-02'
'2015-12-03' '2015-12-04' '2015-12-07' '2015-12-09' '2015-12-10'
'2015-12-11' '2015-12-14' '2015-12-15' '2015-12-16' '2015-12-17'
'2015-12-18' '2015-12-21' '2015-12-22' '2015-12-23' '2015-12-24'
'2015-12-28' '2015-12-29' '2015-12-30' '2016-01-04' '2016-01-05'
'2016-01-07' '2016-01-08' '2016-01-11' '2016-01-12' '2016-01-13'
'2016-01-14' '2016-01-15' '2016-01-18' '2016-01-19' '2016-01-20'
'2016-01-21' '2016-01-22' '2016-01-25' '2016-01-26' '2016-01-27'
'2016-01-28' '2016-02-01' '2016-02-02' '2016-02-03' '2016-02-04'
'2016-02-05' '2016-02-08' '2016-02-09' '2016-02-10' '2016-02-11'
'2016-02-12' '2016-02-15' '2016-02-16' '2016-02-17' '2016-02-18'
'2016-02-19' '2016-02-22' '2016-02-23' '2016-02-24' '2016-02-25'
'2016-02-26' '2016-03-01' '2016-03-02' '2016-03-03' '2016-03-04'
'2016-03-07' '2016-03-08' '2016-03-09' '2016-03-10' '2016-03-11'
'2016-03-14' '2016-03-15' '2016-03-16' '2016-03-17' '2016-03-18'
'2016-03-21' '2016-03-22' '2016-03-23' '2016-03-24' '2016-03-28'
'2016-03-29' '2016-03-30' '2016-04-01' '2016-04-04' '2016-04-05'
'2016-04-06' '2016-04-07' '2016-04-08' '2016-04-11' '2016-04-12'
'2016-04-13' '2016-04-14' '2016-04-15' '2016-04-18' '2016-04-19'
'2016-04-20' '2016-04-21' '2016-04-22' '2016-04-25' '2016-04-26'
'2016-04-27' '2016-04-28' '2016-05-02' '2016-05-03' '2016-05-04'
'2016-05-05' '2016-05-06' '2016-05-09' '2016-05-10' '2016-05-11'
'2016-05-12' '2016-05-13' '2016-05-16' '2016-05-17' '2016-05-18'
'2016-05-19' '2016-05-20' '2016-05-23' '2016-05-24' '2016-05-25'
'2016-05-26' '2016-05-27' '2016-05-30' 'nan']
--------------------------------------------------
# col indrel_1mes, n_uniq 10, uniq ['1' '1.0' '2' '2.0' '3' '3.0' '4' '4.0' 'P' 'nan']
--------------------------------------------------
# col tiprel_1mes, n_uniq 6, uniq ['A' 'I' 'N' 'P' 'R' 'nan']
--------------------------------------------------
# col indresi, n_uniq 3, uniq ['N' 'S' 'nan']
--------------------------------------------------
# col indext, n_uniq 3, uniq ['N' 'S' 'nan']
--------------------------------------------------
# col conyuemp, n_uniq 3, uniq ['N' 'S' 'nan']
--------------------------------------------------
# col canal_entrada, n_uniq 163, uniq ['004' '007' '013' '025' 'K00' 'KAA' 'KAB' 'KAC' 'KAD' 'KAE' 'KAF' 'KAG'
'KAH' 'KAI' 'KAJ' 'KAK' 'KAL' 'KAM' 'KAN' 'KAO' 'KAP' 'KAQ' 'KAR' 'KAS'
'KAT' 'KAU' 'KAV' 'KAW' 'KAY' 'KAZ' 'KBB' 'KBD' 'KBE' 'KBF' 'KBG' 'KBH'
'KBJ' 'KBL' 'KBM' 'KBN' 'KBO' 'KBP' 'KBQ' 'KBR' 'KBS' 'KBU' 'KBV' 'KBW'
'KBX' 'KBY' 'KBZ' 'KCA' 'KCB' 'KCC' 'KCD' 'KCE' 'KCF' 'KCG' 'KCH' 'KCI'
'KCJ' 'KCK' 'KCL' 'KCM' 'KCN' 'KCO' 'KCP' 'KCQ' 'KCR' 'KCS' 'KCT' 'KCU'
'KCV' 'KCX' 'KDA' 'KDB' 'KDC' 'KDD' 'KDE' 'KDF' 'KDG' 'KDH' 'KDI' 'KDL'
'KDM' 'KDN' 'KDO' 'KDP' 'KDQ' 'KDR' 'KDS' 'KDT' 'KDU' 'KDV' 'KDW' 'KDX'
'KDY' 'KDZ' 'KEA' 'KEB' 'KEC' 'KED' 'KEE' 'KEF' 'KEG' 'KEH' 'KEI' 'KEJ'
'KEK' 'KEL' 'KEM' 'KEN' 'KEO' 'KEQ' 'KES' 'KEU' 'KEV' 'KEW' 'KEY' 'KEZ'
'KFA' 'KFB' 'KFC' 'KFD' 'KFE' 'KFF' 'KFG' 'KFH' 'KFI' 'KFJ' 'KFK' 'KFL'
'KFM' 'KFN' 'KFP' 'KFR' 'KFS' 'KFT' 'KFU' 'KFV' 'KGC' 'KGN' 'KGU' 'KGV'
'KGW' 'KGX' 'KGY' 'KHA' 'KHC' 'KHD' 'KHE' 'KHF' 'KHK' 'KHL' 'KHM' 'KHN'
'KHO' 'KHP' 'KHQ' 'KHR' 'KHS' 'RED' 'nan']
--------------------------------------------------
# col indfall, n_uniq 3, uniq ['N' 'S' 'nan']
--------------------------------------------------
# col nomprov, n_uniq 53, uniq ['ALAVA' 'ALBACETE' 'ALICANTE' 'ALMERIA' 'ASTURIAS' 'AVILA' 'BADAJOZ'
'BALEARS, ILLES' 'BARCELONA' 'BIZKAIA' 'BURGOS' 'CACERES' 'CADIZ'
'CANTABRIA' 'CASTELLON' 'CEUTA' 'CIUDAD REAL' 'CORDOBA' 'CORUÑA, A'
'CUENCA' 'GIPUZKOA' 'GIRONA' 'GRANADA' 'GUADALAJARA' 'HUELVA' 'HUESCA'
'JAEN' 'LEON' 'LERIDA' 'LUGO' 'MADRID' 'MALAGA' 'MELILLA' 'MURCIA'
'NAVARRA' 'OURENSE' 'PALENCIA' 'PALMAS, LAS' 'PONTEVEDRA' 'RIOJA, LA'
'SALAMANCA' 'SANTA CRUZ DE TENERIFE' 'SEGOVIA' 'SEVILLA' 'SORIA'
'TARRAGONA' 'TERUEL' 'TOLEDO' 'VALENCIA' 'VALLADOLID' 'ZAMORA' 'ZARAGOZA'
'nan']
--------------------------------------------------
# col segmento, n_uniq 4, uniq ['01 - TOP' '02 - PARTICULARES' '03 - UNIVERSITARIO' 'nan']
데이터 분석
fecha_dato: 년-월-일로 28일자를 기준으로 17개
ind_empleado: 결측치
pais_residencia: 나라(대문자 2개), 결측치
sexo: 결측치
,, 결측치 핵 많음
변수명 | 내용 | 데이터 타입 | 특징 | 변수 아이디어 |
---|---|---|---|---|
fecha_data | 월별 날짜 데이터 | object | 년도, 월 데이터 별도 추출 | |
age | 나이 | obeject ->int | 나이데이터가 정수형이 아니므로 정제 필요 |
위 내용은 책에서 정리한 거고.. 추후 이렇게 정리하는 분석 방법이 필요
시각화
= ['ncodpers', 'renta'] # 고객 고유 식별 번호, 총 수입 (고유값 많으므로 제거)
skip_cols for col in df_train.columns:
if col in skip_cols:
continue
print('-' * 50)
print('col:',col)
= plt.subplots(figsize=(20,15))
f, ax =col, data=df_train, alpha=0.5)
sns.countplot(x plt.show()
--------------------------------------------------
col: fecha_dato
--------------------------------------------------
col: ind_empleado
--------------------------------------------------
col: pais_residencia
--------------------------------------------------
col: sexo
--------------------------------------------------
col: age
--------------------------------------------------
col: fecha_alta
--------------------------------------------------
col: ind_nuevo
--------------------------------------------------
col: antiguedad
--------------------------------------------------
col: indrel
--------------------------------------------------
col: ult_fec_cli_1t
--------------------------------------------------
col: indrel_1mes
--------------------------------------------------
col: tiprel_1mes
--------------------------------------------------
col: indresi
--------------------------------------------------
col: indext
--------------------------------------------------
col: conyuemp
--------------------------------------------------
col: canal_entrada
--------------------------------------------------
col: indfall
--------------------------------------------------
col: tipodom
--------------------------------------------------
col: cod_prov
--------------------------------------------------
col: nomprov
--------------------------------------------------
col: ind_actividad_cliente
--------------------------------------------------
col: segmento
--------------------------------------------------
col: ind_ahor_fin_ult1
--------------------------------------------------
col: ind_aval_fin_ult1
--------------------------------------------------
col: ind_cco_fin_ult1
--------------------------------------------------
col: ind_cder_fin_ult1
--------------------------------------------------
col: ind_cno_fin_ult1
--------------------------------------------------
col: ind_ctju_fin_ult1
--------------------------------------------------
col: ind_ctma_fin_ult1
--------------------------------------------------
col: ind_ctop_fin_ult1
--------------------------------------------------
col: ind_ctpp_fin_ult1
--------------------------------------------------
col: ind_deco_fin_ult1
--------------------------------------------------
col: ind_deme_fin_ult1
--------------------------------------------------
col: ind_dela_fin_ult1
--------------------------------------------------
col: ind_ecue_fin_ult1
--------------------------------------------------
col: ind_fond_fin_ult1
--------------------------------------------------
col: ind_hip_fin_ult1
--------------------------------------------------
col: ind_plan_fin_ult1
--------------------------------------------------
col: ind_pres_fin_ult1
--------------------------------------------------
col: ind_reca_fin_ult1
--------------------------------------------------
col: ind_tjcr_fin_ult1
--------------------------------------------------
col: ind_valo_fin_ult1
--------------------------------------------------
col: ind_viv_fin_ult1
--------------------------------------------------
col: ind_nomina_ult1
--------------------------------------------------
col: ind_nom_pens_ult1
--------------------------------------------------
col: ind_recibo_ult1
월별 금융 제품 보유 데이터 누적 막대 그래프 시각화
= []
label_over_time for i in range(len(label_cols)):
= df_train.groupby(['fecha_dato'])[label_cols[i]].agg('sum')
label_sum
label_over_time.append(label_sum.tolist())
= []
label_sum_over_time for i in range(len(label_cols)):
sum(axis=0))
label_sum_over_time.append(np.asarray(label_over_time[i:]).
= ['#F5B7B1','#D2B4DE','#AED6F1','#A2D9CE','#ABEBC6','#F9E79F','#F5CBA7','#CCD1D1']
color_list
= plt.subplots(figsize=(30, 15))
f, ax for i in range(len(label_cols)):
=months, y=label_sum_over_time[i], color = color_list[i%8], alpha=0.7)
sns.barplot(x
0,0),1,1,fc=color_list[i%8], edgecolor = 'none') for i in range(len(label_cols))], label_cols, loc=1, ncol = 2, prop={'size':16}) plt.legend([plt.Rectangle((
<matplotlib.legend.Legend at 0x7f1f78271070>
누적 막대 그래프 상대값 시각화
= (label_sum_over_time / (1.*np.asarray(label_sum_over_time).max(axis=0))) * 100
label_sum_percent
= plt.subplots(figsize=(30, 15))
f, ax for i in range(len(label_cols)):
=months, y=label_sum_percent[i], color = color_list[i%8], alpha=0.7)
sns.barplot(x
0,0),1,1,fc=color_list[i%8], edgecolor = 'none') for i in range(len(label_cols))], \
plt.legend([plt.Rectangle((=1, ncol = 2, prop={'size':16}) label_cols, loc
<matplotlib.legend.Legend at 0x7f1f770de610>
- 상대값으로 보기 위해서 퍼센테이지 나눈거!
신규 구매 데이터 생성 및 시각화
- 책에서는 예측해야할 ’신규 구매’에 초점을 맞추고 있어서 새로운 데이터를 추출했다.—> 일단 난 안할래 흠냐
# 제품 변수를 prods에 list형태로 저장한다
= trn.columns[24:].tolist()
prods
# 날짜를 숫자로 변환하는 함수이다. 2015-01-28은 1, 2016-06-28은 18로 변환된다
def date_to_int(str_date):
= [int(a) for a in str_date.strip().split("-")]
Y, M, D = (int(Y) - 2015) * 12 + int(M)
int_date return int_date
# 날짜를 숫자로 변환하여 int_date에 저장한다
'int_date'] = trn['fecha_dato'].map(date_to_int).astype(np.int8)
trn[
# 데이터를 복사하고, int_date 날짜에 1을 더하여 lag를 생성한다. 변수명에 _prev를 추가한다.
= trn.copy()
trn_lag 'int_date'] += 1
trn_lag[= [col + '_prev' if col not in ['ncodpers', 'int_date'] else col for col in trn.columns]
trn_lag.columns
# 원본 데이터와 lag 데이터를 ncodper와 int_date 기준으로 합친다. Lag 데이터의 int_date는 1 밀려있기 때문에, 저번달의 제품 정보가 삽입된다.
= trn.merge(trn_lag, on=['ncodpers','int_date'], how='left')
df_trn
# 메모리 효율을 위해 불필요한 변수를 메모리에서 제거한다
del trn, trn_lag
# 저번달의 제품 정보가 존재하지 않을 경우를 대비하여 0으로 대체한다.
for prod in prods:
= prod + '_prev'
prev 0, inplace=True)
df_trn[prev].fillna(
# 원본 데이터에서의 제품 보유 여부 – lag데이터에서의 제품 보유 여부를 비교하여 신규 구매 변수 padd를 구한다
for prod in prods:
= prod + '_add'
padd = prod + '_prev'
prev = ((df_trn[prod] == 1) & (df_trn[prev] == 0)).astype(np.int8)
df_trn[padd]
# 신규 구매 변수만을 추출하여 labels에 저장한다.
= [prod + '_add' for prod in prods]
add_cols = df_trn[add_cols].copy()
labels = prods
labels.columns #labels.to_csv('../input/labels.csv', index=False)
# 코드 1-12. 신규 구매 누적 막대 그래프를 시각화하기
#labels = pd.read_csv('../input/labels.csv').astype(int)
= trn['fecha_dato']
fecha_dato
'date'] = fecha_dato.fecha_dato
labels[= np.unique(fecha_dato.fecha_dato).tolist()
months = labels.columns.tolist()[:24]
label_cols
= []
label_over_time for i in range(len(label_cols)):
'date'])[label_cols[i]].agg('sum').tolist())
label_over_time.append(labels.groupby([
= []
label_sum_over_time for i in range(len(label_cols)):
sum(axis=0))
label_sum_over_time.append(np.asarray(label_over_time[i:]).
= ['#F5B7B1','#D2B4DE','#AED6F1','#A2D9CE','#ABEBC6','#F9E79F','#F5CBA7','#CCD1D1']
color_list
= plt.subplots(figsize=(30, 15))
f, ax for i in range(len(label_cols)):
=months, y=label_sum_over_time[i], color = color_list[i%8], alpha=0.7)
sns.barplot(x
0,0),1,1,fc=color_list[i%8], edgecolor = 'none') for i in range(len(label_cols))], label_cols, loc=1, ncol = 2, prop={'size':16}) plt.legend([plt.Rectangle((
<matplotlib.legend.Legend at 0x7fb5f6211550>
# 코드 1-13. 신규 구매 누적 막대 그래프를 상대값으로 시각화하기
= (label_sum_over_time / (1.*np.asarray(label_sum_over_time).max(axis=0))) * 100
label_sum_percent
= plt.subplots(figsize=(30, 15))
f, ax for i in range(len(label_cols)):
=months, y=label_sum_percent[i], color = color_list[i%8], alpha=0.7)
sns.barplot(x
0,0),1,1,fc=color_list[i%8], edgecolor = 'none') for i in range(len(label_cols))], \
plt.legend([plt.Rectangle((=1, ncol = 2, prop={'size':16}) label_cols, loc
<matplotlib.legend.Legend at 0x7fb5f682a898>