import numpy as np
import pandas as pd
import sklearn.linear_model
해당 자료는 전북대학교 최규빈 교수님 2023학년도 2학기 빅데이터분석특강 자료임
03wk-011: Medical Cost, 회귀분석
최규빈
2023-09-21
1. 강의영상
https://youtu.be/playlist?list=PLQqh36zP38-zwUq3ZIN2SNas0l8htamMO&si=dH2sszdMPMFTGeEV
2. Import
3. Data 불러오기
-
캐글에서 Medical Cost Personal Datasets
download
-
Data Load
df = pd.read_csv('insurance.csv')
df
age | sex | bmi | children | smoker | region | charges | |
---|---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 |
1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 |
2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 |
3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 |
4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 |
... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 |
1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 |
1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 |
1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 |
1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 |
1338 rows × 7 columns
4. 분석
A. Data 정리
df.columns
Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')
X = pd.get_dummies(df.drop(['charges'],axis=1))
y = df[['charges']]
X
age | bmi | children | sex_female | sex_male | smoker_no | smoker_yes | region_northeast | region_northwest | region_southeast | region_southwest | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 19 | 27.900 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
1 | 18 | 33.770 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
2 | 28 | 33.000 | 3 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
3 | 33 | 22.705 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
4 | 32 | 28.880 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | 30.970 | 3 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 |
1334 | 18 | 31.920 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
1335 | 18 | 36.850 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1336 | 21 | 25.800 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 |
1337 | 61 | 29.070 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
1338 rows × 11 columns
y
charges | |
---|---|
0 | 16884.92400 |
1 | 1725.55230 |
2 | 4449.46200 |
3 | 21984.47061 |
4 | 3866.85520 |
... | ... |
1333 | 10600.54830 |
1334 | 2205.98080 |
1335 | 1629.83350 |
1336 | 2007.94500 |
1337 | 29141.36030 |
1338 rows × 1 columns
B. Predictor 생성
predictr = sklearn.linear_model.LinearRegression()
C. 학습
predictr.fit(X,y)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
D. 예측
df.assign(yhat = predictr.predict(X))
age | sex | bmi | children | smoker | region | charges | yhat | |
---|---|---|---|---|---|---|---|---|
0 | 19 | female | 27.900 | 0 | yes | southwest | 16884.92400 | 25293.713028 |
1 | 18 | male | 33.770 | 1 | no | southeast | 1725.55230 | 3448.602834 |
2 | 28 | male | 33.000 | 3 | no | southeast | 4449.46200 | 6706.988491 |
3 | 33 | male | 22.705 | 0 | no | northwest | 21984.47061 | 3754.830163 |
4 | 32 | male | 28.880 | 0 | no | northwest | 3866.85520 | 5592.493386 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
1333 | 50 | male | 30.970 | 3 | no | northwest | 10600.54830 | 12351.323686 |
1334 | 18 | female | 31.920 | 0 | no | northeast | 2205.98080 | 3511.930809 |
1335 | 18 | female | 36.850 | 0 | no | southeast | 1629.83350 | 4149.132486 |
1336 | 21 | female | 25.800 | 0 | no | southwest | 2007.94500 | 1246.584939 |
1337 | 61 | female | 29.070 | 0 | yes | northwest | 29141.36030 | 37085.623268 |
1338 rows × 8 columns
E. 평가
predictr.score(X,y) # R^2
0.7509130345985207
0.7 이상이면 망한모형까지는 아님 (대회용으로는 부적절할 수 있으나 대충 쓸 수는 있는 정도)
5. 계수해석
-
상수항 해석
predictr.intercept_
array([-666.93771994])
- 기본적인 보험료는 -666이라는 의미
-
계수해석
pd.DataFrame({
    'name':list(X.columns),
    'coef':predictr.coef_.reshape(-1)
})
name | coef | |
---|---|---|
0 | age | 256.856353 |
1 | bmi | 339.193454 |
2 | children | 475.500545 |
3 | sex_female | 65.657180 |
4 | sex_male | -65.657180 |
5 | smoker_no | -11924.267271 |
6 | smoker_yes | 11924.267271 |
7 | region_northeast | 587.009235 |
8 | region_northwest | 234.045336 |
9 | region_southeast | -448.012814 |
10 | region_southwest | -373.041756 |
지역은 잘 모르겠으나 나머지는 꽤 그럴듯해 보임
나이가 1살 많아지면 보험료가 약 256만큼 증가
여성이면 65만큼 더 증가