import
library (MASS) #lm.ridge
library (car) #vif
library (caret) #예측
library (ggplot2)
library (glmnet) #Ridge, Lasso
library (tidyverse)
Loading required package: carData
Loading required package: ggplot2
Loading required package: lattice
Loading required package: Matrix
Loaded glmnet 4.1-7
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
✔ tibble 3.2.0 ✔ dplyr 1.1.0
✔ tidyr 1.3.0 ✔ stringr 1.5.0
✔ readr 2.1.4 ✔ forcats 1.0.0
✔ purrr 1.0.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::expand() masks Matrix::expand()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ purrr::lift() masks caret::lift()
✖ tidyr::pack() masks Matrix::pack()
✖ dplyr::recode() masks car::recode()
✖ dplyr::select() masks MASS::select()
✖ purrr::some() masks car::some()
✖ tidyr::unpack() masks Matrix::unpack()
get_high_vif_variables <- function (data, threshold) {
vif_values <- vif (data)
high_vif_variables <- names (vif_values[vif_values > threshold])
return (high_vif_variables)
}
데이터셋
picher <- read.csv ("~/Dropbox/coco/posts/Applied statistics/picher_stats_2017.csv" )
head (picher)
A data.frame: 6 × 22
<chr>
<chr>
<int>
<int>
<int>
<int>
<int>
<int>
<int>
<dbl>
⋯
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<int>
<int>
1
켈리
SK
16
7
0
0
0
30
30
190.0
⋯
0.76
0.342
73.7
3.60
6.91
3.69
3.44
6.62
140000
85000
2
소사
LG
11
11
1
0
0
30
29
185.1
⋯
0.53
0.319
67.1
3.88
6.80
3.52
3.41
6.08
120000
50000
3
양현종
KIA
20
6
0
0
0
31
31
193.1
⋯
0.79
0.332
72.1
3.44
6.54
3.94
3.82
5.64
230000
150000
4
차우찬
LG
10
7
0
0
0
28
28
175.2
⋯
1.02
0.298
75.0
3.43
6.11
4.20
4.03
4.63
100000
100000
5
레일리
롯데
13
7
0
0
0
30
30
187.1
⋯
0.91
0.323
74.1
3.80
6.13
4.36
4.31
4.38
111000
85000
6
피어밴드
KT
8
10
0
0
0
26
26
160.0
⋯
1.12
0.289
76.1
3.04
6.52
4.42
4.32
3.94
85000
35000
dt2018 <- data.frame (new_col = picher$ 연봉.2018. )
dt2017 <- data.frame (new_col = picher$ 연봉.2017. )
dt <- subset (picher, select = - c (연봉.2017. ,연봉.2018. ))
A data.frame: 6 × 20
<chr>
<chr>
<int>
<int>
<int>
<int>
<int>
<int>
<int>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
1
켈리
SK
16
7
0
0
0
30
30
190.0
8.95
2.13
0.76
0.342
73.7
3.60
6.91
3.69
3.44
6.62
2
소사
LG
11
11
1
0
0
30
29
185.1
7.43
1.85
0.53
0.319
67.1
3.88
6.80
3.52
3.41
6.08
3
양현종
KIA
20
6
0
0
0
31
31
193.1
7.36
2.09
0.79
0.332
72.1
3.44
6.54
3.94
3.82
5.64
4
차우찬
LG
10
7
0
0
0
28
28
175.2
8.04
1.95
1.02
0.298
75.0
3.43
6.11
4.20
4.03
4.63
5
레일리
롯데
13
7
0
0
0
30
30
187.1
7.49
2.11
0.91
0.323
74.1
3.80
6.13
4.36
4.31
4.38
6
피어밴드
KT
8
10
0
0
0
26
26
160.0
7.42
1.74
1.12
0.289
76.1
3.04
6.52
4.42
4.32
3.94
dt <- cbind (dt, new_col = dt2017)
names (dt)[length (names (dt))] <- "연봉.2017."
dt <- cbind (dt, new_col = dt2018)
names (dt)[length (names (dt))] <- "연봉.2018."
A data.frame: 6 × 22
<chr>
<chr>
<int>
<int>
<int>
<int>
<int>
<int>
<int>
<dbl>
⋯
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<int>
<int>
1
켈리
SK
16
7
0
0
0
30
30
190.0
⋯
0.76
0.342
73.7
3.60
6.91
3.69
3.44
6.62
85000
140000
2
소사
LG
11
11
1
0
0
30
29
185.1
⋯
0.53
0.319
67.1
3.88
6.80
3.52
3.41
6.08
50000
120000
3
양현종
KIA
20
6
0
0
0
31
31
193.1
⋯
0.79
0.332
72.1
3.44
6.54
3.94
3.82
5.64
150000
230000
4
차우찬
LG
10
7
0
0
0
28
28
175.2
⋯
1.02
0.298
75.0
3.43
6.11
4.20
4.03
4.63
100000
100000
5
레일리
롯데
13
7
0
0
0
30
30
187.1
⋯
0.91
0.323
74.1
3.80
6.13
4.36
4.31
4.38
85000
111000
6
피어밴드
KT
8
10
0
0
0
26
26
160.0
⋯
1.12
0.289
76.1
3.04
6.52
4.42
4.32
3.94
35000
85000
dt <- subset (dt, select = - c (팀명,선수명))
A data.frame: 6 × 20
<int>
<int>
<int>
<int>
<int>
<int>
<int>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<int>
<int>
1
16
7
0
0
0
30
30
190.0
8.95
2.13
0.76
0.342
73.7
3.60
6.91
3.69
3.44
6.62
85000
140000
2
11
11
1
0
0
30
29
185.1
7.43
1.85
0.53
0.319
67.1
3.88
6.80
3.52
3.41
6.08
50000
120000
3
20
6
0
0
0
31
31
193.1
7.36
2.09
0.79
0.332
72.1
3.44
6.54
3.94
3.82
5.64
150000
230000
4
10
7
0
0
0
28
28
175.2
8.04
1.95
1.02
0.298
75.0
3.43
6.11
4.20
4.03
4.63
100000
100000
5
13
7
0
0
0
30
30
187.1
7.49
2.11
0.91
0.323
74.1
3.80
6.13
4.36
4.31
4.38
85000
111000
6
8
10
0
0
0
26
26
160.0
7.42
1.74
1.12
0.289
76.1
3.04
6.52
4.42
4.32
3.94
35000
85000
회귀직선적합
model1(원본)
model1 <- lm (연봉.2018. ~ ., dt)
summary (model1)
Call:
lm(formula = 연봉.2018. ~ ., data = dt)
Residuals:
Min 1Q Median 3Q Max
-46529 -2418 424 2649 47773
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.513e+04 1.826e+04 0.829 0.4087
승 1.004e+03 5.375e+02 1.869 0.0639 .
패 -1.836e+02 5.504e+02 -0.334 0.7392
세 -2.112e+01 2.713e+02 -0.078 0.9381
홀드 -1.817e+01 3.161e+02 -0.057 0.9542
블론 4.535e+02 7.610e+02 0.596 0.5522
경기 -1.760e+02 1.456e+02 -1.209 0.2289
선발 -6.719e+02 4.616e+02 -1.456 0.1479
이닝 7.425e+01 1.156e+02 0.642 0.5217
삼진.9 -4.603e+02 2.349e+03 -0.196 0.8449
볼넷.9 1.194e+03 2.256e+03 0.529 0.5976
홈런.9 4.874e+03 1.413e+04 0.345 0.7306
BABIP -9.997e+03 1.486e+04 -0.673 0.5022
LOB. -4.350e+01 1.299e+02 -0.335 0.7382
ERA -7.413e+01 5.693e+02 -0.130 0.8966
RA9.WAR -7.584e+02 1.487e+03 -0.510 0.6109
FIP -6.436e+03 4.477e+04 -0.144 0.8859
kFIP 3.805e+03 3.593e+04 0.106 0.9158
WAR 8.559e+03 1.789e+03 4.783 4.55e-06 ***
연봉.2017. 8.755e-01 4.444e-02 19.698 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9198 on 132 degrees of freedom
Multiple R-squared: 0.9228, Adjusted R-squared: 0.9116
F-statistic: 82.99 on 19 and 132 DF, p-value: < 2.2e-16
model1 <- lm (연봉.2018. ~ + WAR+ 연봉.2017. , dt)
summary (model1)
Call:
lm(formula = 연봉.2018. ~ +WAR + 연봉.2017., data = dt)
Residuals:
Min 1Q Median 3Q Max
-50442 -1849 758 2050 56166
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -576.58811 889.09610 -0.649 0.518
WAR 7007.17364 761.83979 9.198 3.03e-16 ***
연봉.2017. 0.89926 0.04022 22.360 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9124 on 149 degrees of freedom
Multiple R-squared: 0.9142, Adjusted R-squared: 0.913
F-statistic: 793.8 on 2 and 149 DF, p-value: < 2.2e-16
WAR 1.84059685810784 연봉.2017. 1.84059685810784
high_vif_vars <- get_high_vif_variables (model1, threshold)
print (high_vif_vars)
high_vif_vars <- get_high_vif_variables (model1, threshold)
print (high_vif_vars)
pairs (dt,panel= panel.smooth)
dt_numeric <- dt[, sapply (dt, is.numeric)]
cor_matrix <- cor (dt_numeric)
print (round (cor_matrix,2 ))
승 패 세 홀드 블론 경기 선발 이닝 삼진.9 볼넷.9 홈런.9
승 1.00 0.71 0.05 0.09 0.11 0.40 0.77 0.91 0.08 -0.40 -0.12
패 0.71 1.00 0.07 0.10 0.12 0.34 0.77 0.83 0.03 -0.39 -0.06
세 0.05 0.07 1.00 0.11 0.61 0.43 -0.18 0.02 0.17 -0.13 -0.07
홀드 0.09 0.10 0.11 1.00 0.49 0.72 -0.29 0.02 0.19 -0.15 -0.08
블론 0.11 0.12 0.61 0.49 1.00 0.63 -0.26 0.01 0.19 -0.14 -0.06
경기 0.40 0.34 0.43 0.72 0.63 1.00 -0.04 0.38 0.19 -0.36 -0.11
선발 0.77 0.77 -0.18 -0.29 -0.26 -0.04 1.00 0.89 -0.06 -0.31 -0.06
이닝 0.91 0.83 0.02 0.02 0.01 0.38 0.89 1.00 0.04 -0.45 -0.11
삼진.9 0.08 0.03 0.17 0.19 0.19 0.19 -0.06 0.04 1.00 0.11 0.22
볼넷.9 -0.40 -0.39 -0.13 -0.15 -0.14 -0.36 -0.31 -0.45 0.11 1.00 0.30
홈런.9 -0.12 -0.06 -0.07 -0.08 -0.06 -0.11 -0.06 -0.11 0.22 0.30 1.00
BABIP -0.17 -0.13 -0.09 -0.10 -0.11 -0.24 -0.10 -0.19 0.46 0.28 0.36
LOB. 0.13 -0.02 0.17 0.05 0.10 0.11 0.04 0.10 -0.07 -0.15 -0.27
ERA -0.27 -0.19 -0.15 -0.16 -0.16 -0.32 -0.16 -0.29 0.26 0.52 0.63
RA9.WAR 0.85 0.60 0.17 0.00 0.01 0.28 0.74 0.85 0.10 -0.40 -0.19
FIP -0.30 -0.23 -0.20 -0.21 -0.21 -0.35 -0.15 -0.30 -0.15 0.63 0.83
kFIP -0.31 -0.24 -0.23 -0.24 -0.24 -0.37 -0.14 -0.30 -0.32 0.61 0.74
WAR 0.82 0.63 0.08 -0.04 -0.06 0.20 0.76 0.83 0.15 -0.39 -0.21
연봉.2017. 0.63 0.43 0.26 0.00 0.15 0.23 0.49 0.59 0.10 -0.33 -0.10
연봉.2018. 0.71 0.47 0.21 -0.02 0.10 0.21 0.56 0.66 0.10 -0.33 -0.12
BABIP LOB. ERA RA9.WAR FIP kFIP WAR 연봉.2017. 연봉.2018.
승 -0.17 0.13 -0.27 0.85 -0.30 -0.31 0.82 0.63 0.71
패 -0.13 -0.02 -0.19 0.60 -0.23 -0.24 0.63 0.43 0.47
세 -0.09 0.17 -0.15 0.17 -0.20 -0.23 0.08 0.26 0.21
홀드 -0.10 0.05 -0.16 0.00 -0.21 -0.24 -0.04 0.00 -0.02
블론 -0.11 0.10 -0.16 0.01 -0.21 -0.24 -0.06 0.15 0.10
경기 -0.24 0.11 -0.32 0.28 -0.35 -0.37 0.20 0.23 0.21
선발 -0.10 0.04 -0.16 0.74 -0.15 -0.14 0.76 0.49 0.56
이닝 -0.19 0.10 -0.29 0.85 -0.30 -0.30 0.83 0.59 0.66
삼진.9 0.46 -0.07 0.26 0.10 -0.15 -0.32 0.15 0.10 0.10
볼넷.9 0.28 -0.15 0.52 -0.40 0.63 0.61 -0.39 -0.33 -0.33
홈런.9 0.36 -0.27 0.63 -0.19 0.83 0.74 -0.21 -0.10 -0.12
BABIP 1.00 -0.51 0.73 -0.19 0.25 0.17 -0.08 -0.09 -0.10
LOB. -0.51 1.00 -0.72 0.29 -0.29 -0.27 0.14 0.11 0.13
ERA 0.73 -0.72 1.00 -0.34 0.65 0.58 -0.26 -0.20 -0.22
RA9.WAR -0.19 0.29 -0.34 1.00 -0.37 -0.38 0.92 0.64 0.74
FIP 0.25 -0.29 0.65 -0.37 1.00 0.98 -0.39 -0.27 -0.28
kFIP 0.17 -0.27 0.58 -0.38 0.98 1.00 -0.41 -0.28 -0.30
WAR -0.08 0.14 -0.26 0.92 -0.39 -0.41 1.00 0.68 0.79
연봉.2017. -0.09 0.11 -0.20 0.64 -0.27 -0.28 0.68 1.00 0.93
연봉.2018. -0.10 0.13 -0.22 0.74 -0.28 -0.30 0.79 0.93 1.00
다중공산성 해결 방법
VIF계수가 높은 변수 제거
model2(Vif 10 이상인 변수 제거)
model2 <- lm (연봉.2018. ~ .- 경기- 선발- 이닝- 삼진.9 - 볼넷.9 - 홈런.9 - ERA- RA9.WAR- FIP- kFIP, dt)
summary (model2)
Call:
lm(formula = 연봉.2018. ~ . - 경기 - 선발 - 이닝 - 삼진.9 -
볼넷.9 - 홈런.9 - ERA - RA9.WAR - FIP - kFIP, data = dt)
Residuals:
Min 1Q Median 3Q Max
-48657 -1981 511 2303 51073
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.432e+03 7.893e+03 0.815 0.4165
승 4.770e+02 4.061e+02 1.175 0.2421
패 -7.851e+02 3.525e+02 -2.227 0.0275 *
세 -1.172e+02 2.150e+02 -0.545 0.5865
홀드 -1.229e+02 1.973e+02 -0.623 0.5344
블론 6.340e+02 7.188e+02 0.882 0.3792
BABIP -7.810e+03 9.994e+03 -0.781 0.4358
LOB. -4.979e+01 7.793e+01 -0.639 0.5239
WAR 7.298e+03 1.169e+03 6.243 4.67e-09 ***
연봉.2017. 8.846e-01 4.322e-02 20.469 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9149 on 142 degrees of freedom
Multiple R-squared: 0.9178, Adjusted R-squared: 0.9126
F-statistic: 176.1 on 9 and 142 DF, p-value: < 2.2e-16
승 4.44133840452701 패 2.19787784118271 세 1.9291576101908 홀드 1.43155944990414 블론 2.48814143828698 BABIP 1.42670331782564 LOB. 1.47225296358887 WAR 4.30937396392511 연봉.2017. 2.11357639082776
model1에서 다중공산성이 높았던 변수들을 제외하고 lm을 돌렸더니, 회귀모형은 유의하게 나왔고 R^2값도 91%로 높게 나왔지만 model1보다는 R^2값이 조금 적게 나왔다.
다중공산성이 높은 변수를 제외하는 것은 다른 것들도 확인을 해보아야 한다.
VIF제거시 고려사항
VIF계수가 높은 피처 우선 제거하되, FIP, kFIP와 같이 유사한 변수들은 두개 중에서 하나만 제거해보자.
model3 <- lm (연봉.2018. ~ .- FIP, dt)
summary (model3)
Call:
lm(formula = 연봉.2018. ~ . - FIP, data = dt)
Residuals:
Min 1Q Median 3Q Max
-46688 -2466 423 2597 47710
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.406e+04 1.660e+04 0.847 0.399
승 1.007e+03 5.352e+02 1.882 0.062 .
패 -1.723e+02 5.427e+02 -0.317 0.751
세 -2.263e+01 2.701e+02 -0.084 0.933
홀드 -1.779e+01 3.149e+02 -0.056 0.955
블론 4.563e+02 7.579e+02 0.602 0.548
경기 -1.738e+02 1.443e+02 -1.205 0.230
선발 -6.701e+02 4.598e+02 -1.458 0.147
이닝 7.216e+01 1.142e+02 0.632 0.529
삼진.9 -7.714e+02 9.085e+02 -0.849 0.397
볼넷.9 8.998e+02 9.504e+02 0.947 0.346
홈런.9 2.904e+03 3.404e+03 0.853 0.395
BABIP -9.797e+03 1.474e+04 -0.665 0.507
LOB. -4.465e+01 1.292e+02 -0.346 0.730
ERA -8.076e+01 5.654e+02 -0.143 0.887
RA9.WAR -7.473e+02 1.480e+03 -0.505 0.614
kFIP -1.347e+03 2.371e+03 -0.568 0.571
WAR 8.560e+03 1.783e+03 4.802 4.17e-06 ***
연봉.2017. 8.757e-01 4.426e-02 19.787 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9164 on 133 degrees of freedom
Multiple R-squared: 0.9227, Adjusted R-squared: 0.9123
F-statistic: 88.25 on 18 and 133 DF, p-value: < 2.2e-16
승 7.68840921316788 패 5.19140274673014 세 3.03263975401923 홀드 3.63578951116975 블론 2.75775305712261 경기 14.0426530671348 선발 36.1331990754777 이닝 59.3709458069269 삼진.9 11.8666574529657 볼넷.9 9.0682604275144 홈런.9 21.5595297493918 BABIP 3.09205217740503 LOB. 4.03056643053091 ERA 9.9781711774582 RA9.WAR 13.3837520395074 kFIP 39.6977412189025 WAR 9.99134587181084 연봉.2017. 2.20945320867407
VIF계수가 가장 높았떤 FIP를 제거하니 전체적으로 VIF값들이 많이 감소했다. 볼넷의 경우 50에서 9로 감소함
model3 <- lm (연봉.2018. ~ .- FIP- 이닝, dt)
summary (model3)
vif (model3)
Call:
lm(formula = 연봉.2018. ~ . - FIP - 이닝, data = dt)
Residuals:
Min 1Q Median 3Q Max
-47170 -2539 292 2603 47529
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.425e+04 1.656e+04 0.860 0.3912
승 1.053e+03 5.292e+02 1.989 0.0487 *
패 -1.258e+02 5.365e+02 -0.234 0.8150
세 -7.264e+01 2.576e+02 -0.282 0.7784
홀드 -7.025e+01 3.031e+02 -0.232 0.8171
블론 4.745e+02 7.557e+02 0.628 0.5312
경기 -1.021e+02 8.877e+01 -1.150 0.2523
선발 -4.306e+02 2.595e+02 -1.659 0.0994 .
삼진.9 -7.892e+02 9.060e+02 -0.871 0.3853
볼넷.9 8.829e+02 9.479e+02 0.931 0.3533
홈런.9 2.956e+03 3.396e+03 0.871 0.3855
BABIP -1.004e+04 1.470e+04 -0.683 0.4957
LOB. -4.506e+01 1.289e+02 -0.350 0.7272
ERA -6.838e+01 5.637e+02 -0.121 0.9036
RA9.WAR -4.551e+02 1.402e+03 -0.325 0.7460
kFIP -1.349e+03 2.366e+03 -0.570 0.5696
WAR 8.733e+03 1.758e+03 4.968 2.03e-06 ***
연봉.2017. 8.784e-01 4.395e-02 19.984 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9143 on 134 degrees of freedom
Multiple R-squared: 0.9225, Adjusted R-squared: 0.9127
F-statistic: 93.84 on 17 and 134 DF, p-value: < 2.2e-16
승 7.54995904402493 패 5.09588128549648 세 2.77188079102657 홀드 3.38288542173628 블론 2.75379743632109 경기 5.34096225137479 선발 11.5652288817222 삼진.9 11.8553164987874 볼넷.9 9.06115155090058 홈런.9 21.5464901061541 BABIP 3.0899457520436 LOB. 4.03046844648891 ERA 9.96618149145026 RA9.WAR 12.0759322801392 kFIP 39.6977185280435 WAR 9.75717291579296 연봉.2017. 2.18906326524158
그 다음 vif계수값이 높은 ’이닝’을 제거했다.
model3 <- lm (연봉.2018. ~ .- FIP- 이닝- kFIP, dt)
summary (model3)
vif (model3)
Call:
lm(formula = 연봉.2018. ~ . - FIP - 이닝 - kFIP, data = dt)
Residuals:
Min 1Q Median 3Q Max
-47261 -2379 309 2742 47813
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.978e+03 1.054e+04 0.662 0.5090
승 1.055e+03 5.278e+02 1.999 0.0476 *
패 -1.135e+02 5.347e+02 -0.212 0.8323
세 -7.382e+01 2.569e+02 -0.287 0.7743
홀드 -7.661e+01 3.021e+02 -0.254 0.8002
블론 5.038e+02 7.521e+02 0.670 0.5040
경기 -9.923e+01 8.841e+01 -1.122 0.2637
선발 -4.402e+02 2.583e+02 -1.704 0.0906 .
삼진.9 -3.109e+02 3.413e+02 -0.911 0.3639
볼넷.9 4.082e+02 4.514e+02 0.904 0.3675
홈런.9 1.129e+03 1.118e+03 1.010 0.3143
BABIP -9.576e+03 1.464e+04 -0.654 0.5141
LOB. -3.779e+01 1.279e+02 -0.295 0.7681
ERA -8.963e+01 5.611e+02 -0.160 0.8733
RA9.WAR -4.669e+02 1.399e+03 -0.334 0.7390
WAR 8.800e+03 1.749e+03 5.030 1.53e-06 ***
연봉.2017. 8.779e-01 4.384e-02 20.027 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9120 on 135 degrees of freedom
Multiple R-squared: 0.9223, Adjusted R-squared: 0.9131
F-statistic: 100.2 on 16 and 135 DF, p-value: < 2.2e-16
승 7.54945354936935 패 5.08758327243023 세 2.77170233465349 홀드 3.37829494203411 블론 2.74099514800148 경기 5.32417495720081 선발 11.5165551404735 삼진.9 1.691074402967 볼넷.9 2.06526070907551 홈런.9 2.34713984614222 BABIP 3.08046557773165 LOB. 3.99101474806593 ERA 9.92261602645989 RA9.WAR 12.0732896169765 WAR 9.71340804685137 연봉.2017. 2.18834997395971
model3 <- lm (연봉.2018. ~ .- FIP- 이닝- kFIP- RA9.WAR, dt)
summary (model3)
vif (model3)
Call:
lm(formula = 연봉.2018. ~ . - FIP - 이닝 - kFIP - RA9.WAR,
data = dt)
Residuals:
Min 1Q Median 3Q Max
-47256 -2340 228 2820 48394
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.002e+03 1.005e+04 0.796 0.4273
승 1.005e+03 5.044e+02 1.993 0.0483 *
패 -6.188e+01 5.102e+02 -0.121 0.9036
세 -1.005e+02 2.434e+02 -0.413 0.6805
홀드 -8.969e+01 2.986e+02 -0.300 0.7643
블론 5.293e+02 7.457e+02 0.710 0.4790
경기 -1.027e+02 8.749e+01 -1.174 0.2424
선발 -4.671e+02 2.447e+02 -1.909 0.0584 .
삼진.9 -3.052e+02 3.398e+02 -0.898 0.3707
볼넷.9 4.218e+02 4.481e+02 0.941 0.3482
홈런.9 1.154e+03 1.112e+03 1.037 0.3013
BABIP -9.059e+03 1.451e+04 -0.624 0.5334
LOB. -5.310e+01 1.190e+02 -0.446 0.6562
ERA -1.249e+02 5.493e+02 -0.227 0.8205
WAR 8.406e+03 1.289e+03 6.523 1.25e-09 ***
연봉.2017. 8.790e-01 4.358e-02 20.168 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9090 on 136 degrees of freedom
Multiple R-squared: 0.9223, Adjusted R-squared: 0.9137
F-statistic: 107.6 on 15 and 136 DF, p-value: < 2.2e-16
승 6.94042688100102 패 4.66294914095306 세 2.50425253772256 홀드 3.32146243586634 블론 2.71278267026247 경기 5.24904766467952 선발 10.3991198289636 삼진.9 1.68672132907842 볼넷.9 2.04837389574494 홈런.9 2.33709850623971 BABIP 3.046073880605 LOB. 3.47761526760098 ERA 9.57142193166993 WAR 5.30623103354661 연봉.2017. 2.17720905024618
model3 <- lm (연봉.2018. ~ .- FIP- 이닝- kFIP- RA9.WAR- 선발, dt)
summary (model3)
vif (model3)
Call:
lm(formula = 연봉.2018. ~ . - FIP - 이닝 - kFIP - RA9.WAR -
선발, data = dt)
Residuals:
Min 1Q Median 3Q Max
-46776 -2395 374 2597 50018
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7221.2085 10138.7800 0.712 0.4775
승 558.2422 451.0986 1.238 0.2180
패 -758.5773 359.9930 -2.107 0.0369 *
세 -12.1963 241.3026 -0.051 0.9598
홀드 106.5326 283.0186 0.376 0.7072
블론 843.4665 734.3349 1.149 0.2527
경기 -69.6278 86.5803 -0.804 0.4227
삼진.9 -270.1732 342.5480 -0.789 0.4316
볼넷.9 431.0859 452.3679 0.953 0.3423
홈런.9 983.4449 1119.0004 0.879 0.3810
BABIP -8863.3423 14648.1787 -0.605 0.5461
LOB. -57.5239 120.1320 -0.479 0.6328
ERA -88.2397 554.2171 -0.159 0.8737
WAR 7825.6419 1264.4051 6.189 6.57e-09 ***
연봉.2017. 0.8792 0.0440 19.981 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9178 on 137 degrees of freedom
Multiple R-squared: 0.9202, Adjusted R-squared: 0.912
F-statistic: 112.8 on 14 and 137 DF, p-value: < 2.2e-16
승 5.44563629770577 패 2.27744620091112 세 2.41391273581397 홀드 2.9278580003899 블론 2.58069224802437 경기 5.04287223750904 삼진.9 1.68181308100372 볼넷.9 2.04813245831198 홈런.9 2.32207970272157 BABIP 3.04592151858475 LOB. 3.47629848428834 ERA 9.55974332977644 WAR 5.01053061622992 연봉.2017. 2.17719632600132
유의미한 변수는 ’WAR’과 ’연봉(2017)’이다.
정규화
normalize
normalize <- function (x) {
return ((x - mean (x)) / sd (x))
}
df_normalized <- as.data.frame (lapply (dt, normalize))
A data.frame: 6 × 20
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
1
3.313623
1.2271453
-0.3064519
-0.5857052
-0.5435919
0.05943348
2.452068
2.645175
0.6720988
-0.8689998
-0.44238194
0.01678276
0.4466146
-0.5870557
3.174630
-0.9710297
-1.0581252
4.503142
2.7347053
3.912893
2
2.019505
2.5047212
-0.0985024
-0.5857052
-0.5435919
0.05943348
2.349505
2.547755
0.1345315
-0.9875023
-0.66852133
-0.24168646
-0.1227637
-0.5198553
3.114968
-1.0618879
-1.0732645
4.094734
1.3373033
3.266495
3
4.348918
0.9077513
-0.3064519
-0.5857052
-0.5435919
0.11105570
2.554632
2.706808
0.1097751
-0.8859287
-0.41288550
-0.09559517
0.3085835
-0.6254559
2.973948
-0.8374147
-0.8663606
3.761956
5.3298806
6.821679
4
1.760682
1.2271453
-0.3064519
-0.5857052
-0.5435919
-0.04381097
2.246942
2.350927
0.3502657
-0.9451800
-0.18674611
-0.47768010
0.5587649
-0.6278559
2.740722
-0.6984550
-0.7603854
2.998081
3.3335919
2.620098
5
2.537153
1.2271453
-0.3064519
-0.5857052
-0.5435919
0.05943348
2.452068
2.587518
0.1557512
-0.8774643
-0.29489973
-0.19673529
0.4811224
-0.5390554
2.751570
-0.6129414
-0.6190851
2.809003
2.7347053
2.975617
6
1.243035
2.1853272
-0.3064519
-0.5857052
-0.5435919
-0.14705541
2.041816
2.048726
0.1309948
-1.0340569
-0.08842464
-0.57882022
0.6536613
-0.7214564
2.963100
-0.5808738
-0.6140386
2.476226
0.7384167
2.135301
model4 <- lm (연봉.2018. ~ ., df_normalized)
summary (model4)
Call:
lm(formula = 연봉.2018. ~ ., data = df_normalized)
Residuals:
Min 1Q Median 3Q Max
-1.50382 -0.07816 0.01372 0.08561 1.54402
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -5.061e-16 2.411e-02 0.000 1.0000
승 1.254e-01 6.712e-02 1.869 0.0639 .
패 -1.858e-02 5.569e-02 -0.334 0.7392
세 -3.282e-03 4.216e-02 -0.078 0.9381
홀드 -2.652e-03 4.613e-02 -0.057 0.9542
블론 2.395e-02 4.019e-02 0.596 0.5522
경기 -1.102e-01 9.116e-02 -1.209 0.2289
선발 -2.117e-01 1.455e-01 -1.456 0.1479
이닝 1.207e-01 1.879e-01 0.642 0.5217
삼진.9 -4.207e-02 2.146e-01 -0.196 0.8449
볼넷.9 9.115e-02 1.723e-01 0.529 0.5976
홈런.9 1.602e-01 4.643e-01 0.345 0.7306
BABIP -2.875e-02 4.273e-02 -0.673 0.5022
LOB. -1.630e-02 4.866e-02 -0.335 0.7382
ERA -9.982e-03 7.667e-02 -0.130 0.8966
RA9.WAR -4.519e-02 8.862e-02 -0.510 0.6109
FIP -3.892e-01 2.707e+00 -0.144 0.8859
kFIP 2.437e-01 2.301e+00 0.106 0.9158
WAR 3.657e-01 7.647e-02 4.783 4.55e-06 ***
연봉.2017. 7.087e-01 3.598e-02 19.698 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2973 on 132 degrees of freedom
Multiple R-squared: 0.9228, Adjusted R-squared: 0.9116
F-statistic: 82.99 on 19 and 132 DF, p-value: < 2.2e-16
high_vif_vars <- ㅍget_high_vif_variables (model4, threshold)
print (high_vif_vars)
[1] "경기" "선발" "이닝" "삼진.9" "볼넷.9" "홈런.9" "ERA"
[8] "RA9.WAR" "FIP" "kFIP"
승 7.6993466963604 패 5.30032799820294 세 3.03718565967898 홀드 3.63605161357941 블론 2.75956034802382 경기 14.2011271199764 선발 36.160187873294 이닝 60.3244538179135 삼진.9 78.7161704890434 볼넷.9 50.7257985014939 홈런.9 368.399308167005 BABIP 3.11936893312485 LOB. 4.04602533224442 ERA 10.044173823258 RA9.WAR 13.4198973514472 FIP 12525.2424058262 kFIP 9046.04880481494 WAR 9.99177856815799 연봉.2017. 2.2118694256429
변수선택
model3(AIC)
-
AIC(Step)
m0 = lm (연봉.2018. ~ 1 , data = dt)
model3 = step (
m0,
scope = 연봉.2018. ~ 연봉.2017. + 승+ 패+ 세+ 홀드+ 블론+ 경기+ 선발+ 이닝+ 삼진.9 + 볼넷.9 + 홈런.9 + BABIP+ LOB.+ ERA+ RA9.WAR+ FIP+ kFIP+ WAR,
direction = "both" )
Start: AIC=3144.3
연봉.2018. ~ 1
Df Sum of Sq RSS AIC
+ 연봉.2017. 1 1.2511e+11 1.9445e+10 2841.4
+ WAR 1 9.0535e+10 5.4022e+10 2996.7
+ RA9.WAR 1 7.9230e+10 6.5326e+10 3025.6
+ 승 1 7.3377e+10 7.1179e+10 3038.6
+ 이닝 1 6.2759e+10 8.1797e+10 3059.8
+ 선발 1 4.5409e+10 9.9147e+10 3089.0
+ 패 1 3.1910e+10 1.1265e+11 3108.4
+ 볼넷.9 1 1.5661e+10 1.2890e+11 3128.9
+ kFIP 1 1.2591e+10 1.3197e+11 3132.4
+ FIP 1 1.1403e+10 1.3315e+11 3133.8
+ ERA 1 6.7332e+09 1.3782e+11 3139.1
+ 세 1 6.4461e+09 1.3811e+11 3139.4
+ 경기 1 6.3714e+09 1.3819e+11 3139.4
+ LOB. 1 2.2831e+09 1.4227e+11 3143.9
+ 홈런.9 1 1.9575e+09 1.4260e+11 3144.2
<none> 1.4456e+11 3144.3
+ 삼진.9 1 1.5567e+09 1.4300e+11 3144.7
+ BABIP 1 1.5139e+09 1.4304e+11 3144.7
+ 블론 1 1.3815e+09 1.4318e+11 3144.8
+ 홀드 1 4.3499e+07 1.4451e+11 3146.3
Step: AIC=2841.38
연봉.2018. ~ 연봉.2017.
Df Sum of Sq RSS AIC
+ WAR 1 7.0421e+09 1.2403e+10 2775.0
+ RA9.WAR 1 4.9589e+09 1.4486e+10 2798.6
+ 승 1 3.8414e+09 1.5604e+10 2809.9
+ 이닝 1 2.8118e+09 1.6633e+10 2819.6
+ 선발 1 2.1318e+09 1.7313e+10 2825.7
+ 패 1 8.8114e+08 1.8564e+10 2836.3
<none> 1.9445e+10 2841.4
+ 블론 1 2.2022e+08 1.9225e+10 2841.7
+ 세 1 1.7105e+08 1.9274e+10 2842.0
+ kFIP 1 1.6254e+08 1.9283e+10 2842.1
+ FIP 1 1.5483e+08 1.9290e+10 2842.2
+ ERA 1 1.0735e+08 1.9338e+10 2842.5
+ LOB. 1 7.7049e+07 1.9368e+10 2842.8
+ 홈런.9 1 7.3957e+07 1.9371e+10 2842.8
+ 볼넷.9 1 6.4565e+07 1.9381e+10 2842.9
+ BABIP 1 5.6938e+07 1.9388e+10 2842.9
+ 홀드 1 3.8024e+07 1.9407e+10 2843.1
+ 삼진.9 1 5.5081e+06 1.9440e+10 2843.3
+ 경기 1 1.2651e+04 1.9445e+10 2843.4
- 연봉.2017. 1 1.2511e+11 1.4456e+11 3144.3
Step: AIC=2775.03
연봉.2018. ~ 연봉.2017. + WAR
Df Sum of Sq RSS AIC
+ 패 1 2.1336e+08 1.2190e+10 2774.4
+ kFIP 1 1.8769e+08 1.2215e+10 2774.7
+ 선발 1 1.7153e+08 1.2232e+10 2774.9
+ FIP 1 1.6877e+08 1.2234e+10 2774.9
+ 볼넷.9 1 1.6419e+08 1.2239e+10 2775.0
<none> 1.2403e+10 2775.0
+ 이닝 1 1.4704e+08 1.2256e+10 2775.2
+ 홈런.9 1 5.1612e+07 1.2351e+10 2776.4
+ 삼진.9 1 4.8349e+07 1.2355e+10 2776.4
+ 승 1 3.0076e+07 1.2373e+10 2776.7
+ 경기 1 2.7246e+07 1.2376e+10 2776.7
+ BABIP 1 2.4182e+07 1.2379e+10 2776.7
+ ERA 1 1.7077e+07 1.2386e+10 2776.8
+ 블론 1 1.1153e+07 1.2392e+10 2776.9
+ RA9.WAR 1 6.6509e+06 1.2396e+10 2776.9
+ 세 1 4.3325e+06 1.2399e+10 2777.0
+ 홀드 1 3.4824e+06 1.2400e+10 2777.0
+ LOB. 1 6.6018e+05 1.2402e+10 2777.0
- WAR 1 7.0421e+09 1.9445e+10 2841.4
- 연봉.2017. 1 4.1619e+10 5.4022e+10 2996.7
Step: AIC=2774.4
연봉.2018. ~ 연봉.2017. + WAR + 패
Df Sum of Sq RSS AIC
+ kFIP 1 1.9738e+08 1.1992e+10 2773.9
+ 승 1 1.8072e+08 1.2009e+10 2774.1
+ FIP 1 1.7496e+08 1.2015e+10 2774.2
<none> 1.2190e+10 2774.4
- 패 1 2.1336e+08 1.2403e+10 2775.0
+ 볼넷.9 1 1.0330e+08 1.2086e+10 2775.1
+ 홈런.9 1 7.1015e+07 1.2119e+10 2775.5
+ 삼진.9 1 6.6895e+07 1.2123e+10 2775.6
+ 블론 1 4.2173e+07 1.2148e+10 2775.9
+ BABIP 1 4.1954e+07 1.2148e+10 2775.9
+ 선발 1 3.1474e+07 1.2158e+10 2776.0
+ ERA 1 1.3441e+07 1.2176e+10 2776.2
+ 이닝 1 5.8966e+06 1.2184e+10 2776.3
+ 세 1 3.4705e+06 1.2186e+10 2776.3
+ RA9.WAR 1 2.4143e+06 1.2187e+10 2776.4
+ LOB. 1 1.7129e+06 1.2188e+10 2776.4
+ 경기 1 1.1252e+06 1.2189e+10 2776.4
+ 홀드 1 1.8992e+05 1.2190e+10 2776.4
- WAR 1 6.3743e+09 1.8564e+10 2836.3
- 연봉.2017. 1 4.1680e+10 5.3870e+10 2998.3
Step: AIC=2773.92
연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP
Df Sum of Sq RSS AIC
+ 승 1 1.6741e+08 1.1825e+10 2773.8
<none> 1.1992e+10 2773.9
+ 블론 1 1.2836e+08 1.1864e+10 2774.3
- kFIP 1 1.9738e+08 1.2190e+10 2774.4
+ 선발 1 1.1764e+08 1.1875e+10 2774.4
- 패 1 2.2305e+08 1.2215e+10 2774.7
+ BABIP 1 7.5190e+07 1.1917e+10 2775.0
+ ERA 1 2.1818e+07 1.1971e+10 2775.6
+ 홀드 1 2.1404e+07 1.1971e+10 2775.6
+ 삼진.9 1 1.9275e+07 1.1973e+10 2775.7
+ 경기 1 1.7028e+07 1.1975e+10 2775.7
+ 이닝 1 1.3041e+07 1.1979e+10 2775.8
+ FIP 1 9.3610e+06 1.1983e+10 2775.8
+ 볼넷.9 1 8.8432e+06 1.1983e+10 2775.8
+ 홈런.9 1 8.7223e+06 1.1984e+10 2775.8
+ LOB. 1 4.0316e+06 1.1988e+10 2775.9
+ RA9.WAR 1 2.0131e+06 1.1990e+10 2775.9
+ 세 1 1.4454e+06 1.1991e+10 2775.9
- WAR 1 6.4941e+09 1.8486e+10 2837.7
- 연봉.2017. 1 4.1735e+10 5.3727e+10 2999.9
Step: AIC=2773.78
연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP + 승
Df Sum of Sq RSS AIC
+ 이닝 1 2.1565e+08 1.1609e+10 2773.0
+ 선발 1 1.9668e+08 1.1628e+10 2773.2
<none> 1.1825e+10 2773.8
- 승 1 1.6741e+08 1.1992e+10 2773.9
- kFIP 1 1.8408e+08 1.2009e+10 2774.1
+ 블론 1 8.3012e+07 1.1742e+10 2774.7
+ RA9.WAR 1 6.3182e+07 1.1762e+10 2775.0
+ BABIP 1 4.5875e+07 1.1779e+10 2775.2
+ 볼넷.9 1 1.7921e+07 1.1807e+10 2775.6
+ 삼진.9 1 1.4564e+07 1.1810e+10 2775.6
+ 홈런.9 1 1.2160e+07 1.1813e+10 2775.6
+ ERA 1 8.8026e+06 1.1816e+10 2775.7
+ FIP 1 8.1221e+06 1.1817e+10 2775.7
+ 세 1 5.8214e+06 1.1819e+10 2775.7
+ 홀드 1 5.2671e+06 1.1820e+10 2775.7
+ LOB. 1 3.9758e+05 1.1825e+10 2775.8
+ 경기 1 3.3176e+05 1.1825e+10 2775.8
- 패 1 3.6648e+08 1.2191e+10 2776.4
- WAR 1 3.6353e+09 1.5460e+10 2812.5
- 연봉.2017. 1 3.9188e+10 5.1013e+10 2994.0
Step: AIC=2772.98
연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP + 승 + 이닝
Df Sum of Sq RSS AIC
- 패 1 3.1923e+07 1.1641e+10 2771.4
<none> 1.1609e+10 2773.0
- kFIP 1 2.1496e+08 1.1824e+10 2773.8
- 이닝 1 2.1565e+08 1.1825e+10 2773.8
+ BABIP 1 8.7592e+07 1.1522e+10 2773.8
+ 선발 1 5.0414e+07 1.1559e+10 2774.3
+ 블론 1 3.9472e+07 1.1570e+10 2774.5
+ 삼진.9 1 3.3863e+07 1.1575e+10 2774.5
+ ERA 1 3.3525e+07 1.1576e+10 2774.5
+ FIP 1 1.8310e+07 1.1591e+10 2774.7
+ 홈런.9 1 1.2031e+07 1.1597e+10 2774.8
+ RA9.WAR 1 1.0398e+07 1.1599e+10 2774.8
+ LOB. 1 3.1362e+06 1.1606e+10 2774.9
+ 경기 1 1.9500e+06 1.1607e+10 2775.0
+ 볼넷.9 1 1.2880e+06 1.1608e+10 2775.0
+ 세 1 2.2726e+05 1.1609e+10 2775.0
+ 홀드 1 9.3003e+04 1.1609e+10 2775.0
- 승 1 3.7002e+08 1.1979e+10 2775.8
- WAR 1 3.7546e+09 1.5364e+10 2813.6
- 연봉.2017. 1 3.8723e+10 5.0333e+10 2993.9
Step: AIC=2771.4
연봉.2018. ~ 연봉.2017. + WAR + kFIP + 승 + 이닝
Df Sum of Sq RSS AIC
<none> 1.1641e+10 2771.4
+ BABIP 1 9.5915e+07 1.1545e+10 2772.1
- kFIP 1 2.2291e+08 1.1864e+10 2772.3
+ 선발 1 6.0820e+07 1.1580e+10 2772.6
+ ERA 1 4.1760e+07 1.1599e+10 2772.8
+ 삼진.9 1 3.6788e+07 1.1604e+10 2772.9
+ 패 1 3.1923e+07 1.1609e+10 2773.0
+ 블론 1 2.3680e+07 1.1618e+10 2773.1
+ FIP 1 2.0239e+07 1.1621e+10 2773.1
+ 홈런.9 1 1.4166e+07 1.1627e+10 2773.2
+ LOB. 1 7.7349e+06 1.1633e+10 2773.3
+ 경기 1 1.5351e+06 1.1640e+10 2773.4
+ RA9.WAR 1 1.4350e+06 1.1640e+10 2773.4
+ 볼넷.9 1 1.4095e+06 1.1640e+10 2773.4
+ 홀드 1 2.5525e+05 1.1641e+10 2773.4
+ 세 1 6.8181e+04 1.1641e+10 2773.4
- 승 1 4.0011e+08 1.2041e+10 2774.5
- 이닝 1 5.5021e+08 1.2191e+10 2776.4
- WAR 1 3.9604e+09 1.5602e+10 2813.9
- 연봉.2017. 1 3.8795e+10 5.0436e+10 2992.2
summary
Call:
lm(formula = 연봉.2018. ~ 연봉.2017. + WAR + kFIP + 승 +
이닝, data = dt)
Residuals:
Min 1Q Median 3Q Max
-48717 -2879 204 3083 48961
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.691e+03 2.658e+03 -1.012 0.31310
연봉.2017. 8.862e-01 4.018e-02 22.058 < 2e-16 ***
WAR 8.118e+03 1.152e+03 7.048 6.68e-11 ***
kFIP 6.737e+02 4.029e+02 1.672 0.09666 .
승 1.059e+03 4.727e+02 2.240 0.02659 *
이닝 -9.701e+01 3.693e+01 -2.627 0.00954 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8929 on 146 degrees of freedom
Multiple R-squared: 0.9195, Adjusted R-squared: 0.9167
F-statistic: 333.4 on 5 and 146 DF, p-value: < 2.2e-16
AIC를 이용하면 최종 모형은 “연봉.2018. ~ 연봉.2017. + WAR + kFIP+승+이닝” 이다.
연봉.2017. 1.91752518192932 WAR 4.3927072113068 kFIP 1.20722030237251 승 6.3165168650018 이닝 6.53436057823665
연봉.2018. ~ 연봉.2017. + WAR + kFIP + 승 + 이닝
후진
model_back = step (model1, direction = "backward" )
summary (model_back)
Start: AIC=2775.03
연봉.2018. ~ +WAR + 연봉.2017.
Df Sum of Sq RSS AIC
<none> 1.2403e+10 2775.0
- WAR 1 7.0421e+09 1.9445e+10 2841.4
- 연봉.2017. 1 4.1619e+10 5.4022e+10 2996.7
Call:
lm(formula = 연봉.2018. ~ +WAR + 연봉.2017., data = dt)
Residuals:
Min 1Q Median 3Q Max
-50442 -1849 758 2050 56166
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -576.58811 889.09610 -0.649 0.518
WAR 7007.17364 761.83979 9.198 3.03e-16 ***
연봉.2017. 0.89926 0.04022 22.360 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9124 on 149 degrees of freedom
Multiple R-squared: 0.9142, Adjusted R-squared: 0.913
F-statistic: 793.8 on 2 and 149 DF, p-value: < 2.2e-16
WAR 1.84059685810784 연봉.2017. 1.84059685810784
연봉.2018. ~ 승 + 경기 + 선발 + WAR + 연봉.2017.
전진
m0 = lm (연봉.2018. ~ 1 , data = dt)
model_forward = step (
m0,
scope = 연봉.2018. ~ 연봉.2017. + 승+ 패+ 세+ 홀드+ 블론+ 경기+ 선발+ 이닝+ 삼진.9 + 볼넷.9 + 홈런.9 + BABIP+ LOB.+ ERA+ RA9.WAR+ FIP+ kFIP+ WAR,
direction = "forward" )
summary (model_forward)
Start: AIC=3144.3
연봉.2018. ~ 1
Df Sum of Sq RSS AIC
+ 연봉.2017. 1 1.2511e+11 1.9445e+10 2841.4
+ WAR 1 9.0535e+10 5.4022e+10 2996.7
+ RA9.WAR 1 7.9230e+10 6.5326e+10 3025.6
+ 승 1 7.3377e+10 7.1179e+10 3038.6
+ 이닝 1 6.2759e+10 8.1797e+10 3059.8
+ 선발 1 4.5409e+10 9.9147e+10 3089.0
+ 패 1 3.1910e+10 1.1265e+11 3108.4
+ 볼넷.9 1 1.5661e+10 1.2890e+11 3128.9
+ kFIP 1 1.2591e+10 1.3197e+11 3132.4
+ FIP 1 1.1403e+10 1.3315e+11 3133.8
+ ERA 1 6.7332e+09 1.3782e+11 3139.1
+ 세 1 6.4461e+09 1.3811e+11 3139.4
+ 경기 1 6.3714e+09 1.3819e+11 3139.4
+ LOB. 1 2.2831e+09 1.4227e+11 3143.9
+ 홈런.9 1 1.9575e+09 1.4260e+11 3144.2
<none> 1.4456e+11 3144.3
+ 삼진.9 1 1.5567e+09 1.4300e+11 3144.7
+ BABIP 1 1.5139e+09 1.4304e+11 3144.7
+ 블론 1 1.3815e+09 1.4318e+11 3144.8
+ 홀드 1 4.3499e+07 1.4451e+11 3146.3
Step: AIC=2841.38
연봉.2018. ~ 연봉.2017.
Df Sum of Sq RSS AIC
+ WAR 1 7042094427 1.2403e+10 2775.0
+ RA9.WAR 1 4958914952 1.4486e+10 2798.6
+ 승 1 3841387936 1.5604e+10 2809.9
+ 이닝 1 2811807174 1.6633e+10 2819.6
+ 선발 1 2131826098 1.7313e+10 2825.7
+ 패 1 881138122 1.8564e+10 2836.3
<none> 1.9445e+10 2841.4
+ 블론 1 220224080 1.9225e+10 2841.7
+ 세 1 171052899 1.9274e+10 2842.0
+ kFIP 1 162536872 1.9283e+10 2842.1
+ FIP 1 154825743 1.9290e+10 2842.2
+ ERA 1 107350094 1.9338e+10 2842.5
+ LOB. 1 77049296 1.9368e+10 2842.8
+ 홈런.9 1 73957140 1.9371e+10 2842.8
+ 볼넷.9 1 64564811 1.9381e+10 2842.9
+ BABIP 1 56938420 1.9388e+10 2842.9
+ 홀드 1 38023685 1.9407e+10 2843.1
+ 삼진.9 1 5508109 1.9440e+10 2843.3
+ 경기 1 12651 1.9445e+10 2843.4
Step: AIC=2775.03
연봉.2018. ~ 연봉.2017. + WAR
Df Sum of Sq RSS AIC
+ 패 1 213356827 1.2190e+10 2774.4
+ kFIP 1 187694356 1.2215e+10 2774.7
+ 선발 1 171531569 1.2232e+10 2774.9
+ FIP 1 168772833 1.2234e+10 2774.9
+ 볼넷.9 1 164189202 1.2239e+10 2775.0
<none> 1.2403e+10 2775.0
+ 이닝 1 147039192 1.2256e+10 2775.2
+ 홈런.9 1 51612430 1.2351e+10 2776.4
+ 삼진.9 1 48348966 1.2355e+10 2776.4
+ 승 1 30075743 1.2373e+10 2776.7
+ 경기 1 27245510 1.2376e+10 2776.7
+ BABIP 1 24181791 1.2379e+10 2776.7
+ ERA 1 17077047 1.2386e+10 2776.8
+ 블론 1 11153112 1.2392e+10 2776.9
+ RA9.WAR 1 6650871 1.2396e+10 2776.9
+ 세 1 4332494 1.2399e+10 2777.0
+ 홀드 1 3482363 1.2400e+10 2777.0
+ LOB. 1 660176 1.2402e+10 2777.0
Step: AIC=2774.4
연봉.2018. ~ 연봉.2017. + WAR + 패
Df Sum of Sq RSS AIC
+ kFIP 1 197383620 1.1992e+10 2773.9
+ 승 1 180715640 1.2009e+10 2774.1
+ FIP 1 174958135 1.2015e+10 2774.2
<none> 1.2190e+10 2774.4
+ 볼넷.9 1 103300993 1.2086e+10 2775.1
+ 홈런.9 1 71014626 1.2119e+10 2775.5
+ 삼진.9 1 66895356 1.2123e+10 2775.6
+ 블론 1 42172679 1.2148e+10 2775.9
+ BABIP 1 41953578 1.2148e+10 2775.9
+ 선발 1 31473684 1.2158e+10 2776.0
+ ERA 1 13441234 1.2176e+10 2776.2
+ 이닝 1 5896647 1.2184e+10 2776.3
+ 세 1 3470456 1.2186e+10 2776.3
+ RA9.WAR 1 2414340 1.2187e+10 2776.4
+ LOB. 1 1712854 1.2188e+10 2776.4
+ 경기 1 1125166 1.2189e+10 2776.4
+ 홀드 1 189917 1.2190e+10 2776.4
Step: AIC=2773.92
연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP
Df Sum of Sq RSS AIC
+ 승 1 167413120 1.1825e+10 2773.8
<none> 1.1992e+10 2773.9
+ 블론 1 128359041 1.1864e+10 2774.3
+ 선발 1 117641927 1.1875e+10 2774.4
+ BABIP 1 75190355 1.1917e+10 2775.0
+ ERA 1 21818455 1.1971e+10 2775.6
+ 홀드 1 21403854 1.1971e+10 2775.6
+ 삼진.9 1 19275489 1.1973e+10 2775.7
+ 경기 1 17028183 1.1975e+10 2775.7
+ 이닝 1 13040981 1.1979e+10 2775.8
+ FIP 1 9361041 1.1983e+10 2775.8
+ 볼넷.9 1 8843181 1.1983e+10 2775.8
+ 홈런.9 1 8722328 1.1984e+10 2775.8
+ LOB. 1 4031616 1.1988e+10 2775.9
+ RA9.WAR 1 2013063 1.1990e+10 2775.9
+ 세 1 1445393 1.1991e+10 2775.9
Step: AIC=2773.78
연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP + 승
Df Sum of Sq RSS AIC
+ 이닝 1 215650124 1.1609e+10 2773.0
+ 선발 1 196677432 1.1628e+10 2773.2
<none> 1.1825e+10 2773.8
+ 블론 1 83011867 1.1742e+10 2774.7
+ RA9.WAR 1 63182313 1.1762e+10 2775.0
+ BABIP 1 45874866 1.1779e+10 2775.2
+ 볼넷.9 1 17920561 1.1807e+10 2775.6
+ 삼진.9 1 14563944 1.1810e+10 2775.6
+ 홈런.9 1 12160231 1.1813e+10 2775.6
+ ERA 1 8802604 1.1816e+10 2775.7
+ FIP 1 8122100 1.1817e+10 2775.7
+ 세 1 5821352 1.1819e+10 2775.7
+ 홀드 1 5267064 1.1820e+10 2775.7
+ LOB. 1 397579 1.1825e+10 2775.8
+ 경기 1 331762 1.1825e+10 2775.8
Step: AIC=2772.98
연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP + 승 + 이닝
Df Sum of Sq RSS AIC
<none> 1.1609e+10 2773.0
+ BABIP 1 87591503 1.1522e+10 2773.8
+ 선발 1 50413708 1.1559e+10 2774.3
+ 블론 1 39472232 1.1570e+10 2774.5
+ 삼진.9 1 33863019 1.1575e+10 2774.5
+ ERA 1 33524887 1.1576e+10 2774.5
+ FIP 1 18310110 1.1591e+10 2774.7
+ 홈런.9 1 12031455 1.1597e+10 2774.8
+ RA9.WAR 1 10397930 1.1599e+10 2774.8
+ LOB. 1 3136209 1.1606e+10 2774.9
+ 경기 1 1950014 1.1607e+10 2775.0
+ 볼넷.9 1 1288038 1.1608e+10 2775.0
+ 세 1 227255 1.1609e+10 2775.0
+ 홀드 1 93003 1.1609e+10 2775.0
Call:
lm(formula = 연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP +
승 + 이닝, data = dt)
Residuals:
Min 1Q Median 3Q Max
-48378 -2526 133 2563 48361
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.653e+03 2.664e+03 -0.996 0.3209
연봉.2017. 8.856e-01 4.027e-02 21.992 < 2e-16 ***
WAR 8.003e+03 1.169e+03 6.848 1.97e-10 ***
패 -2.708e+02 4.288e+02 -0.631 0.5287
kFIP 6.622e+02 4.042e+02 1.639 0.1035
승 1.025e+03 4.767e+02 2.150 0.0332 *
이닝 -7.811e+01 4.760e+01 -1.641 0.1029
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8948 on 145 degrees of freedom
Multiple R-squared: 0.9197, Adjusted R-squared: 0.9164
F-statistic: 276.8 on 6 and 145 DF, p-value: < 2.2e-16
연봉.2017. 1.9185269678088 WAR 4.50275401281816 패 3.39937428417761 kFIP 1.20965408072733 승 6.3982769107728 이닝 10.8086888355271
연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP + 승 + 이닝
PCA(주성분분석)
서로 상관성이 높은 변수들의 선형 결합으로 만들어 기존의 상관성이 높은 변수들을 요약, 축소하는 기법
procomp.result2 <- prcomp (dt2, center= T, scale= T)
summary (procomp.result2)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7
Standard deviation 2.6109 1.8528 1.5587 1.27735 1.07673 0.94635 0.77437
Proportion of Variance 0.3588 0.1807 0.1279 0.08588 0.06102 0.04714 0.03156
Cumulative Proportion 0.3588 0.5395 0.6673 0.75322 0.81424 0.86137 0.89293
PC8 PC9 PC10 PC11 PC12 PC13 PC14
Standard deviation 0.75305 0.60013 0.57118 0.51280 0.43707 0.31874 0.28648
Proportion of Variance 0.02985 0.01896 0.01717 0.01384 0.01005 0.00535 0.00432
Cumulative Proportion 0.92278 0.94173 0.95890 0.97274 0.98280 0.98815 0.99247
PC15 PC16 PC17 PC18 PC19
Standard deviation 0.27007 0.21161 0.1236 0.10061 0.006742
Proportion of Variance 0.00384 0.00236 0.0008 0.00053 0.000000
Cumulative Proportion 0.99630 0.99866 0.9995 1.00000 1.000000
dt.pca <- princomp (dt2, cor= TRUE )
Importance of components:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5
Standard deviation 2.6109027 1.8528422 1.5587351 1.2773530 1.07672785
Proportion of Variance 0.3587796 0.1806855 0.1278766 0.0858753 0.06101805
Cumulative Proportion 0.3587796 0.5394651 0.6673417 0.7532170 0.81423504
Comp.6 Comp.7 Comp.8 Comp.9 Comp.10
Standard deviation 0.94635475 0.77436931 0.75305028 0.60012648 0.57117946
Proportion of Variance 0.04713617 0.03156041 0.02984656 0.01895536 0.01717084
Cumulative Proportion 0.86137122 0.89293163 0.92277819 0.94173355 0.95890439
Comp.11 Comp.12 Comp.13 Comp.14 Comp.15
Standard deviation 0.51280325 0.4370697 0.31874103 0.286476368 0.270069813
Proportion of Variance 0.01384038 0.0100542 0.00534715 0.004319406 0.003838827
Cumulative Proportion 0.97274477 0.9827990 0.98814612 0.992465529 0.996304355
Comp.16 Comp.17 Comp.18 Comp.19
Standard deviation 0.211606675 0.1235839263 0.1006053051 6.741848e-03
Proportion of Variance 0.002356704 0.0008038414 0.0005327067 2.392237e-06
Cumulative Proportion 0.998661060 0.9994649011 0.9999976078 1.000000e+00
제 1주성분과 제6주성분까지의 누적 분산비율은 대략 85.71%로 6개의 주성분 변수를 활용해 전체 데이터의 85.71%를 설명할 수 있다.
screeplot (dt.pca, npcs= 8 , type= "lines" )
Loadings:
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
승 0.322 0.201 0.105
패 0.272 0.198 0.108 0.236 -0.108 -0.477 -0.294
세 -0.202 0.289 0.126 -0.512 -0.426 -0.192 0.360
홀드 -0.253 0.336 0.141 0.417 0.343 0.154 0.296
블론 -0.274 0.396 0.204 -0.208 0.116 -0.183 -0.354
경기 0.191 -0.220 0.372 0.222 0.227 0.236
선발 0.261 0.350 -0.123 -0.250 -0.105
이닝 0.329 0.235 0.109 -0.134
삼진.9 0.393 -0.404 -0.229 0.455 -0.171 -0.204 -0.211
볼넷.9 -0.250 0.116 -0.236 0.289 0.752 -0.178
홈런.9 -0.173 0.286 0.276 0.268 0.154 -0.535
BABIP -0.142 0.207 0.292 -0.444 0.146
LOB. 0.131 -0.209 -0.230 0.228 -0.416 0.495 -0.174
ERA -0.235 0.294 0.288 -0.156 -0.133 0.106 0.107
RA9.WAR 0.327 0.176 -0.181 0.124 0.119 0.391
FIP -0.259 0.284 0.378
kFIP -0.257 0.268 0.423
WAR 0.318 0.213 -0.126 0.113 0.165 0.235
연봉.2017. 0.252 0.130 0.103 -0.296 -0.181 0.629 -0.535
Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
승 0.383 0.237 0.542 0.550 0.102
패 -0.478 -0.137 0.321 -0.270 -0.229
세 -0.134 -0.328 0.242 0.206 0.105
홀드 0.128 -0.339 -0.335 0.264 -0.113 0.240
블론 0.363 0.513 -0.251 -0.150 -0.121
경기 -0.183 0.560 -0.322 -0.240
선발 0.154 -0.463 0.328 0.249
이닝 0.213 -0.370 0.151
삼진.9 -0.380 0.376
볼넷.9 -0.105 0.128 -0.359
홈런.9 -0.614
BABIP 0.665 -0.171 0.296 -0.149 0.187
LOB. 0.414 -0.155 0.112 -0.328 0.216
ERA -0.141 0.336 -0.206 -0.663 0.296
RA9.WAR 0.123 -0.243 -0.247 -0.163 -0.683
FIP 0.323
kFIP 0.492
WAR -0.398 -0.499 0.305 0.473
연봉.2017. -0.229 0.174
Comp.18 Comp.19
승
패
세
홀드
블론
경기 0.318
선발 0.553
이닝 -0.758
삼진.9
볼넷.9
홈런.9 -0.125
BABIP
LOB.
ERA
RA9.WAR
FIP 0.754
kFIP -0.641
WAR
연봉.2017.
Comp.1 Comp.2 Comp.3 Comp.4 Comp.5 Comp.6 Comp.7 Comp.8 Comp.9
SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
Proportion Var 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053
Cumulative Var 0.053 0.105 0.158 0.211 0.263 0.316 0.368 0.421 0.474
Comp.10 Comp.11 Comp.12 Comp.13 Comp.14 Comp.15 Comp.16 Comp.17
SS loadings 1.000 1.000 1.000 1.000 1.000 1.000 1.000 1.000
Proportion Var 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053
Cumulative Var 0.526 0.579 0.632 0.684 0.737 0.789 0.842 0.895
Comp.18 Comp.19
SS loadings 1.000 1.000
Proportion Var 0.053 0.053
Cumulative Var 0.947 1.000
능형회귀
rfit <- lm.ridge (연봉.2018. ~ ., dt, lambda= seq (0.01 ,20 ,0.1 ))
ERROR: Error in UseMethod("select"): no applicable method for 'select' applied to an object of class "ridgelm"
round (rfit$ coef[,rfit$ lam== '4.11' ],3 )
승 3659.317 패 -737.614 세 -93.401 홀드 -155.777 블론 859.526 경기 -1932.839 선발 -3234.453 이닝 441.579 삼진.9 -934.786 볼넷.9 1047.402 홈런.9 1328.778 BABIP -755.31 LOB. -578.691 ERA -288.368 RA9.WAR 516.127 FIP -176.744 kFIP -420.71 WAR 9904.312 연봉.2017. 21113.076
matplot (rfit$ lambda, t (rfit$ coef), type= 'l' ,
xlab= expression (lambda),
ylab= expression (bold (beta)(lambda)), lwd= 2 )
abline (h= 0 , col= "grey" , lty= 2 )
abline (v= 14.91 , col= "black" , lty= 2 )
glm
X <- model.matrix (연봉.2018. ~ ., dt)[,- 1 ]
y <- dt$ 연봉.2018.
A matrix: 6 × 19 of type dbl
1
16
7
0
0
0
30
30
190.0
8.95
2.13
0.76
0.342
73.7
3.60
6.91
3.69
3.44
6.62
85000
2
11
11
1
0
0
30
29
185.1
7.43
1.85
0.53
0.319
67.1
3.88
6.80
3.52
3.41
6.08
50000
3
20
6
0
0
0
31
31
193.1
7.36
2.09
0.79
0.332
72.1
3.44
6.54
3.94
3.82
5.64
150000
4
10
7
0
0
0
28
28
175.2
8.04
1.95
1.02
0.298
75.0
3.43
6.11
4.20
4.03
4.63
100000
5
13
7
0
0
0
30
30
187.1
7.49
2.11
0.91
0.323
74.1
3.80
6.13
4.36
4.31
4.38
85000
6
8
10
0
0
0
26
26
160.0
7.42
1.74
1.12
0.289
76.1
3.04
6.52
4.42
4.32
3.94
35000
ridge.fit<- glmnet (X,y,alpha= 0 , lambda= seq (0 ,100 ,10 )) ##ridge : alpha=0
plot (ridge.fit, label= TRUE )
abline (h= 0 , col= "grey" , lty= 2 )
Length Class Mode
a0 11 -none- numeric
beta 209 dgCMatrix S4
df 11 -none- numeric
dim 2 -none- numeric
lambda 11 -none- numeric
dev.ratio 11 -none- numeric
nulldev 1 -none- numeric
npasses 1 -none- numeric
jerr 1 -none- numeric
offset 1 -none- logical
call 5 -none- call
nobs 1 -none- numeric
cv.fit<- cv.glmnet (X,y,alpha= 0 ,nfolds= length (y))
Warning message:
“Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold”
Call: cv.glmnet(x = X, y = y, nfolds = length(y), alpha = 0)
Measure: Mean-Squared Error
Lambda Index Measure SE Nonzero
min 2869 100 117171048 49876335 19
1se 12711 84 166795504 78214517 19
예측
WAR이라는 변수가 다른 설명변수의 곱으로 이루어진 변수니까, 인터넷에서 나오는 WAR계산법에 들어가는 변수들을 제거해보자.(근데 인터넷에 말이 다 다름 ㅎㅎ)
ERA, 이닝수, FIP는 일단 빼야함.
FIP자체가 홈런, 삼진, 볼넷 등의 값인데..
step, 전진: 연봉.2017.이랑 kFIP, 볼넷.9, 삼진.9를ㄴ넣어보자.
연봉.2018. ~ 연봉.2017. + WAR + kFIP + 승 + 이닝
후진: 연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP + 승 + 이닝
연봉.2018. ~ 연봉.2017. + WAR + 패 + kFIP + 승 + 이닝
X <- data.frame (FIP = dt$ "FIP" ,
WAR = dt$ "WAR" ,
` 볼넷/9 ` = dt$ "볼넷.9" ,
` 삼진/9 ` = dt$ "삼진.9" ,
` 연봉.2017. ` = dt$ "연봉.2017." )
set.seed (19 )
train_indices <- createDataPartition (y, p = 0.8 , list = FALSE )
X_train <- X[train_indices, ]
X_test <- X[- train_indices, ]
y_train <- y[train_indices]
y_test <- y[- train_indices]
model <- lm (y_train ~ ., data = cbind (X_train, y_train))
Call:
lm(formula = y_train ~ ., data = cbind(X_train, y_train))
Residuals:
Min 1Q Median 3Q Max
-48253 -2002 -534 3472 43845
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2312.85178 1754.63797 1.318 0.19003
승 1322.49243 533.83994 2.477 0.01467 *
WAR 8040.03263 1345.09686 5.977 2.52e-08 ***
경기 -148.06296 57.54624 -2.573 0.01133 *
선발 -522.51845 187.30514 -2.790 0.00616 **
연봉.2017. 0.88556 0.04671 18.957 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9453 on 117 degrees of freedom
Multiple R-squared: 0.9143, Adjusted R-squared: 0.9106
F-statistic: 249.6 on 5 and 117 DF, p-value: < 2.2e-16
ㅎ므
X <- data.frame ('승' = picher$ "승" ,
WAR = picher$ "WAR" ,
` 경기 ` = picher$ "경기" ,
` 선발 ` = picher$ "선발" ,
` 연봉.2017. ` = picher$ "연봉.2017." )
predict_2018_salary <- predict (model, newdata = X)
picher <- cbind (picher, new_col = predict_2018_salary)
A data.frame: 6 × 23
<chr>
<chr>
<int>
<int>
<int>
<int>
<int>
<int>
<int>
<dbl>
⋯
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<int>
<int>
<dbl>
1
켈리
SK
16
7
0
0
0
30
30
190.0
⋯
0.342
73.7
3.60
6.91
3.69
3.44
6.62
140000
85000
131853.25
2
소사
LG
11
11
1
0
0
30
29
185.1
⋯
0.319
67.1
3.88
6.80
3.52
3.41
6.08
120000
50000
90426.95
3
양현종
KIA
20
6
0
0
0
31
31
193.1
⋯
0.332
72.1
3.44
6.54
3.94
3.82
5.64
230000
150000
186155.07
4
차우찬
LG
10
7
0
0
0
28
28
175.2
⋯
0.298
75.0
3.43
6.11
4.20
4.03
4.63
100000
100000
122543.26
5
레일리
롯데
13
7
0
0
0
30
30
187.1
⋯
0.323
74.1
3.80
6.13
4.36
4.31
4.38
111000
85000
109876.10
6
피어밴드
KT
8
10
0
0
0
26
26
160.0
⋯
0.289
76.1
3.04
6.52
4.42
4.32
3.94
85000
35000
58130.15
predictsalart <- round (predict_2018_salary,0 )
result1 <- picher$ "선수명"
result2 <- picher$ "연봉.2018."
result3 <- predictsalart
result4 <- picher$ "연봉.2017."
result <- cbind (result1,result2,result3,result4)
A matrix: 152 × 4 of type chr
1
켈리
140000
131853
85000
2
소사
120000
90427
50000
3
양현종
230000
186155
150000
4
차우찬
100000
122543
100000
5
레일리
111000
109876
85000
6
피어밴드
85000
58130
35000
7
고영표
11500
32371
5200
8
장원준
100000
120891
100000
9
함덕주
16000
33083
7000
10
팻딘
70000
85876
70000
11
윤성환
80000
94613
80000
12
유희관
50000
64779
50000
13
임기영
13000
24736
3100
14
박세웅
25000
28684
10000
15
백정현
15500
27341
10000
16
송승준
40000
54034
40000
17
류제국
29000
44409
35000
18
우규민
70000
73705
70000
19
임찬규
11500
14822
6500
20
손승락
70000
71226
70000
21
정우람
120000
122776
120000
22
윤희상
13000
23102
15000
23
원종현
18500
22358
14000
24
배영수
50000
57019
55000
25
박종훈
20000
21139
10000
26
이상화
10000
13605
4500
27
김진성
23000
33080
18000
28
이민호
18800
23738
16000
29
이재학
19000
21005
20000
30
김강률
15000
16907
6200
⋮
⋮
⋮
⋮
⋮
123
정재원
4000
2817
4000
124
김민우
3600
3076
3800
125
이현호
5200
3064
6000
126
권혁
45000
36721
45000
127
배제성
3000
-295
2700
128
홍상삼
9000
10584
12500
129
이태양
7300
3544
8300
130
김진우
6000
7804
12000
131
이영하
4200
2615
2700
132
최성영
2900
2237
2900
133
김동호
6000
1649
5000
134
김윤동
15000
3978
4700
135
정인욱
5700
3413
7200
136
송창식
24000
17311
22000
137
배재환
3000
3039
4000
138
이정민
10000
14081
15000
139
최동환
6500
1837
6000
140
이종혁
3200
2970
2700
141
홍성용
6800
-360
6300
142
정영일
3000
2111
4000
143
김지용
9000
5556
10000
144
최금강
12500
5460
14000
145
김범수
3600
-2975
3300
146
이승현
7000
2469
6200
147
주권
7600
-176
7500
148
장민재
7100
853
8100
149
정용운
7500
-4363
3100
150
노경은
10000
9200
16000
151
김승현
4000
-6818
2900
152
류희운
4000
-8730
3000
k-fold
X <- dt[c ('승' , '경기' , '선발' , 'WAR' , '연봉.2017.' )]
y <- dt$ "연봉.2018."
ERROR: Error in dt[c("승", "경기", "선발", "WAR", "연봉.2017.")]: object of type 'closure' is not subsettable
ctrl <- trainControl (method = "cv" ,
number = 10 ,
verboseIter = TRUE )
Linear Regression
152 samples
5 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 136, 139, 137, 136, 137, 136, ...
Resampling results:
RMSE Rsquared MAE
8775.345 0.9234182 5096.303
Tuning parameter 'intercept' was held constant at a value of TRUE
createDataPartition
set.seed (20 )
train_indices <- createDataPartition (y, p = 0.8 , list = FALSE ) # 80%를 훈련 세트로 사용
X_train <- X[train_indices, ] # X 훈련 세트
y_train <- y[train_indices] # y 훈련 세트
X_test <- X[- train_indices, ] # X 테스트 세트
y_test <- y[- train_indices] # y 테스트 세트
model <- lm (y_train ~ ., data = cbind (X_train, y_train))
Call:
lm(formula = y_train ~ ., data = cbind(X_train, y_train))
Residuals:
Min 1Q Median 3Q Max
-51628 -1742 835 2679 54478
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.132e+03 3.752e+03 -0.835 0.406
FIP 3.656e+02 6.383e+02 0.573 0.568
WAR 6.521e+03 9.510e+02 6.857 3.53e-10 ***
볼넷.9 4.095e+02 4.922e+02 0.832 0.407
삼진.9 -2.664e+02 3.220e+02 -0.827 0.410
연봉.2017. 9.436e-01 4.885e-02 19.314 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 9407 on 117 degrees of freedom
Multiple R-squared: 0.9084, Adjusted R-squared: 0.9045
F-statistic: 232.1 on 5 and 117 DF, p-value: < 2.2e-16
y_pred <- predict (model, newdata = X_test)
2 83760.5036635398 11 92856.5551719829 16 49834.1350433709 20 74394.6037441324 21 121111.857311619 23 21354.3954407502 30 11877.9200631754 31 13693.1214770523 36 10957.9331638088 39 7904.67772835263 41 22618.8632801464 47 3977.22428640269 51 4759.88345127168 58 5598.23942115724 62 2384.85845607467 64 3520.11810139773 67 3352.59845508957 68 22782.5629218232 73 2753.45775461199 77 8019.61080626648 80 5122.41200499673 83 -3354.1873156885 90 -1479.51353608508 98 2904.84077333906 118 5349.37841885841 121 3656.12420217399 132 2282.13105827633 141 3578.68637161238 142 4774.13856910488
## Shapiro-Wilk Test
## H0 : normal distribution vs. H1 : not H0
shapiro.test (resid (model))
Shapiro-Wilk normality test
data: resid(model)
W = 0.80293, p-value = 1.485e-11
귀무가설 기각
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
### 등분산성
## H0 : 등분산 vs. H1 : 이분산 (Heteroscedasticity)
bptest (model)
studentized Breusch-Pagan test
data: model
BP = 43.35, df = 5, p-value = 3.138e-08
잔차 이분산..
studentized Breusch-Pagan test
data: model
BP = 43.35, df = 5, p-value = 3.138e-08
기각..
rmse <- sqrt (mean ((y_pred - y_test)^ 2 ))
rmse
8169.95343721918
r_squared <- 1 - sum ((y_test - y_pred)^ 2 ) / sum ((y_test - mean (y_test))^ 2 )
r_squared
0.937905268099376
y_pred <- predict (model, newdata = X)
y_pred
1 131853.251722752 2 90426.9473005226 3 186155.073818348 4 122543.256377936 5 109876.101347654 6 58130.1460693261 7 32370.6338078238 8 120891.419231451 9 33082.9090122538 10 85875.5646042552 11 94612.907258451 12 64779.2432182822 13 24736.432167844 14 28683.8050596197 15 27341.0434374789 16 54033.6089742136 17 44409.0690815691 18 73704.7935015169 19 14822.4597236171 20 71225.8489501107 21 122776.430033276 22 23102.4723102339 23 22358.4004975124 24 57019.0430915069 25 21138.91207732 26 13605.1026078606 27 33080.433064936 28 23738.2511333632 29 21005.3510614688 30 16907.230038373 31 18149.4647698284 32 8022.27721072489 33 10555.5431004635 34 22860.981530776 35 9305.40472215851 36 11392.9246005369 37 57338.2157721655 38 23904.7420284782 39 15323.2191074946 40 25799.8646455771 41 15797.1830796868 42 12386.7566462361 43 11268.4374599085 44 27181.6327954838 45 58218.5631642926 46 84.2491549617121 47 6997.48046210642 48 6623.06609223969 49 37524.5209172796 50 68252.8672643547 51 5526.75772775975 52 46011.0463688199 53 13988.4104382785 54 5669.59301443541 55 9266.84557614415 56 30415.6933317048 57 2427.28629533588 58 3523.97145639002 59 9345.91329228837 60 8522.14286154392 61 7117.61695272959 62 5826.76177364835 63 20575.8304166946 64 -3248.10803524208 65 1432.47361320697 66 2851.50820535714 67 4305.61668282123 68 27789.0082687676 69 9890.01282405876 70 1628.47835528225 71 20886.1978578739 72 7039.25714045121 73 4256.94270974835 74 10755.7289381351 75 8543.22556649082 76 6443.23553034538 77 6491.71113930571 78 8023.19002679344 79 1852.91236244955 80 5204.00279116998 81 4539.09996965536 82 10388.5037497285 83 4754.82557308093 84 4606.76260822984 85 1841.22291855011 86 2122.67759069216 87 4822.48821165541 88 3669.56302759059 89 5893.32120073109 90 4750.24396797544 91 5123.38648652916 92 2185.01324555074 93 3907.22819695754 94 4420.48658027234 95 6368.7275754839 96 4555.8118574213 97 4555.8118574213 98 4998.59390178756 99 1349.48963739276 100 4198.77252155423 101 2038.74145495265 102 3970.30923042652 103 4088.0680907676 104 4475.41153114468 105 5980.87048198998 106 4741.08075776444 107 3938.68971025506 108 4322.76696118809 109 2309.39200925058 110 3405.05185250049 111 16045.6585612658 112 1784.80278842067 113 7212.97988221046 114 710.581165599726 115 4161.96630863486 116 3844.94660863389 117 296.340101124663 118 3734.08957927484 119 3435.1499491986 120 1648.47038365735 121 3048.6997058917 122 36138.5026118377 123 2817.42271438994 124 3075.90264334206 125 3063.77546613714 126 36720.9932854539 127 -294.771437209275 128 10583.9599334096 129 3544.07628683963 130 7803.97039248833 131 2614.93126241198 132 2236.54036052891 133 1648.88795289679 134 3977.83231596103 135 3413.20328574532 136 17310.9499196651 137 3039.19328780306 138 14080.6714157531 139 1836.91714437448 140 2969.84408716327 141 -360.151744432489 142 2110.53166566634 143 5555.91285324796 144 5459.60230519531 145 -2975.13715540324 146 2468.83069055606 147 -176.047452043045 148 853.420331233892 149 -4363.31532189832 150 9199.85371406927 151 -6817.61675823917 152 -8729.68865531109
picher <- cbind (picher, new_data= y_pred)
sorted_picher <- picher[order (- picher$ "연봉.2018." ), ]
sorted_picher
A data.frame: 152 × 24
<chr>
<chr>
<int>
<int>
<int>
<int>
<int>
<int>
<int>
<dbl>
⋯
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<int>
<int>
<dbl>
<dbl>
3
양현종
KIA
20
6
0
0
0
31
31
193.1
⋯
72.1
3.44
6.54
3.94
3.82
5.64
230000
150000
186155.07
186155.07
1
켈리
SK
16
7
0
0
0
30
30
190.0
⋯
73.7
3.60
6.91
3.69
3.44
6.62
140000
85000
131853.25
131853.25
2
소사
LG
11
11
1
0
0
30
29
185.1
⋯
67.1
3.88
6.80
3.52
3.41
6.08
120000
50000
90426.95
90426.95
21
정우람
한화
6
4
26
0
5
56
0
59.0
⋯
79.4
2.75
2.85
3.26
2.69
1.81
120000
120000
122776.43
122776.43
5
레일리
롯데
13
7
0
0
0
30
30
187.1
⋯
74.1
3.80
6.13
4.36
4.31
4.38
111000
85000
109876.10
109876.10
4
차우찬
LG
10
7
0
0
0
28
28
175.2
⋯
75.0
3.43
6.11
4.20
4.03
4.63
100000
100000
122543.26
122543.26
8
장원준
두산
14
9
0
0
0
29
29
180.1
⋯
75.8
3.14
7.28
4.26
4.35
3.85
100000
100000
120891.42
120891.42
6
피어밴드
KT
8
10
0
0
0
26
26
160.0
⋯
76.1
3.04
6.52
4.42
4.32
3.94
85000
35000
58130.15
58130.15
11
윤성환
삼성
12
9
0
0
0
28
28
174.1
⋯
72.7
4.28
5.36
4.78
4.80
3.03
80000
80000
94612.91
94612.91
10
팻딘
KIA
9
7
0
0
0
30
29
176.0
⋯
76.5
4.14
5.66
4.65
4.61
3.64
70000
70000
85875.56
85875.56
18
우규민
삼성
7
10
0
0
0
27
25
133.0
⋯
60.6
5.21
1.48
4.95
4.98
2.14
70000
70000
73704.79
73704.79
20
손승락
롯데
1
3
37
0
5
61
0
62.0
⋯
89.9
2.18
3.91
3.69
3.37
1.82
70000
70000
71225.85
71225.85
45
이동현
LG
3
6
7
5
3
45
0
50.2
⋯
58.9
4.80
1.22
3.64
3.59
0.68
60000
60000
58218.56
58218.56
12
유희관
두산
11
6
0
1
0
30
29
188.2
⋯
69.7
4.53
4.79
4.78
4.97
2.89
50000
50000
64779.24
64779.24
24
배영수
한화
7
8
0
0
0
25
25
128.0
⋯
66.4
5.06
2.47
5.10
5.21
1.68
50000
55000
57019.04
57019.04
37
임창용
KIA
8
6
7
9
5
51
0
50.0
⋯
75.9
3.78
1.40
3.69
3.35
0.96
50000
50000
57338.22
57338.22
52
윤길현
롯데
1
4
0
13
2
40
0
39.1
⋯
60.5
6.41
-0.04
3.99
3.73
0.50
50000
50000
46011.05
46011.05
122
송은범
한화
0
4
1
0
1
13
6
37.1
⋯
62.7
6.51
0.10
6.33
6.79
-0.12
45000
45000
36138.50
36138.50
126
권혁
한화
1
3
0
11
1
37
0
31.1
⋯
67.3
6.32
0.04
6.48
6.78
-0.16
45000
45000
36720.99
36720.99
16
송승준
롯데
11
5
0
1
1
30
22
130.1
⋯
75.8
4.21
4.22
4.91
4.77
2.20
40000
40000
54033.61
54033.61
49
이현승
두산
3
2
5
9
7
57
0
52.0
⋯
77.2
3.98
1.69
4.25
4.30
0.53
40000
40000
37524.52
37524.52
41
안영명
한화
1
8
0
0
1
25
16
87.2
⋯
66.9
5.75
0.52
5.19
5.34
0.81
35000
20000
15797.18
15797.18
68
채병용
SK
6
4
0
6
3
43
0
50.0
⋯
58.1
6.84
-0.17
5.05
4.94
0.22
30000
25000
27789.01
27789.01
17
류제국
LG
8
6
0
0
0
25
25
131.1
⋯
63.5
5.35
1.53
4.79
4.84
2.15
29000
35000
44409.07
44409.07
14
박세웅
롯데
12
6
0
0
0
28
28
171.1
⋯
78.3
3.68
5.92
5.07
5.14
2.54
25000
10000
28683.81
28683.81
40
임창민
NC
4
3
29
0
6
60
0
66.0
⋯
81.4
3.68
2.68
4.83
4.60
0.89
25000
22500
25799.86
25799.86
56
박정진
한화
3
2
1
7
0
55
0
48.0
⋯
65.4
3.94
0.87
4.92
4.66
0.38
25000
33000
30415.69
30415.69
136
송창식
한화
5
6
0
15
6
63
0
73.1
⋯
61.0
6.63
-0.74
5.68
5.59
-0.22
24000
22000
17310.95
17310.95
27
김진성
NC
10
6
0
15
2
69
0
89.2
⋯
76.1
3.61
3.20
4.30
3.92
1.47
23000
18000
33080.43
33080.43
38
심창민
삼성
4
7
6
16
2
66
0
75.1
⋯
78.5
4.18
1.82
4.82
4.26
0.93
23000
21000
23904.74
23904.74
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋱
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
99
박치국
두산
1
1
0
0
0
21
3
32.0
⋯
60.6
6.75
-0.12
5.39
5.58
0.00
3400
2700
1349.4896
1349.4896
102
허건엽
SK
0
0
0
0
0
8
0
8.1
⋯
51.3
8.64
-0.20
5.29
5.64
-0.01
3400
3300
3970.3092
3970.3092
105
고봉재
두산
0
0
0
0
0
1
0
1.0
⋯
100.0
0.00
0.04
6.73
7.64
-0.01
3300
4400
5980.8705
5980.8705
112
김시현
삼성
0
0
0
0
0
17
0
21.1
⋯
69.3
7.59
0.08
7.06
7.18
-0.05
3300
2700
1784.8028
1784.8028
120
강장산
KT
0
0
0
0
0
17
0
26.1
⋯
69.4
5.47
0.54
6.16
6.46
-0.10
3300
3000
1648.4704
1648.4704
118
배민관
LG
0
0
0
0
0
4
0
3.2
⋯
69.8
9.82
-0.11
9.46
9.77
-0.08
3200
3000
3734.0896
3734.0896
140
이종혁
KT
2
0
0
0
0
16
0
19.0
⋯
67.0
6.63
-0.09
7.10
7.57
-0.25
3200
2700
2969.8441
2969.8441
70
최지광
삼성
0
2
0
0
0
11
6
25.0
⋯
64.3
6.48
-0.16
5.69
5.65
0.21
3100
2700
1628.4784
1628.4784
73
박세진
KT
0
2
0
0
0
4
3
11.1
⋯
50.0
9.53
-0.13
4.44
4.67
0.18
3100
3000
4256.9427
4256.9427
101
박상원
한화
0
0
0
1
0
18
0
21.2
⋯
73.4
4.15
0.32
5.30
5.42
0.00
3100
2700
2038.7415
2038.7415
81
김대유
SK
0
0
0
0
0
6
0
4.2
⋯
58.1
9.64
0.08
5.88
5.91
0.09
3000
2700
4539.1000
4539.1000
84
손주영
LG
0
0
0
0
0
5
0
6.0
⋯
62.5
4.50
0.06
3.07
3.08
0.08
3000
2700
4606.7626
4606.7626
87
김진영
한화
0
0
0
0
0
3
0
2.2
⋯
57.1
10.12
-0.02
2.98
2.29
0.07
3000
2700
4822.4882
4822.4882
98
임진우
두산
0
0
0
0
0
1
0
1.0
⋯
40.0
27.00
-0.11
4.73
4.61
0.00
3000
3200
4998.5939
4998.5939
109
서균
한화
0
0
0
0
0
14
0
14.1
⋯
79.2
4.40
0.25
5.41
5.58
-0.04
3000
2700
2309.3920
2309.3920
121
홍성무
KT
0
1
0
0
0
7
0
6.1
⋯
51.3
12.79
-0.29
7.37
7.58
-0.11
3000
3000
3048.6997
3048.6997
127
배제성
KT
0
0
0
0
0
21
1
32.0
⋯
59.6
8.72
-0.10
6.39
6.59
-0.17
3000
2700
-294.7714
-294.7714
137
배재환
NC
0
1
0
0
0
3
1
8.0
⋯
82.1
9.00
-0.16
11.48
11.73
-0.23
3000
4000
3039.1933
3039.1933
142
정영일
SK
0
0
0
0
0
9
0
8.0
⋯
47.3
10.12
-0.61
10.73
11.37
-0.30
3000
4000
2110.5317
2110.5317
96
김도영
KT
0
0
0
0
0
1
0
1.0
⋯
100.0
0.00
0.04
3.73
4.43
0.00
2900
2700
4555.8119
4555.8119
97
조근종
KT
0
0
0
0
0
1
0
2.0
⋯
35.7
22.50
-0.16
6.73
4.86
0.00
2900
2700
4555.8119
4555.8119
108
안규현
삼성
0
0
0
0
0
1
0
2.0
⋯
66.7
9.00
-0.04
8.73
9.67
-0.04
2900
2800
4322.7670
4322.7670
115
김종훈
KIA
0
0
0
0
0
1
0
1.0
⋯
43.5
36.00
-0.16
18.73
17.60
-0.06
2900
2800
4161.9663
4161.9663
132
최성영
NC
0
0
0
0
0
7
0
11.1
⋯
68.6
9.53
-0.26
9.29
9.48
-0.20
2900
2900
2236.5404
2236.5404
83
장지훈
삼성
0
0
0
0
0
4
0
2.1
⋯
100.0
0.00
0.09
0.73
-0.69
0.08
2800
2700
4754.8256
4754.8256
90
차재용
롯데
0
0
0
0
0
3
0
2.1
⋯
100.0
0.00
0.08
2.02
1.83
0.05
2800
2800
4750.2440
4750.2440
106
이수민
삼성
0
0
0
0
0
1
0
2.2
⋯
50.0
10.12
-0.15
5.23
5.95
-0.01
2800
3000
4741.0808
4741.0808
94
장민익
두산
0
0
0
0
0
3
0
2.0
⋯
100.0
0.00
0.08
4.73
4.61
0.02
2700
2700
4420.4866
4420.4866
104
성영훈
두산
0
0
0
0
0
1
0
1.0
⋯
100.0
0.00
0.04
6.73
7.64
-0.01
2700
2700
4475.4115
4475.4115
116
정동윤
SK
0
0
0
0
0
2
0
3.0
⋯
71.4
9.00
-0.03
9.40
10.05
-0.07
2700
2700
3844.9466
3844.9466
filtered_df <- picher[picher$ "연봉.2018." != picher$ "연봉.2017." , ]
filtered_df
A data.frame: 128 × 24
<chr>
<chr>
<int>
<int>
<int>
<int>
<int>
<int>
<int>
<dbl>
⋯
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<int>
<int>
<dbl>
<dbl>
1
켈리
SK
16
7
0
0
0
30
30
190.0
⋯
73.7
3.60
6.91
3.69
3.44
6.62
140000
85000
131853.252
131853.252
2
소사
LG
11
11
1
0
0
30
29
185.1
⋯
67.1
3.88
6.80
3.52
3.41
6.08
120000
50000
90426.947
90426.947
3
양현종
KIA
20
6
0
0
0
31
31
193.1
⋯
72.1
3.44
6.54
3.94
3.82
5.64
230000
150000
186155.074
186155.074
5
레일리
롯데
13
7
0
0
0
30
30
187.1
⋯
74.1
3.80
6.13
4.36
4.31
4.38
111000
85000
109876.101
109876.101
6
피어밴드
KT
8
10
0
0
0
26
26
160.0
⋯
76.1
3.04
6.52
4.42
4.32
3.94
85000
35000
58130.146
58130.146
7
고영표
KT
8
12
0
1
0
25
24
141.2
⋯
64.6
5.08
2.97
3.88
3.78
3.87
11500
5200
32370.634
32370.634
9
함덕주
두산
9
8
0
2
0
35
24
137.1
⋯
73.1
3.67
4.99
3.91
3.67
3.78
16000
7000
33082.909
33082.909
13
임기영
KIA
8
6
0
0
0
23
19
118.1
⋯
72.3
3.65
4.25
4.07
4.19
2.79
13000
3100
24736.432
24736.432
14
박세웅
롯데
12
6
0
0
0
28
28
171.1
⋯
78.3
3.68
5.92
5.07
5.14
2.54
25000
10000
28683.805
28683.805
15
백정현
삼성
8
4
0
3
0
35
14
100.2
⋯
73.2
4.38
3.01
4.51
4.34
2.25
15500
10000
27341.043
27341.043
17
류제국
LG
8
6
0
0
0
25
25
131.1
⋯
63.5
5.35
1.53
4.79
4.84
2.15
29000
35000
44409.069
44409.069
19
임찬규
LG
6
10
0
0
0
27
26
124.1
⋯
71.3
4.63
3.15
4.81
4.79
2.04
11500
6500
14822.460
14822.460
22
윤희상
SK
6
7
0
0
0
23
22
120.0
⋯
63.1
6.00
1.52
5.13
5.22
1.80
13000
15000
23102.472
23102.472
23
원종현
NC
3
6
0
22
0
68
0
80.0
⋯
63.9
4.39
2.02
3.60
3.52
1.71
18500
14000
22358.400
22358.400
24
배영수
한화
7
8
0
0
0
25
25
128.0
⋯
66.4
5.06
2.47
5.10
5.21
1.68
50000
55000
57019.043
57019.043
25
박종훈
SK
12
7
0
1
0
29
28
151.1
⋯
75.3
4.10
4.31
5.38
5.55
1.62
20000
10000
21138.912
21138.912
26
이상화
KT
4
3
6
4
1
70
0
66.0
⋯
68.0
3.95
2.43
3.57
3.45
1.54
10000
4500
13605.103
13605.103
27
김진성
NC
10
6
0
15
2
69
0
89.2
⋯
76.1
3.61
3.20
4.30
3.92
1.47
23000
18000
33080.433
33080.433
28
이민호
NC
5
1
3
6
1
60
3
88.2
⋯
71.4
4.06
1.97
4.41
4.22
1.38
18800
16000
23738.251
23738.251
29
이재학
NC
5
7
0
0
0
28
23
119.0
⋯
65.4
5.67
1.04
5.53
5.35
1.31
19000
20000
21005.351
21005.351
30
김강률
두산
7
2
7
12
1
70
0
89.0
⋯
73.5
3.44
2.68
4.15
4.09
1.27
15000
6200
16907.230
16907.230
31
김재윤
KT
3
5
15
0
4
41
0
37.1
⋯
54.3
5.79
0.73
3.15
3.05
1.24
11000
9000
18149.465
18149.465
32
김원중
롯데
7
8
0
0
0
24
24
107.1
⋯
70.3
5.70
2.22
5.53
5.56
1.23
6300
3000
8022.277
8022.277
33
박진형
롯데
4
4
2
10
3
45
9
88.0
⋯
68.9
5.11
1.70
4.43
4.14
1.12
10500
6000
10555.543
10555.543
34
윤규진
한화
8
7
0
2
0
36
18
119.0
⋯
72.1
5.22
2.23
5.46
5.42
1.09
21000
18000
22860.982
22860.982
35
김재영
한화
5
7
0
0
1
20
15
85.1
⋯
67.8
4.54
1.60
5.00
5.23
1.06
5300
3000
9305.405
9305.405
36
신정락
LG
3
5
10
12
3
63
0
59.0
⋯
59.3
5.34
0.51
4.23
4.12
0.97
10500
7500
11392.925
11392.925
38
심창민
삼성
4
7
6
16
2
66
0
75.1
⋯
78.5
4.18
1.82
4.82
4.26
0.93
23000
21000
23904.742
23904.742
39
정찬헌
LG
8
7
7
3
3
61
0
61.2
⋯
62.9
5.84
0.97
3.96
3.80
0.93
9500
4500
15323.219
15323.219
40
임창민
NC
4
3
29
0
6
60
0
66.0
⋯
81.4
3.68
2.68
4.83
4.60
0.89
25000
22500
25799.865
25799.865
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋱
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
118
배민관
LG
0
0
0
0
0
4
0
3.2
⋯
69.8
9.82
-0.11
9.46
9.77
-0.08
3200
3000
3734.0896
3734.0896
119
윤근영
KT
0
0
0
1
0
12
0
18.0
⋯
84.8
3.50
0.30
6.23
6.31
-0.08
4200
4000
3435.1499
3435.1499
120
강장산
KT
0
0
0
0
0
17
0
26.1
⋯
69.4
5.47
0.54
6.16
6.46
-0.10
3300
3000
1648.4704
1648.4704
124
김민우
한화
0
0
0
0
0
4
2
7.1
⋯
50.7
17.18
-0.52
7.83
8.32
-0.12
3600
3800
3075.9026
3075.9026
125
이현호
두산
1
0
0
0
0
24
2
30.0
⋯
65.3
5.70
-0.51
6.47
6.57
-0.16
5200
6000
3063.7755
3063.7755
127
배제성
KT
0
0
0
0
0
21
1
32.0
⋯
59.6
8.72
-0.10
6.39
6.59
-0.17
3000
2700
-294.7714
-294.7714
128
홍상삼
두산
1
1
0
0
0
11
2
17.0
⋯
53.0
7.94
-0.52
6.73
7.22
-0.18
9000
12500
10583.9599
10583.9599
129
이태양
한화
3
6
0
0
0
16
12
59.0
⋯
64.3
7.17
0.05
6.77
7.04
-0.18
7300
8300
3544.0763
3544.0763
130
김진우
KIA
2
6
0
0
1
14
8
36.1
⋯
65.3
7.93
-0.35
6.68
7.04
-0.19
6000
12000
7803.9704
7803.9704
131
이영하
두산
3
3
0
0
0
20
3
35.2
⋯
78.7
5.55
0.59
6.79
6.86
-0.19
4200
2700
2614.9313
2614.9313
133
김동호
삼성
0
1
0
0
0
20
1
36.0
⋯
67.0
6.75
-0.07
5.46
5.96
-0.20
6000
5000
1648.8880
1648.8880
134
김윤동
KIA
7
4
11
6
6
65
1
80.1
⋯
75.0
4.59
1.89
5.54
5.53
-0.20
15000
4700
3977.8323
3977.8323
135
정인욱
삼성
1
4
0
0
0
9
7
32.0
⋯
52.5
9.84
-0.43
7.39
7.36
-0.20
5700
7200
3413.2033
3413.2033
136
송창식
한화
5
6
0
15
6
63
0
73.1
⋯
61.0
6.63
-0.74
5.68
5.59
-0.22
24000
22000
17310.9499
17310.9499
137
배재환
NC
0
1
0
0
0
3
1
8.0
⋯
82.1
9.00
-0.16
11.48
11.73
-0.23
3000
4000
3039.1933
3039.1933
138
이정민
롯데
3
1
0
2
1
24
0
26.2
⋯
71.4
5.40
0.19
6.47
6.83
-0.24
10000
15000
14080.6714
14080.6714
139
최동환
LG
1
2
1
5
2
35
0
38.0
⋯
71.1
5.68
-0.05
6.76
6.77
-0.24
6500
6000
1836.9171
1836.9171
140
이종혁
KT
2
0
0
0
0
16
0
19.0
⋯
67.0
6.63
-0.09
7.10
7.57
-0.25
3200
2700
2969.8441
2969.8441
141
홍성용
KT
0
2
0
1
0
37
1
39.0
⋯
68.3
6.23
0.25
6.40
6.63
-0.28
6800
6300
-360.1517
-360.1517
142
정영일
SK
0
0
0
0
0
9
0
8.0
⋯
47.3
10.12
-0.61
10.73
11.37
-0.30
3000
4000
2110.5317
2110.5317
143
김지용
LG
4
3
3
8
4
53
0
53.0
⋯
73.9
5.09
0.28
6.30
6.27
-0.38
9000
10000
5555.9129
5555.9129
144
최금강
NC
5
3
0
0
0
39
13
89.2
⋯
57.4
7.33
-0.27
6.31
6.58
-0.41
12500
14000
5459.6023
5459.6023
145
김범수
한화
0
4
0
0
0
15
5
31.0
⋯
66.4
8.71
-0.42
8.15
8.52
-0.42
3600
3300
-2975.1372
-2975.1372
146
이승현
삼성
2
0
0
0
0
30
0
31.2
⋯
81.9
5.12
0.31
8.03
8.03
-0.44
7000
6200
2468.8307
2468.8307
147
주권
KT
5
6
1
3
2
39
12
81.2
⋯
63.2
6.61
-0.02
6.33
6.54
-0.46
7600
7500
-176.0475
-176.0475
148
장민재
한화
2
5
0
0
2
33
5
62.2
⋯
56.9
7.76
-1.21
6.21
6.48
-0.47
7100
8100
853.4203
853.4203
149
정용운
KIA
3
2
0
0
0
25
11
59.1
⋯
65.4
5.92
0.39
6.41
6.77
-0.49
7500
3100
-4363.3153
-4363.3153
150
노경은
롯데
0
2
0
0
0
9
2
14.2
⋯
52.8
11.66
-0.83
8.03
8.29
-0.61
10000
16000
9199.8537
9199.8537
151
김승현
삼성
0
3
0
1
0
41
0
43.2
⋯
73.9
5.77
-0.40
6.87
6.95
-0.70
4000
2900
-6817.6168
-6817.6168
152
류희운
KT
4
4
0
0
0
24
14
81.0
⋯
65.3
7.67
-0.68
7.60
7.81
-1.01
4000
3000
-8729.6887
-8729.6887
df <- filtered_df[,c ("선수명" ,"연봉.2018." ,"new_data" ,"연봉.2017." )]
A data.frame: 128 × 4
<chr>
<int>
<dbl>
<int>
1
켈리
140000
131853.252
85000
2
소사
120000
90426.947
50000
3
양현종
230000
186155.074
150000
5
레일리
111000
109876.101
85000
6
피어밴드
85000
58130.146
35000
7
고영표
11500
32370.634
5200
9
함덕주
16000
33082.909
7000
13
임기영
13000
24736.432
3100
14
박세웅
25000
28683.805
10000
15
백정현
15500
27341.043
10000
17
류제국
29000
44409.069
35000
19
임찬규
11500
14822.460
6500
22
윤희상
13000
23102.472
15000
23
원종현
18500
22358.400
14000
24
배영수
50000
57019.043
55000
25
박종훈
20000
21138.912
10000
26
이상화
10000
13605.103
4500
27
김진성
23000
33080.433
18000
28
이민호
18800
23738.251
16000
29
이재학
19000
21005.351
20000
30
김강률
15000
16907.230
6200
31
김재윤
11000
18149.465
9000
32
김원중
6300
8022.277
3000
33
박진형
10500
10555.543
6000
34
윤규진
21000
22860.982
18000
35
김재영
5300
9305.405
3000
36
신정락
10500
11392.925
7500
38
심창민
23000
23904.742
21000
39
정찬헌
9500
15323.219
4500
40
임창민
25000
25799.865
22500
⋮
⋮
⋮
⋮
⋮
118
배민관
3200
3734.0896
3000
119
윤근영
4200
3435.1499
4000
120
강장산
3300
1648.4704
3000
124
김민우
3600
3075.9026
3800
125
이현호
5200
3063.7755
6000
127
배제성
3000
-294.7714
2700
128
홍상삼
9000
10583.9599
12500
129
이태양
7300
3544.0763
8300
130
김진우
6000
7803.9704
12000
131
이영하
4200
2614.9313
2700
133
김동호
6000
1648.8880
5000
134
김윤동
15000
3977.8323
4700
135
정인욱
5700
3413.2033
7200
136
송창식
24000
17310.9499
22000
137
배재환
3000
3039.1933
4000
138
이정민
10000
14080.6714
15000
139
최동환
6500
1836.9171
6000
140
이종혁
3200
2969.8441
2700
141
홍성용
6800
-360.1517
6300
142
정영일
3000
2110.5317
4000
143
김지용
9000
5555.9129
10000
144
최금강
12500
5459.6023
14000
145
김범수
3600
-2975.1372
3300
146
이승현
7000
2468.8307
6200
147
주권
7600
-176.0475
7500
148
장민재
7100
853.4203
8100
149
정용운
7500
-4363.3153
3100
150
노경은
10000
9199.8537
16000
151
김승현
4000
-6817.6168
2900
152
류희운
4000
-8729.6887
3000
sorted_df <- df[order (df$ "연봉.2018." , decreasing = TRUE ), ]
A data.frame: 128 × 4
<chr>
<int>
<dbl>
<int>
3
양현종
230000
186155.074
150000
1
켈리
140000
131853.252
85000
2
소사
120000
90426.947
50000
5
레일리
111000
109876.101
85000
6
피어밴드
85000
58130.146
35000
24
배영수
50000
57019.043
55000
41
안영명
35000
15797.183
20000
68
채병용
30000
27789.008
25000
17
류제국
29000
44409.069
35000
14
박세웅
25000
28683.805
10000
40
임창민
25000
25799.865
22500
56
박정진
25000
30415.693
33000
136
송창식
24000
17310.950
22000
27
김진성
23000
33080.433
18000
38
심창민
23000
23904.742
21000
34
윤규진
21000
22860.982
18000
25
박종훈
20000
21138.912
10000
44
심수창
20000
27181.633
25000
50
장원삼
20000
68252.867
75000
29
이재학
19000
21005.351
20000
43
진해수
19000
11268.437
11000
28
이민호
18800
23738.251
16000
23
원종현
18500
22358.400
14000
111
박희수
18500
16045.659
21000
9
함덕주
16000
33082.909
7000
15
백정현
15500
27341.043
10000
30
김강률
15000
16907.230
6200
60
권오준
15000
8522.143
10500
63
김사율
15000
20575.830
20000
134
김윤동
15000
3977.832
4700
⋮
⋮
⋮
⋮
⋮
79
황수범
3800
1852.9124
2700
124
김민우
3600
3075.9026
3800
145
김범수
3600
-2975.1372
3300
92
이형범
3500
2185.0132
2700
93
임현준
3500
3907.2282
3200
110
안성무
3500
3405.0519
2800
99
박치국
3400
1349.4896
2700
102
허건엽
3400
3970.3092
3300
105
고봉재
3300
5980.8705
4400
112
김시현
3300
1784.8028
2700
120
강장산
3300
1648.4704
3000
118
배민관
3200
3734.0896
3000
140
이종혁
3200
2969.8441
2700
70
최지광
3100
1628.4784
2700
73
박세진
3100
4256.9427
3000
101
박상원
3100
2038.7415
2700
81
김대유
3000
4539.1000
2700
84
손주영
3000
4606.7626
2700
87
김진영
3000
4822.4882
2700
98
임진우
3000
4998.5939
3200
109
서균
3000
2309.3920
2700
127
배제성
3000
-294.7714
2700
137
배재환
3000
3039.1933
4000
142
정영일
3000
2110.5317
4000
96
김도영
2900
4555.8119
2700
97
조근종
2900
4555.8119
2700
108
안규현
2900
4322.7670
2800
115
김종훈
2900
4161.9663
2800
83
장지훈
2800
4754.8256
2700
106
이수민
2800
4741.0808
3000
이상치 제거
picher2 <- picher[- c (1 ,2 ,3 ,6 ,21 ,50 ,98 ),]
A data.frame: 145 × 24
<chr>
<chr>
<int>
<int>
<int>
<int>
<int>
<int>
<int>
<dbl>
⋯
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<dbl>
<int>
<int>
<dbl>
<dbl>
4
차우찬
LG
10
7
0
0
0
28
28
175.2
⋯
75.0
3.43
6.11
4.20
4.03
4.63
100000
100000
122543.256
122543.256
5
레일리
롯데
13
7
0
0
0
30
30
187.1
⋯
74.1
3.80
6.13
4.36
4.31
4.38
111000
85000
109876.101
109876.101
7
고영표
KT
8
12
0
1
0
25
24
141.2
⋯
64.6
5.08
2.97
3.88
3.78
3.87
11500
5200
32370.634
32370.634
8
장원준
두산
14
9
0
0
0
29
29
180.1
⋯
75.8
3.14
7.28
4.26
4.35
3.85
100000
100000
120891.419
120891.419
9
함덕주
두산
9
8
0
2
0
35
24
137.1
⋯
73.1
3.67
4.99
3.91
3.67
3.78
16000
7000
33082.909
33082.909
10
팻딘
KIA
9
7
0
0
0
30
29
176.0
⋯
76.5
4.14
5.66
4.65
4.61
3.64
70000
70000
85875.565
85875.565
11
윤성환
삼성
12
9
0
0
0
28
28
174.1
⋯
72.7
4.28
5.36
4.78
4.80
3.03
80000
80000
94612.907
94612.907
12
유희관
두산
11
6
0
1
0
30
29
188.2
⋯
69.7
4.53
4.79
4.78
4.97
2.89
50000
50000
64779.243
64779.243
13
임기영
KIA
8
6
0
0
0
23
19
118.1
⋯
72.3
3.65
4.25
4.07
4.19
2.79
13000
3100
24736.432
24736.432
14
박세웅
롯데
12
6
0
0
0
28
28
171.1
⋯
78.3
3.68
5.92
5.07
5.14
2.54
25000
10000
28683.805
28683.805
15
백정현
삼성
8
4
0
3
0
35
14
100.2
⋯
73.2
4.38
3.01
4.51
4.34
2.25
15500
10000
27341.043
27341.043
16
송승준
롯데
11
5
0
1
1
30
22
130.1
⋯
75.8
4.21
4.22
4.91
4.77
2.20
40000
40000
54033.609
54033.609
17
류제국
LG
8
6
0
0
0
25
25
131.1
⋯
63.5
5.35
1.53
4.79
4.84
2.15
29000
35000
44409.069
44409.069
18
우규민
삼성
7
10
0
0
0
27
25
133.0
⋯
60.6
5.21
1.48
4.95
4.98
2.14
70000
70000
73704.794
73704.794
19
임찬규
LG
6
10
0
0
0
27
26
124.1
⋯
71.3
4.63
3.15
4.81
4.79
2.04
11500
6500
14822.460
14822.460
20
손승락
롯데
1
3
37
0
5
61
0
62.0
⋯
89.9
2.18
3.91
3.69
3.37
1.82
70000
70000
71225.849
71225.849
22
윤희상
SK
6
7
0
0
0
23
22
120.0
⋯
63.1
6.00
1.52
5.13
5.22
1.80
13000
15000
23102.472
23102.472
23
원종현
NC
3
6
0
22
0
68
0
80.0
⋯
63.9
4.39
2.02
3.60
3.52
1.71
18500
14000
22358.400
22358.400
24
배영수
한화
7
8
0
0
0
25
25
128.0
⋯
66.4
5.06
2.47
5.10
5.21
1.68
50000
55000
57019.043
57019.043
25
박종훈
SK
12
7
0
1
0
29
28
151.1
⋯
75.3
4.10
4.31
5.38
5.55
1.62
20000
10000
21138.912
21138.912
26
이상화
KT
4
3
6
4
1
70
0
66.0
⋯
68.0
3.95
2.43
3.57
3.45
1.54
10000
4500
13605.103
13605.103
27
김진성
NC
10
6
0
15
2
69
0
89.2
⋯
76.1
3.61
3.20
4.30
3.92
1.47
23000
18000
33080.433
33080.433
28
이민호
NC
5
1
3
6
1
60
3
88.2
⋯
71.4
4.06
1.97
4.41
4.22
1.38
18800
16000
23738.251
23738.251
29
이재학
NC
5
7
0
0
0
28
23
119.0
⋯
65.4
5.67
1.04
5.53
5.35
1.31
19000
20000
21005.351
21005.351
30
김강률
두산
7
2
7
12
1
70
0
89.0
⋯
73.5
3.44
2.68
4.15
4.09
1.27
15000
6200
16907.230
16907.230
31
김재윤
KT
3
5
15
0
4
41
0
37.1
⋯
54.3
5.79
0.73
3.15
3.05
1.24
11000
9000
18149.465
18149.465
32
김원중
롯데
7
8
0
0
0
24
24
107.1
⋯
70.3
5.70
2.22
5.53
5.56
1.23
6300
3000
8022.277
8022.277
33
박진형
롯데
4
4
2
10
3
45
9
88.0
⋯
68.9
5.11
1.70
4.43
4.14
1.12
10500
6000
10555.543
10555.543
34
윤규진
한화
8
7
0
2
0
36
18
119.0
⋯
72.1
5.22
2.23
5.46
5.42
1.09
21000
18000
22860.982
22860.982
35
김재영
한화
5
7
0
0
1
20
15
85.1
⋯
67.8
4.54
1.60
5.00
5.23
1.06
5300
3000
9305.405
9305.405
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋱
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
⋮
123
정재원
한화
0
2
0
1
1
14
0
21.0
⋯
69.2
3.43
0.12
5.16
5.73
-0.12
4000
4000
2817.4227
2817.4227
124
김민우
한화
0
0
0
0
0
4
2
7.1
⋯
50.7
17.18
-0.52
7.83
8.32
-0.12
3600
3800
3075.9026
3075.9026
125
이현호
두산
1
0
0
0
0
24
2
30.0
⋯
65.3
5.70
-0.51
6.47
6.57
-0.16
5200
6000
3063.7755
3063.7755
126
권혁
한화
1
3
0
11
1
37
0
31.1
⋯
67.3
6.32
0.04
6.48
6.78
-0.16
45000
45000
36720.9933
36720.9933
127
배제성
KT
0
0
0
0
0
21
1
32.0
⋯
59.6
8.72
-0.10
6.39
6.59
-0.17
3000
2700
-294.7714
-294.7714
128
홍상삼
두산
1
1
0
0
0
11
2
17.0
⋯
53.0
7.94
-0.52
6.73
7.22
-0.18
9000
12500
10583.9599
10583.9599
129
이태양
한화
3
6
0
0
0
16
12
59.0
⋯
64.3
7.17
0.05
6.77
7.04
-0.18
7300
8300
3544.0763
3544.0763
130
김진우
KIA
2
6
0
0
1
14
8
36.1
⋯
65.3
7.93
-0.35
6.68
7.04
-0.19
6000
12000
7803.9704
7803.9704
131
이영하
두산
3
3
0
0
0
20
3
35.2
⋯
78.7
5.55
0.59
6.79
6.86
-0.19
4200
2700
2614.9313
2614.9313
132
최성영
NC
0
0
0
0
0
7
0
11.1
⋯
68.6
9.53
-0.26
9.29
9.48
-0.20
2900
2900
2236.5404
2236.5404
133
김동호
삼성
0
1
0
0
0
20
1
36.0
⋯
67.0
6.75
-0.07
5.46
5.96
-0.20
6000
5000
1648.8880
1648.8880
134
김윤동
KIA
7
4
11
6
6
65
1
80.1
⋯
75.0
4.59
1.89
5.54
5.53
-0.20
15000
4700
3977.8323
3977.8323
135
정인욱
삼성
1
4
0
0
0
9
7
32.0
⋯
52.5
9.84
-0.43
7.39
7.36
-0.20
5700
7200
3413.2033
3413.2033
136
송창식
한화
5
6
0
15
6
63
0
73.1
⋯
61.0
6.63
-0.74
5.68
5.59
-0.22
24000
22000
17310.9499
17310.9499
137
배재환
NC
0
1
0
0
0
3
1
8.0
⋯
82.1
9.00
-0.16
11.48
11.73
-0.23
3000
4000
3039.1933
3039.1933
138
이정민
롯데
3
1
0
2
1
24
0
26.2
⋯
71.4
5.40
0.19
6.47
6.83
-0.24
10000
15000
14080.6714
14080.6714
139
최동환
LG
1
2
1
5
2
35
0
38.0
⋯
71.1
5.68
-0.05
6.76
6.77
-0.24
6500
6000
1836.9171
1836.9171
140
이종혁
KT
2
0
0
0
0
16
0
19.0
⋯
67.0
6.63
-0.09
7.10
7.57
-0.25
3200
2700
2969.8441
2969.8441
141
홍성용
KT
0
2
0
1
0
37
1
39.0
⋯
68.3
6.23
0.25
6.40
6.63
-0.28
6800
6300
-360.1517
-360.1517
142
정영일
SK
0
0
0
0
0
9
0
8.0
⋯
47.3
10.12
-0.61
10.73
11.37
-0.30
3000
4000
2110.5317
2110.5317
143
김지용
LG
4
3
3
8
4
53
0
53.0
⋯
73.9
5.09
0.28
6.30
6.27
-0.38
9000
10000
5555.9129
5555.9129
144
최금강
NC
5
3
0
0
0
39
13
89.2
⋯
57.4
7.33
-0.27
6.31
6.58
-0.41
12500
14000
5459.6023
5459.6023
145
김범수
한화
0
4
0
0
0
15
5
31.0
⋯
66.4
8.71
-0.42
8.15
8.52
-0.42
3600
3300
-2975.1372
-2975.1372
146
이승현
삼성
2
0
0
0
0
30
0
31.2
⋯
81.9
5.12
0.31
8.03
8.03
-0.44
7000
6200
2468.8307
2468.8307
147
주권
KT
5
6
1
3
2
39
12
81.2
⋯
63.2
6.61
-0.02
6.33
6.54
-0.46
7600
7500
-176.0475
-176.0475
148
장민재
한화
2
5
0
0
2
33
5
62.2
⋯
56.9
7.76
-1.21
6.21
6.48
-0.47
7100
8100
853.4203
853.4203
149
정용운
KIA
3
2
0
0
0
25
11
59.1
⋯
65.4
5.92
0.39
6.41
6.77
-0.49
7500
3100
-4363.3153
-4363.3153
150
노경은
롯데
0
2
0
0
0
9
2
14.2
⋯
52.8
11.66
-0.83
8.03
8.29
-0.61
10000
16000
9199.8537
9199.8537
151
김승현
삼성
0
3
0
1
0
41
0
43.2
⋯
73.9
5.77
-0.40
6.87
6.95
-0.70
4000
2900
-6817.6168
-6817.6168
152
류희운
KT
4
4
0
0
0
24
14
81.0
⋯
65.3
7.67
-0.68
7.60
7.81
-1.01
4000
3000
-8729.6887
-8729.6887
aa <- lm (연봉.2018. ~ .- 팀명- 선수명, data= picher2)
Call:
lm(formula = 연봉.2018. ~ . - 팀명 - 선수명, data = picher2)
Residuals:
Min 1Q Median 3Q Max
-6719.3 -1077.4 65.0 796.1 21824.2
Coefficients: (2 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1180.5580 6767.2703 -0.174 0.862
승 -11.5761 219.0412 -0.053 0.958
패 81.9489 210.8035 0.389 0.698
세 -37.2721 103.6610 -0.360 0.720
홀드 80.5730 116.2315 0.693 0.489
블론 370.1732 278.5525 1.329 0.186
경기 -32.2197 53.8240 -0.599 0.551
선발 -174.3314 168.9477 -1.032 0.304
이닝 52.2789 42.2955 1.236 0.219
삼진.9 201.3240 879.7268 0.229 0.819
볼넷.9 340.5238 826.0454 0.412 0.681
홈런.9 1665.6462 5202.1586 0.320 0.749
BABIP 114.3823 5557.0488 0.021 0.984
LOB. 27.0594 55.5030 0.488 0.627
ERA 188.2818 297.7530 0.632 0.528
RA9.WAR 1323.6685 595.3892 2.223 0.028 *
FIP -6291.3374 16484.2058 -0.382 0.703
kFIP 4975.4255 13255.6236 0.375 0.708
WAR 477.9530 756.5703 0.632 0.529
연봉.2017. 0.8897 0.0197 45.155 <2e-16 ***
new_col NA NA NA NA
new_data NA NA NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3321 on 125 degrees of freedom
Multiple R-squared: 0.9762, Adjusted R-squared: 0.9726
F-statistic: 269.9 on 19 and 125 DF, p-value: < 2.2e-16
X <- data.frame ('승' = picher$ "승" ,
WAR = picher$ "WAR" ,
` 경기 ` = picher$ "경기" ,
` 선발 ` = picher$ "선발" ,
` 연봉.2017. ` = picher$ "연봉.2017." )
X <- picher2[c ('승' , 'WAR' , '경기' , '선발' , '연봉.2017.' )]
y <- picher2$ "연봉.2018."
set.seed (20 )
train_indices <- createDataPartition (y, p = 0.8 , list = FALSE ) # 80%를 훈련 세트로 사용
X_train <- X[train_indices, ] # X 훈련 세트
y_train <- y[train_indices] # y 훈련 세트
X_test <- X[- train_indices, ] # X 테스트 세트
y_test <- y[- train_indices] # y 테스트 세트
model2 <- lm (y_train ~ ., data = cbind (X_train, y_train))
summary (model2)
Call:
lm(formula = y_train ~ ., data = cbind(X_train, y_train))
Residuals:
Min 1Q Median 3Q Max
-7594.6 -1045.4 335.8 777.8 8330.4
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -291.41119 436.61649 -0.667 0.505870
승 439.94566 149.96710 2.934 0.004065 **
WAR 1528.78519 392.59368 3.894 0.000168 ***
경기 43.60185 15.92552 2.738 0.007195 **
선발 -81.74551 51.55867 -1.585 0.115676
연봉.2017. 0.88666 0.01366 64.889 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2315 on 112 degrees of freedom
Multiple R-squared: 0.9871, Adjusted R-squared: 0.9865
F-statistic: 1710 on 5 and 112 DF, p-value: < 2.2e-16
y_pred <- predict (model2, newdata = X_test)
## Shapiro-Wilk Test
## H0 : normal distribution vs. H1 : not H0
shapiro.test (resid (model2))
Shapiro-Wilk normality test
data: resid(model2)
W = 0.93815, p-value = 3.734e-05
귀무가설 기각
### 등분산성
## H0 : 등분산 vs. H1 : 이분산 (Heteroscedasticity)
bptest (model2)
studentized Breusch-Pagan test
data: model2
BP = 10.625, df = 5, p-value = 0.05934
잔차 이분산..
studentized Breusch-Pagan test
data: model2
BP = 10.625, df = 5, p-value = 0.05934
y_pred <- predict (model2, newdata = X)
picher2 <- cbind (picher2, new_data= y_pred)
sorted_picher2 <- picher[order (- picher2$ "연봉.2018." ), ]
filtered_df2 <- picher2[picher2$ "연봉.2018." != picher2$ "연봉.2017." , ]
df <- filtered_df2[,c ("선수명" ,"연봉.2018." ,"new_data" ,"연봉.2017." )]
sorted_df <- df[order (df$ "연봉.2018." , decreasing = TRUE ), ]
summary (model2)
A data.frame: 122 × 4
<chr>
<int>
<dbl>
<int>
5
레일리
111000
109876.101
85000
24
배영수
50000
57019.043
55000
41
안영명
35000
15797.183
20000
68
채병용
30000
27789.008
25000
17
류제국
29000
44409.069
35000
14
박세웅
25000
28683.805
10000
40
임창민
25000
25799.865
22500
56
박정진
25000
30415.693
33000
136
송창식
24000
17310.950
22000
27
김진성
23000
33080.433
18000
38
심창민
23000
23904.742
21000
34
윤규진
21000
22860.982
18000
25
박종훈
20000
21138.912
10000
44
심수창
20000
27181.633
25000
29
이재학
19000
21005.351
20000
43
진해수
19000
11268.437
11000
28
이민호
18800
23738.251
16000
23
원종현
18500
22358.400
14000
111
박희수
18500
16045.659
21000
9
함덕주
16000
33082.909
7000
15
백정현
15500
27341.043
10000
30
김강률
15000
16907.230
6200
60
권오준
15000
8522.143
10500
63
김사율
15000
20575.830
20000
134
김윤동
15000
3977.832
4700
71
임정우
14000
20886.198
22000
78
박정배
14000
8023.190
8000
13
임기영
13000
24736.432
3100
22
윤희상
13000
23102.472
15000
42
심동섭
13000
12386.757
11000
⋮
⋮
⋮
⋮
⋮
152
류희운
4000
-8729.6887
3000
79
황수범
3800
1852.9124
2700
124
김민우
3600
3075.9026
3800
145
김범수
3600
-2975.1372
3300
92
이형범
3500
2185.0132
2700
93
임현준
3500
3907.2282
3200
110
안성무
3500
3405.0519
2800
99
박치국
3400
1349.4896
2700
102
허건엽
3400
3970.3092
3300
105
고봉재
3300
5980.8705
4400
112
김시현
3300
1784.8028
2700
120
강장산
3300
1648.4704
3000
118
배민관
3200
3734.0896
3000
140
이종혁
3200
2969.8441
2700
70
최지광
3100
1628.4784
2700
73
박세진
3100
4256.9427
3000
101
박상원
3100
2038.7415
2700
81
김대유
3000
4539.1000
2700
84
손주영
3000
4606.7626
2700
87
김진영
3000
4822.4882
2700
109
서균
3000
2309.3920
2700
127
배제성
3000
-294.7714
2700
137
배재환
3000
3039.1933
4000
142
정영일
3000
2110.5317
4000
96
김도영
2900
4555.8119
2700
97
조근종
2900
4555.8119
2700
108
안규현
2900
4322.7670
2800
115
김종훈
2900
4161.9663
2800
83
장지훈
2800
4754.8256
2700
106
이수민
2800
4741.0808
3000
다 안되노
번외 (WAR을 y로)
model_ <- lm (WAR ~ ., dt)
summary (model_)
Call:
lm(formula = WAR ~ ., data = dt)
Residuals:
Min 1Q Median 3Q Max
-1.25837 -0.18387 -0.00443 0.17470 1.58012
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.632e-01 8.184e-01 1.055 0.2935
승 7.000e-04 2.446e-02 0.029 0.9772
패 3.924e-02 2.449e-02 1.602 0.1115
세 1.444e-03 1.218e-02 0.119 0.9058
홀드 1.016e-02 1.417e-02 0.717 0.4747
블론 -5.186e-02 3.392e-02 -1.529 0.1287
경기 -1.216e-02 6.489e-03 -1.874 0.0631 .
선발 -2.104e-02 2.081e-02 -1.011 0.3139
이닝 7.125e-03 5.161e-03 1.381 0.1698
삼진.9 2.483e-02 1.055e-01 0.235 0.8142
볼넷.9 1.254e-02 1.014e-01 0.124 0.9017
홈런.9 -3.549e-03 6.346e-01 -0.006 0.9955
BABIP 9.050e-01 6.636e-01 1.364 0.1749
LOB. -1.276e-02 5.728e-03 -2.227 0.0277 *
ERA -3.164e-02 2.542e-02 -1.245 0.2154
RA9.WAR 4.259e-01 5.563e-02 7.655 3.63e-12 ***
FIP -2.926e-02 2.011e+00 -0.015 0.9884
kFIP -2.821e-02 1.613e+00 -0.017 0.9861
연봉.2017. -9.617e-06 3.872e-06 -2.484 0.0143 *
연봉.2018. 1.726e-05 3.608e-06 4.783 4.55e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.413 on 132 degrees of freedom
Multiple R-squared: 0.9147, Adjusted R-squared: 0.9024
F-statistic: 74.5 on 19 and 132 DF, p-value: < 2.2e-16
승 7.90295946434333 패 5.20359428013343 세 3.03700161686219 홀드 3.62204119070082 블론 2.71884077341594 경기 13.9861481235637 선발 36.4583393437973 이닝 59.6518045536396 삼진.9 78.7060221382563 볼넷.9 50.827473231828 홈런.9 368.731473824191 BABIP 3.08657710689232 LOB. 3.90285091455687 ERA 9.92892504245132 RA9.WAR 9.31215347437081 FIP 12527.1829609619 kFIP 9046.79675535442 연봉.2017. 8.32457422927786 연봉.2018. 11.033305518369
WAR로 돌린 모델의 \(R^2\) 값이 연봉으로 돌린 것보다 높게 나왔다.