library(ggplot2)

1번

중학교 2학년 학생 중에서 15명을 임의로 추출하여 각 학생의 약력(kg), 신장(cm), 체중(kg)과 원반던지기에서 던진 거리(m)를 측정하여 다음의 데이터를 얻었다. 다음 물음에 답하여라. (R을 이용하여 풀이)(검정에서는 유의수준 α = 0.05 사용)

학생번호	x1(약력)	x2(신장)	x3(체중)	y(원반던지기 거리)
1	28	146	34	22
2	47	169	57	36
3	39	160	38	24
4	25	156	28	22
5	34	161	37	27
6	29	168	50	29
7	38	154	54	26
8	23	153	40	23
9	52	160	62	31
10	37	152	39	25
11	35	155	46	23
12	39	154	54	27
13	38	157	57	31
14	32	162	53	25
15	25	142	32	23

dt <- data.frame(x1 = c(28,47,39,25,34,29,38,23,52,37,35,39,38,32,25),
                 x2 = c(146,169,160,156,161,168,154,153,160,152,155,154,157,162,142),
                 x3 = c(34,57,38,28,37,50,54,40,62,39,46,54,57,53,32),
 y = c(22,36,24,22,27,29,26,23,31,25,23,27,31,25,23))

(1)

$x_3$를 설명변수로 하고 $y$를 반응변수로 하여 회귀직선을 적합시키고, 이상치가 있는가, 어떤 것이 영향을 크게 주는 측정값인가를 판정하시오.

model1 <- lm(y~x3,dt)
summary(model1)


Call:
lm(formula = y ~ x3, data = dt)

Residuals:
   Min     1Q Median     3Q    Max 
-3.453 -1.727 -0.042  1.068  6.396 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 13.20655    3.15279   4.189 0.001061 ** 
x3           0.28767    0.06774   4.247 0.000953 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.692 on 13 degrees of freedom
Multiple R-squared:  0.5811,    Adjusted R-squared:  0.5489 
F-statistic: 18.04 on 1 and 13 DF,  p-value: 0.0009527

plot(y~x3, dt,pch = 20,cex = 2,col = "darkorange")
abline(model1, col='steelblue', lwd=2)

leverage

Hmatrix

X = cbind(rep(1, nrow(dt)), dt$x3)
H = X %*% solve(t(X) %*% X) %*% t(X)
diag(H)

0.148940660082722
0.151852789735798
0.101333670971554
0.258335443572213
0.111336203258209
0.0800624630708194
0.113488646914831
0.0851270363805184
0.241115894319237
0.0925972820123238
0.0668945724656031
0.113488646914831
0.151852789735798
0.103232885962691
0.180341014602853

hatvalues 함수

which.max(hatvalues(model1))
hatvalues(model1)[which.max(hatvalues(model1))]

4: 4

4: 0.258335443572212

2*(1+1)/nrow(dt)

0.266666666666667

$h_{4} < 2\bar h$이므로 leverage포인트로 고려할 점은 없다.

이상치

잔차

residual <- model1$residuals
head(residual)

1: -0.987254157170599
2: 6.39638727103908
3: -0.137925213134126
4: 0.738752426774713
5: 3.14974255085676
6: 1.41006161897527

hist(residual)

내적표준화된 잔차

s_residual <- rstandard(model1)
head(s_residual)

1: -0.397517137814801
2: 2.57991821095028
3: -0.0540444666750733
4: 0.318641226294653
5: 1.2411181316058
6: 0.546091891647763

hist(s_residual)

s_residual>2

1: FALSE
2: TRUE
3: FALSE
4: FALSE
5: FALSE
6: FALSE
7: FALSE
8: FALSE
9: FALSE
10: FALSE
11: FALSE
12: FALSE
13: FALSE
14: FALSE
15: FALSE

외적표준화된 잔차

s_residual_i <- rstudent(model1)
head(s_residual_i)

1: -0.384264693530971
2: 3.5482500706297
3: -0.0519300781139025
4: 0.307343141739411
5: 1.27004329309873
6: 0.530791544243437

hist(s_residual_i)

which.max(s_residual_i)
s_residual_i[which.max(s_residual_i)]

2: 2

2: 3.5482500706297

qt(0.975, 15-1-2)

2.17881282966723

$|r_i^*| \geq t_{\alpha/2}(12)$ 이므로 2번째 관측값은 유의수준 0.05에서 이상점이다.

plot(y~x3, dt,pch = 20,cex = 2,col = "darkorange")
abline(model1, col='steelblue', lwd=2)

영향점

influence(model1)

$hat

1: 0.148940660082721
2: 0.151852789735798
3: 0.101333670971554
4: 0.258335443572212
5: 0.111336203258209
6: 0.0800624630708196
7: 0.113488646914831
8: 0.0851270363805183
9: 0.241115894319237
10: 0.0925972820123238
11: 0.0668945724656031
12: 0.113488646914831
13: 0.151852789735798
14: 0.103232885962691
15: 0.180341014602853

$coefficients

A matrix: 15 × 2 of type dbl
	(Intercept)	x3
1	-0.45742205	0.0083719544
2	-2.01160268	0.0553827321
3	-0.04287451	0.0007190015
4	0.56454256	-0.0109721950
5	1.09199851	-0.0188481935
6	-0.10046455	0.0044636533
7	0.55803792	-0.0168311648
8	-0.41549161	0.0064019062
9	0.02268944	-0.0005809430
10	0.15864320	-0.0025647941
11	-0.18216030	-0.0014000365
12	0.35441974	-0.0106897699
13	-0.43915046	0.0120905347
14	0.58437398	-0.0185257601
15	0.32415484	-0.0060864196

$sigma

1: 2.78495746793195
2: 1.95742401077584
3: 2.80172466134264
4: 2.79107580280945
5: 2.63079972749359
6: 2.76971399687302
7: 2.67308548669918
8: 2.75391719372408
9: 2.80200497070363
10: 2.79662725993432
11: 2.60673085648245
12: 2.75075026012413
13: 2.76764179330928
14: 2.59682146853761
15: 2.79575824802225

$wt.res

1: -0.987254157170599
2: 6.39638727103908
3: -0.137925213134126
4: 0.738752426774713
5: 3.14974255085676
6: 1.41006161897527
7: -2.74060943698827
8: -1.71326074111589
9: -0.0419515489153378
10: 0.574407022874991
11: -3.4392673250612
12: -1.74060943698827
13: 1.39638727103908
14: -3.45294167299738
15: 0.588081370811178

influence.measures(model1)

Influence measures of
     lm(formula = y ~ x3, data = dt) :

     dfb.1_   dfb.x3    dffit cov.r   cook.d    hat inf
1  -0.14025  0.11948 -0.16075 1.346 1.38e-02 0.1489    
2  -0.87752  1.12451  1.50138 0.330 5.96e-01 0.1519   *
3  -0.01307  0.01020 -0.01744 1.305 1.65e-04 0.1013    
4   0.17271 -0.15624  0.18139 1.558 1.77e-02 0.2583   *
5   0.35443 -0.28474  0.44954 1.026 9.65e-02 0.1113    
6  -0.03097  0.06405  0.15659 1.218 1.30e-02 0.0801    
7   0.17826 -0.25025 -0.38961 1.096 7.48e-02 0.1135    
8  -0.12883  0.09239 -0.19840 1.197 2.06e-02 0.0851    
9   0.00691 -0.00824 -0.00969 1.546 5.08e-05 0.2411   *
10  0.04844 -0.03645  0.06888 1.283 2.56e-03 0.0926    
11 -0.05967 -0.02135 -0.36571 0.942 6.27e-02 0.0669    
12  0.11002 -0.15445 -0.24046 1.230 3.02e-02 0.1135    
13 -0.13549  0.17362  0.23181 1.317 2.84e-02 0.1519    
14  0.19215 -0.28354 -0.47641 0.965 1.06e-01 0.1032    
15  0.09900 -0.08652  0.10898 1.419 6.40e-03 0.1803

DFFITS

dffits(model1)

1: -0.160752301355561
2: 1.50137779530986
3: -0.0174379982140392
4: 0.181389340447362
5: 0.449539631280775
6: 0.156588296343735
7: -0.389606512891271
8: -0.198401773199523
9: -0.00968760810666192
10: 0.0688785820818028
11: -0.36570800481418
12: -0.240459576775504
13: 0.231812126360193
14: -0.476405218577437
15: 0.108981268211392

which(abs(dffits(model1)) > 2*sqrt(2/(15-2)))

2: 2

Cook’s Distance

cooks.distance(model1)

1: 0.0138272288214354
2: 0.595845162090567
3: 0.000164675041376078
4: 0.0176827743279407
5: 0.0964928567484941
6: 0.0129769331507123
7: 0.0748275851885876
8: 0.0205956959313652
9: 5.08340304142156e-05
10: 0.00255988931413331
11: 0.0626967366405896
12: 0.0301835348837091
13: 0.0283972104088968
14: 0.105589522000774
15: 0.00640451964826487

qf(0.5,2,15-2)

0.731454594627164

which(cooks.distance(model1) >qf(0.5,2,15-2))

없다

COVRATIO

covratio(model1)

1: 1.34567954997946
2: 0.329530512740796
3: 1.30536080523447
4: 1.55778037598378
5: 1.02622098650489
6: 1.2178914729273
7: 1.0964638479128
8: 1.19693328273621
9: 1.54641970673037
10: 1.28341023214919
11: 0.94206589709893
12: 1.22955391198272
13: 1.31702952017448
14: 0.965419184972857
15: 1.41903295690633

which(abs(covratio(model1)-1) > 3*(1+1)/15)

2: 2
4: 4
9: 9
15: 15

summary(influence.measures(model1))

Potentially influential observations of
     lm(formula = y ~ x3, data = dt) :

  dfb.1_ dfb.x3  dffit   cov.r   cook.d hat  
2 -0.88   1.12_*  1.50_*  0.33_*  0.60   0.15
4  0.17  -0.16    0.18    1.56_*  0.02   0.26
9  0.01  -0.01   -0.01    1.55_*  0.00   0.24

영향점은 2번째 관측값이다.

## 2제거 전후
plot(y~x3, dt,pch = 20,
 cex = 2,col = "darkorange",
 main = "2번 제거")
abline(model1, col='steelblue', lwd=2)
abline(lm(y~x3, dt[-2,]), col='red', lwd=2)
text(dt[2,], pos=2, "2")
legend('topright', legend=c("full", "del(2)"),
 col=c('steelblue', 'red'), lty=1, lwd=2)
# high leverage and high influence, not outlier

## 4제거 전후
plot(y~x3, dt,pch = 20,
 cex = 2,col = "darkorange",
 main = "4번 제거")
abline(model1, col='steelblue', lwd=2)
abline(lm(y~x3, dt[-4,]), col='red', lwd=2)
text(dt[-4,], pos=2, "2")
legend('topright', legend=c("full", "del(4)"),
 col=c('steelblue', 'red'), lty=1, lwd=2)
# high leverage and high influence, not outlier

## 9제거 전후
plot(y~x3, dt,pch = 20,
 cex = 2,col = "darkorange",
 main = "9번 제거")
abline(model1, col='steelblue', lwd=2)
abline(lm(y~x3, dt[-9,]), col='red', lwd=2)
legend('topright', legend=c("full", "del(9)"),
 col=c('steelblue', 'red'), lty=1, lwd=2)
# high leverage and high influence, not outlier

## 4,9제거 전후
plot(y~x3, dt,pch = 20,
 cex = 2,col = "darkorange",
 main = "4,9번 제거")
abline(model1, col='steelblue', lwd=2)
abline(lm(y~x3, dt[-c(4,9),]), col='red', lwd=2)
legend('topright', legend=c("full", "del(4,9)"),
 col=c('steelblue', 'red'), lty=1, lwd=2)
# high leverage and high influence, not outlier

## 2,4,9제거 전후
plot(y~x3, dt,pch = 20,
 cex = 2,col = "darkorange",
 main = "2,4,9번 제거")
abline(model1, col='steelblue', lwd=2)
abline(lm(y~x3, dt[-c(2,4,9),]), col='red', lwd=2)
legend('topright', legend=c("full", "del(2,4,9)"),
 col=c('steelblue', 'red'), lty=1, lwd=2)
# high leverage and high influence, not outlier

정규성

shapiro.test(model1$residuals)


    Shapiro-Wilk normality test

data:  model1$residuals
W = 0.93332, p-value = 0.3057

회귀진단 그림


par(mfrow = c(2, 2))
plot(model1, pch=16)

(2)

$x_1, x_2, x_3$ 를 모두 사용하여 $y$에 대한 중회귀모형을 적합시키고, 이상치의 존재유무와 영향을 크게 주는 측정값이 어떤 것인가를 판정하시오.

model2 <- lm(y~x1+x2+x3,dt)
summary(model2)


Call:
lm(formula = y ~ x1 + x2 + x3, data = dt)

Residuals:
    Min      1Q  Median      3Q     Max 
-3.0723 -1.0869 -0.0208  0.9365  3.8745 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)  
(Intercept) -15.78431   14.50462  -1.088   0.2998  
x1            0.15684    0.11331   1.384   0.1937  
x2            0.19620    0.10215   1.921   0.0811 .
x3            0.12948    0.09083   1.425   0.1818  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.331 on 11 degrees of freedom
Multiple R-squared:  0.7342,    Adjusted R-squared:  0.6617 
F-statistic: 10.13 on 3 and 11 DF,  p-value: 0.001699

leverage

which.max(hatvalues(model2))
hatvalues(model2)[which.max(hatvalues(model2))]

6: 6

6: 0.467111162145579

2*(3+1)/nrow(dt)

0.533333333333333

leverage 고려할 포인트는 업사.

이상치

잔차

residual2 <- model2$residuals
head(residual2)
hist(residual2)

1: 0.345214860348173
2: 3.87452253360442
3: -2.64479086930799
4: -0.369359156300476
5: 1.07271444722616
6: 0.800293089055777

내적표준화된 잔차

s_residual2 <- rstandard(model2)
head(s_residual2)
s_residual>2
hist(s_residual2)

1: 0.168693789872619
2: 2.04040961191812
3: -1.36846222178994
4: -0.192038604753614
5: 0.527462985239622
6: 0.470252444990339

1: FALSE
2: TRUE
3: FALSE
4: FALSE
5: FALSE
6: FALSE
7: FALSE
8: FALSE
9: FALSE
10: FALSE
11: FALSE
12: FALSE
13: FALSE
14: FALSE
15: FALSE

외적표준화된 잔차

s_residual_i2 <- rstudent(model2)
head(s_residual_i2)
hist(s_residual_i2)

1: 0.161051677291382
2: 2.46770471155657
3: -1.43239034421095
4: -0.183409334396953
5: 0.509399466568036
6: 0.452944085994544

which.max(s_residual_i2)
s_residual_i[which.max(s_residual_i2)]

2: 2

2: 3.5482500706297

qt(0.975, 15-3-2)

2.22813885198627

$|r_i^*| \geq t_{\alpha/2}(10)$ 이므로 2번째 관측값은 유의수준 0.05에서 이상점이다.

영향점

influence(model2)

$hat

1: 0.229484482984878
2: 0.336559048244901
3: 0.312745387413974
4: 0.31935404183557
5: 0.238999578272863
6: 0.467111162145579
7: 0.176674868362933
8: 0.262199425033544
9: 0.414083032167575
10: 0.19055339953678
11: 0.0731668365931882
12: 0.174623444618773
13: 0.197413593761242
14: 0.241923572696132
15: 0.365108126332067

$coefficients

A matrix: 15 × 4 of type dbl
	(Intercept)	x1	x2	x3
1	0.95354693	0.0003764930	-5.561428e-03	-0.0014501180
2	-13.93952031	0.0672978312	8.377466e-02	-0.0248399264
3	4.69552573	-0.0704991936	-3.644734e-02	0.0705780525
4	0.50943934	0.0007684142	-5.873357e-03	0.0076533361
5	-2.27767961	0.0105499531	1.862708e-02	-0.0200833113
6	-4.18168476	-0.0332209276	3.130321e-02	0.0117532167
7	-2.20897093	0.0037053698	1.765648e-02	-0.0175478830
8	-0.01539851	0.0005690967	2.732094e-05	-0.0002319326
9	-0.51557236	-0.0263081084	9.369868e-03	-0.0028197702
10	0.12758711	0.0019405994	-7.974601e-04	-0.0013464759
11	-1.76717565	-0.0009954571	1.149237e-02	-0.0048224698
12	-0.86864768	-0.0001014964	7.037690e-03	-0.0060239702
13	2.27737859	-0.0219351895	-2.102859e-02	0.0439849174
14	3.66063758	0.0653541960	-2.631196e-02	-0.0454527846
15	13.67495355	-0.0124352729	-8.212831e-02	-0.0017947153

$sigma

1: 2.44193182869043
2: 1.92763082719566
3: 2.22726104086183
4: 2.44099449377856
5: 2.41397737377572
6: 2.42039452193581
7: 2.39715919509951
8: 2.44508463698753
9: 2.42314403118888
10: 2.44479643924596
11: 2.22712958167362
12: 2.43788055286261
13: 2.26047895469207
14: 2.20980300839363
15: 2.16577831202649

$wt.res

1: 0.345214860348173
2: 3.87452253360442
3: -2.64479086930799
4: -0.369359156300476
5: 1.07271444722616
6: 0.800293089055777
7: -1.38244275079492
8: -0.0208410556291935
9: -0.79131150566298
10: 0.109012275312978
11: -3.07226152705352
12: -0.539287518287805
13: 2.64051702167739
14: -2.88148775547281
15: 2.85950791128479

influence.measures(model2)

Influence measures of
     lm(formula = y ~ x1 + x2 + x3, data = dt) :

     dfb.1_    dfb.x1    dfb.x2   dfb.x3    dffit cov.r   cook.d    hat inf
1   0.06276  0.003172 -0.051979 -0.01524  0.08789 1.881 2.12e-03 0.2295    
2  -1.16230  0.718336  0.991882 -0.33074  1.75761 0.329 5.28e-01 0.3366   *
3   0.33885 -0.651274 -0.373479  0.81332 -0.96627 1.010 2.13e-01 0.3127    
4   0.03354  0.006477 -0.054915  0.08047 -0.12563 2.122 4.33e-03 0.3194   *
5  -0.15165  0.089922  0.176110 -0.21353  0.28547 1.737 2.18e-02 0.2390    
6  -0.27769 -0.282407  0.295171  0.12463  0.42407 2.533 4.85e-02 0.4671   *
7  -0.14811  0.031804  0.168104 -0.18788 -0.29442 1.518 2.29e-02 0.1767    
8  -0.00101  0.004789  0.000255 -0.00243 -0.00592 1.984 9.62e-06 0.2622    
9  -0.03420 -0.223389  0.088252 -0.02987 -0.35865 2.325 3.47e-02 0.4141   *
10  0.00839  0.016332 -0.007445 -0.01414  0.02405 1.807 1.59e-04 0.1906    
11 -0.12753 -0.009197  0.117770 -0.05558 -0.40259 0.748 3.70e-02 0.0732    
12 -0.05727 -0.000857  0.065885 -0.06342 -0.11200 1.732 3.43e-03 0.1746    
13  0.16193 -0.199660 -0.212315  0.49942  0.64667 0.973 9.83e-02 0.1974    
14  0.26625  0.608514 -0.271751 -0.52792 -0.84604 0.860 1.61e-01 0.2419    
15  1.01486 -0.118139 -0.865466 -0.02127  1.25657 0.874 3.41e-01 0.3651   *

DFFITS

dffits(model2)

1: 0.0878923790933185
2: 1.75761070078296
3: -0.966268917673672
4: -0.125631055448568
5: 0.285472714339999
6: 0.424068881254535
7: -0.294419313015607
8: -0.00591565303872599
9: -0.358654177245365
10: 0.0240465539273412
11: -0.402594900995498
12: -0.111997181258085
13: 0.646673748175155
14: -0.846036927571321
15: 1.25657461898781

which(abs(dffits(model2)) > 2*sqrt(2/(15-3-1)))

2: 2
3: 3
15: 15

Cook’s distance

cooks.distance(model2)

1: 0.00211889841023739
2: 0.527999760572049
3: 0.213048697653186
4: 0.00432581796350647
5: 0.0218442039453938
6: 0.0484602617041985
7: 0.0229122092214353
8: 9.62351672584315e-06
9: 0.0347416842517463
10: 0.000158976058928977
11: 0.0369801047117493
12: 0.00342909608500354
13: 0.0982906080447196
14: 0.160777911639787
15: 0.340678864434841

qf(0.5,3+1,15-3-1)

0.893156955577709

which(cooks.distance(model2) >qf(0.5,4,11))

없다

covratio

covratio(model2)

1: 1.88056935548212
2: 0.32929951279886
3: 1.0098427682077
4: 2.12234306958682
5: 1.73653332829024
6: 2.53311628226267
7: 1.51777046887204
8: 1.98433341199223
9: 2.32487827053021
10: 1.80699058897987
11: 0.748453434660152
12: 1.73240614043161
13: 0.973451691520289
14: 0.859642643137438
15: 0.873805339356644

which(abs(covratio(model2)-1) > 3*(3+1)/15)

1: 1
4: 4
6: 6
8: 8
9: 9
10: 10

정규성

shapiro.test(model2$residuals)


    Shapiro-Wilk normality test

data:  model2$residuals
W = 0.95818, p-value = 0.6608

회귀진단 그림

par(mfrow = c(2, 2))
plot(model2, pch=16)

2번

어떤 화학공장에서 $NH_3$를 $HNO_3$로 산화시키는 공정을 가지고 있다. 이 산화공정에 미치는 중요한 요인으로 생각되어지는 변수는

$x_1$ = 작업속도

$x_2$ = 냉각수의 온도

$x_3$ = 흡수액 속의 $HNO_3$의 농도

이다. $y$는 $NH_3$를 $HNO_3$로 바꿀 때 손실되는 $NH_3$의 %로 잡아주었다. 21일간의 공정기간 중에 얻은 자료는 앞의 것과 같다.

실험번호	$x_1$	$x_2$	$x_3$	$y$
1	80	27	89	42
2	80	27	88	37
3	75	25	90	37
4	62	24	87	28
5	62	22	87	18
6	62	23	87	18
7	62	24	93	19
8	62	24	93	20
9	58	23	87	15
10	58	18	90	14
11	58	18	89	14
12	58	17	88	13
13	58	18	82	11
14	58	19	93	12
15	50	18	89	8
16	50	18	86	7
17	50	19	72	8
18	50	19	79	8
19	50	20	80	9
20	56	20	82	15
21	70	20	91	15

dt2 <- data.frame(x1 = c(80,80,75,62,62,62,62,62,58,58,58,58,58,58,50,50,50,50,50,56,70),
                 x2 = c(27,27,25,24,22,23,24,24,23,18,18,17,18,19,18,18,19,19,20,20,20),
                 x3 = c(89,88,90,87,87,87,93,93,87,90,89,88,82,93,89,86,72,79,80,82,91),
 y = c(42,37,37,28,18,18,19,20,15,14,14,13,11,12,8,7,8,8,9,15,15))

(1)

중회귀모형

$y_i = β_0 + β_1x_{i1} + β_2x_{i2} + β_3x_{i3} + ϵ_i$

$i = 1, 2, · · · , 21$

$ϵ_i $∼$N(0, σ^2_ϵ)$

이 데이터간의 관계를 설명하는 데 충분하다고 가정하고 다음의 물음에 답하시오. 유의수준 α = 0.05를 사용하여라.

pairs(dt2, pch=16)
cor(dt2)

A matrix: 4 × 4 of type dbl
	x1	x2	x3	y
x1	1.0000000	0.7818523	0.4885669	0.9196635
x2	0.7818523	1.0000000	0.3078454	0.8755044
x3	0.4885669	0.3078454	1.0000000	0.3776617
y	0.9196635	0.8755044	0.3776617	1.0000000

y와 x1의 상관관계가 높아보인다. y와 x2도 높아보이낟.
x1과 x2도 높다!!

m <- lm(y~., dt2) ##FM
summary(m)


Call:
lm(formula = y ~ ., data = dt2)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.4829 -1.7449 -0.4688  2.3497  5.7224 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -42.5557    12.7798  -3.330 0.003965 ** 
x1            0.7107     0.1416   5.018 0.000106 ***
x2            1.2610     0.3768   3.347 0.003823 ** 
x3           -0.1092     0.1632  -0.669 0.512537    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.289 on 17 degrees of freedom
Multiple R-squared:  0.9111,    Adjusted R-squared:  0.8954 
F-statistic: 58.08 on 3 and 17 DF,  p-value: 3.83e-09

(a)

add1/drop1 함수를 이용하여, 부분F검정통계량 값을 통한 후진제거법을 시행하여 가장 적절한 회귀모형을 구하여라.

drop1(m,test="F")

A anova: 4 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	183.9537	53.57340	NA	NA
x1	1	272.464536	456.4183	70.65663	25.1796878	0.0001055439
x2	1	121.206721	305.1604	62.20262	11.2012647	0.0038231136
x3	1	4.841619	188.7953	52.11896	0.4474361	0.5125370733

qf(0.95,1,21-3-1)

4.45132177246813

$F_L < F_{\alpha B}$ 이므로 설명변수 x3제거

m1 <- update(m, ~ . -x3)
summary(m1)


Call:
lm(formula = y ~ x1 + x2, data = dt2)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.5290 -1.7505  0.1894  2.1156  5.6588 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -50.3588     5.1383  -9.801 1.22e-08 ***
x1            0.6712     0.1267   5.298 4.90e-05 ***
x2            1.2954     0.3675   3.525  0.00242 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.239 on 18 degrees of freedom
Multiple R-squared:  0.9088,    Adjusted R-squared:  0.8986 
F-statistic: 89.64 on 2 and 18 DF,  p-value: 4.382e-10

pvalue의 값도 유의하고 $R^2$도 약 90%의 설명력을 가진다. 각각의 회귀계수도 유의하다.
$y=-50.3588 + 0.6712 x_1 + 1.2954 x_2$

drop1(m1, test = "F")

A anova: 3 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	188.7953	52.11896	NA	NA
x1	1	294.3553	483.1506	69.85193	28.06423	0.0000489797
x2	1	130.3208	319.1161	61.14168	12.42496	0.0024191459

qf(0.95,1,21-2-1)

4.41387341917057

F-value값이 작은 x2를 제거하려고 봤더니 pr값이 0.002419로 유의하므로 제거하지 않는다.

(b)

add1/drop1 함수를 이용하여, 부분F검정통계량 값을 통한 전진선택법을 시행하여 가장 적절한 회귀모형을 구하여라.

m0 = lm(y ~ 1, data = dt2)
add1(m0, scope = y ~ x1 + x2 + x3, test = "F")

A anova: 4 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	2069.2381	98.39868	NA	NA
x1	1	1750.1220	319.1161	61.14168	104.201315	3.774296e-09
x2	1	1586.0875	483.1506	69.85193	62.373224	2.028017e-07
x3	1	295.1321	1774.1060	97.16712	3.160752	9.143800e-02

F값이 큰 x1을 추가하자. 심지어 유의하당

qf(0.95,1,21-1-1)

4.3807496923318

m1 <- update(m0, ~.+x1)
summary(m1)


Call:
lm(formula = y ~ x1, data = dt2)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.2896  -1.1272  -0.0459   1.1166   8.8728 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -44.13202    6.10586  -7.228 7.31e-07 ***
x1            1.02031    0.09995  10.208 3.77e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.098 on 19 degrees of freedom
Multiple R-squared:  0.8458,    Adjusted R-squared:  0.8377 
F-statistic: 104.2 on 1 and 19 DF,  p-value: 3.774e-09

add1(m1,
 scope = y ~ x1 + x2 + x3,
 test = "F")

A anova: 3 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	319.1161	61.14168	NA	NA
x2	1	130.32077	188.7953	52.11896	12.4249569	0.002419146
x3	1	13.95567	305.1604	62.20262	0.8231803	0.376238733

x2의 Fvalue가 크고 유의하므로 추가하자.

m2 <- update(m1, ~ . +x2)
summary(m2)


Call:
lm(formula = y ~ x1 + x2, data = dt2)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.5290 -1.7505  0.1894  2.1156  5.6588 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -50.3588     5.1383  -9.801 1.22e-08 ***
x1            0.6712     0.1267   5.298 4.90e-05 ***
x2            1.2954     0.3675   3.525  0.00242 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.239 on 18 degrees of freedom
Multiple R-squared:  0.9088,    Adjusted R-squared:  0.8986 
F-statistic: 89.64 on 2 and 18 DF,  p-value: 4.382e-10

add1(m2,
 scope = y ~ x1 + x2 + x3,
 test = "F")

A anova: 2 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	188.7953	52.11896	NA	NA
x3	1	4.841619	183.9537	53.57340	0.4474361	0.5125371

x3의 pr값도 유의수준 0.05에서 유의하지 않다. 모형에 포함될 수 없으므로 멈춘다. 최종모형은 x1과 x2를 포함한 모형이다.

(c)

add1/drop1 함수를 이용하여, 부분F검정통계량 값을 통한 단계적 전진선택법을 시행하여 가장 적절한 회귀모형을 구하여라.

m0 = lm(y ~ 1, data = dt2)

add1(m0,
 scope = y ~ x1 + x2 + x3,
 test = "F")

A anova: 4 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	2069.2381	98.39868	NA	NA
x1	1	1750.1220	319.1161	61.14168	104.201315	3.774296e-09
x2	1	1586.0875	483.1506	69.85193	62.373224	2.028017e-07
x3	1	295.1321	1774.1060	97.16712	3.160752	9.143800e-02

Fvalue가 크고 pr값이 유의하 x1을 추가하자.

m1 <- update(m0, ~ . +x1)
summary(m1)


Call:
lm(formula = y ~ x1, data = dt2)

Residuals:
     Min       1Q   Median       3Q      Max 
-12.2896  -1.1272  -0.0459   1.1166   8.8728 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -44.13202    6.10586  -7.228 7.31e-07 ***
x1            1.02031    0.09995  10.208 3.77e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4.098 on 19 degrees of freedom
Multiple R-squared:  0.8458,    Adjusted R-squared:  0.8377 
F-statistic: 104.2 on 1 and 19 DF,  p-value: 3.774e-09

add1(m1,
 scope = y ~ x1 + x2 + x3,
 test = "F")

A anova: 3 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	319.1161	61.14168	NA	NA
x2	1	130.32077	188.7953	52.11896	12.4249569	0.002419146
x3	1	13.95567	305.1604	62.20262	0.8231803	0.376238733

x2의 Fvalue가 더 크고 pr값이 유의하므로 x2선택

m2 <- update(m1, ~ . +x2)

x1이 있는 모형에 x2를 추가함

drop1(m2, test = "F")

A anova: 3 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	188.7953	52.11896	NA	NA
x1	1	294.3553	483.1506	69.85193	28.06423	0.0000489797
x2	1	130.3208	319.1161	61.14168	12.42496	0.0024191459

x1을 보자. 제거할까? 유의하므로 제거하지 말자.

add1(m2,
 scope = y ~ x1 + x2 + x3,
 test = "F")

A anova: 2 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	188.7953	52.11896	NA	NA
x3	1	4.841619	183.9537	53.57340	0.4474361	0.5125371

유의수준 0.05에서는 유의하지 않으므로 x3를 추가하지 않는다.

summary(m2)


Call:
lm(formula = y ~ x1 + x2, data = dt2)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.5290 -1.7505  0.1894  2.1156  5.6588 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -50.3588     5.1383  -9.801 1.22e-08 ***
x1            0.6712     0.1267   5.298 4.90e-05 ***
x2            1.2954     0.3675   3.525  0.00242 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.239 on 18 degrees of freedom
Multiple R-squared:  0.9088,    Adjusted R-squared:  0.8986 
F-statistic: 89.64 on 2 and 18 DF,  p-value: 4.382e-10

(2)

leaps 패키지의 regsubsets 함수를 이용하여, 다음의 물음에 답하여라.

library(leaps)

(a)

전역탐색법을 사용하여, 가능한 모든 회귀모형에 대하여 $MSE_p, R^2_p, R^2_{adj,p}, C_p$ 를 구하여라. (regsubsets 사용시, 옵션에서 nbest=1 대신에 적당한 숫자를 입력하면 모든 회귀모형에 대한 결과를 확인할 수 있다.)

fit<-regsubsets(y~., data=dt2, nbest=6,
method='exhaustive',
)
summary(fit)
with(summary(fit),
     round(cbind(which,rss,rsq,adjr2,cp,bic),3))

Subset selection object
Call: regsubsets.formula(y ~ ., data = dt2, nbest = 6, method = "exhaustive", 
    )
3 Variables  (and intercept)
   Forced in Forced out
x1     FALSE      FALSE
x2     FALSE      FALSE
x3     FALSE      FALSE
6 subsets of each size up to 3
Selection Algorithm: exhaustive
         x1  x2  x3 
1  ( 1 ) "*" " " " "
1  ( 2 ) " " "*" " "
1  ( 3 ) " " " " "*"
2  ( 1 ) "*" "*" " "
2  ( 2 ) "*" " " "*"
2  ( 3 ) " " "*" "*"
3  ( 1 ) "*" "*" "*"

A matrix: 7 × 9 of type dbl
	(Intercept)	x1	x2	x3	rss	rsq	adjr2	cp	bic
1	1	1	0	0	319.116	0.846	0.838	12.491	-33.168
1	1	0	1	0	483.151	0.767	0.754	27.650	-24.458
1	1	0	0	1	1774.106	0.143	0.098	146.953	2.857
2	1	1	1	0	188.795	0.909	0.899	2.447	-41.146
2	1	1	0	1	305.160	0.853	0.836	13.201	-31.062
2	1	0	1	1	456.418	0.779	0.755	27.180	-22.608
3	1	1	1	1	183.954	0.911	0.895	4.000	-38.647

(b)

$p = 1, p = 2, p = 3$에서 각각 가장 적절한 회귀모형을 구하여라. (기준 : $R^2_p$)

p=1인 경우에 $R_2$ 값이 가장 큰(0.846) 모형을 선택한다. 즉 $y=\beta_0+\beta_1x_1$

p=2인 경우에는 $y=\beta_0+\beta_1x_1+\beta_2x_2$

p=3인 경우에는 $y=\beta_0+\beta_1x_1+\beta_2x_2+\beta_3x_3$

fit<-regsubsets(y~., data=dt2, nbest=1,
method='exhaustive',
)
summary(fit)
with(summary(fit),
     round(cbind(which,rss,rsq,adjr2,cp,bic),3))

Subset selection object
Call: regsubsets.formula(y ~ ., data = dt2, nbest = 1, method = "exhaustive", 
    )
3 Variables  (and intercept)
   Forced in Forced out
x1     FALSE      FALSE
x2     FALSE      FALSE
x3     FALSE      FALSE
1 subsets of each size up to 3
Selection Algorithm: exhaustive
         x1  x2  x3 
1  ( 1 ) "*" " " " "
2  ( 1 ) "*" "*" " "
3  ( 1 ) "*" "*" "*"

A matrix: 3 × 9 of type dbl
	(Intercept)	x1	x2	x3	rss	rsq	adjr2	cp	bic
1	1	1	0	0	319.116	0.846	0.838	12.491	-33.168
2	1	1	1	0	188.795	0.909	0.899	2.447	-41.146
3	1	1	1	1	183.954	0.911	0.895	4.000	-38.647

(c)

위에서 구한 모형 중 가장 적절한 회귀모형을 선택하여라.

adjr이 제일 높은 모형-> y=x1+x2

rss는 작은것이 제일 좋은데 3개를 선택한 모형은 원래 제일 작음.. x1 1개만 선택한 모형의 rss는 319이고 x1과 x2를 선택한 모형의 rss는 188로 급격히 감소함. x3까ㅣ 추가한것은 183으로 별차이가 없으므로 x1+x2를 선택한 모형을 고르는 것이 좋아보인다. bic값도 제일 낮음

(d)

$C_p$의 경우 $C_p ≤ p + 1$ 이면 좋은 모형으로 판정하고, 이를 만족할 때 변수의 수가 가장 적은 모형을 선택하는 것이 좋다고 알려져 있다. $C_p$ 를 이용했을 때, $p = 1, p = 2, p = 3$에서 각각 가장 좋은 모형을 선택하여라.

p=1일때는 x1선택한 모형이 제일작지만, cp>2이므로 좋은 모형이 아니다.

p=2일때는 x1+x2

p=3일때는 x1+x2+x3

fit<-regsubsets(y~., data=dt2, nbest=6,
method='exhaustive',
)
summary(fit)
with(summary(fit),
     round(cbind(which,rss,rsq,adjr2,cp,bic),3))

Subset selection object
Call: regsubsets.formula(y ~ ., data = dt2, nbest = 6, method = "exhaustive", 
    )
3 Variables  (and intercept)
   Forced in Forced out
x1     FALSE      FALSE
x2     FALSE      FALSE
x3     FALSE      FALSE
6 subsets of each size up to 3
Selection Algorithm: exhaustive
         x1  x2  x3 
1  ( 1 ) "*" " " " "
1  ( 2 ) " " "*" " "
1  ( 3 ) " " " " "*"
2  ( 1 ) "*" "*" " "
2  ( 2 ) "*" " " "*"
2  ( 3 ) " " "*" "*"
3  ( 1 ) "*" "*" "*"

A matrix: 7 × 9 of type dbl
	(Intercept)	x1	x2	x3	rss	rsq	adjr2	cp	bic
1	1	1	0	0	319.116	0.846	0.838	12.491	-33.168
1	1	0	1	0	483.151	0.767	0.754	27.650	-24.458
1	1	0	0	1	1774.106	0.143	0.098	146.953	2.857
2	1	1	1	0	188.795	0.909	0.899	2.447	-41.146
2	1	1	0	1	305.160	0.853	0.836	13.201	-31.062
2	1	0	1	1	456.418	0.779	0.755	27.180	-22.608
3	1	1	1	1	183.954	0.911	0.895	4.000	-38.647

(e)

위에서 선택한 모형 중 $C_p$를 기준으로 가장 적절한 회귀모형을 선택하여라

p=2일때 x1+x2를 선택한 모형이 가장 적절하다. p=1인 경우에는 cp<p+1이므로 좋은 모형이라고 할 수 없다.

(3)

일차 선형항(lenear terms)만으로는 충분하지 않다고 생각하고 이차다항회귀모형

$y = β_0 + β_1x_1 + β_2x_2 + β_3x_3 + β_4x^2_1 + β_5x^2_2 + β_6x^2_3 + β_7x_1x_2 + β_8x_1x_3 + β_9x_2x_3 + ϵ$을 가정하였다. 다음 물음에 답하여라.

dt2$x_1 <- dt2$x1^2
dt2$x_2 <- dt2$x2^2
dt2$x_3 <- dt2$x3^2
dt2$x1x2 <- dt2$x1 * dt2$x2
dt2$x1x3 <- dt2$x1 * dt2$x1
dt2$x2x3 <- dt2$x2 * dt2$x3
head(dt2)

A data.frame: 6 × 10
	x1	x2	x3	y	x_1	x_2	x_3	x1x2	x1x3	x2x3
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	80	27	89	42	6400	729	7921	2160	6400	2403
2	80	27	88	37	6400	729	7744	2160	6400	2376
3	75	25	90	37	5625	625	8100	1875	5625	2250
4	62	24	87	28	3844	576	7569	1488	3844	2088
5	62	22	87	18	3844	484	7569	1364	3844	1914
6	62	23	87	18	3844	529	7569	1426	3844	2001

(a)

변수간 상관계수를 구하여라.

cor(dt2)

A matrix: 10 × 10 of type dbl
	x1	x2	x3	y	x_1	x_2	x_3	x1x2	x1x3	x2x3
x1	1.0000000	0.7818523	0.4885669	0.9196635	0.9969709	0.8002266	0.4861833	0.9452829	0.9969709	0.8221139
x2	0.7818523	1.0000000	0.3078454	0.8755044	0.7846889	0.9982761	0.3079752	0.9377909	0.7846889	0.9542580
x3	0.4885669	0.3078454	1.0000000	0.3776617	0.4524089	0.3130881	0.9991718	0.4004805	0.4524089	0.5773637
y	0.9196635	0.8755044	0.3776617	1.0000000	0.9251379	0.8933751	0.3735385	0.9588371	0.9251379	0.8672805
x_1	0.9969709	0.7846889	0.4524089	0.9251379	1.0000000	0.8053207	0.4499860	0.9500453	1.0000000	0.8131868
x_2	0.8002266	0.9982761	0.3130881	0.8933751	0.8053207	1.0000000	0.3129003	0.9498899	0.8053207	0.9544366
x_3	0.4861833	0.3079752	0.9991718	0.3735385	0.4499860	0.3129003	1.0000000	0.3990206	0.4499860	0.5777549
x1x2	0.9452829	0.9377909	0.4004805	0.9588371	0.9500453	0.9498899	0.3990206	1.0000000	0.9500453	0.9290098
x1x3	0.9969709	0.7846889	0.4524089	0.9251379	1.0000000	0.8053207	0.4499860	0.9500453	1.0000000	0.8131868
x2x3	0.8221139	0.9542580	0.5773637	0.8672805	0.8131868	0.9544366	0.5777549	0.9290098	0.8131868	1.0000000

(b)

설명변수 9개를 모두 사용한 완전모형(full model)에 대한 분산분석표를 작성하고, α = 0.05에서 선형회귀모형의 유의성을 검정하시오.

model3 <- lm(y~.,data=dt2)
summary(model3)


Call:
lm(formula = y ~ ., data = dt2)

Residuals:
   Min     1Q Median     3Q    Max 
-4.265 -1.249  0.005  1.177  5.382 

Coefficients: (1 not defined because of singularities)
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -95.454579 171.941017  -0.555    0.589
x1            1.918799   1.503723   1.276    0.226
x2           -4.101841  10.326442  -0.397    0.698
x3            1.727460   3.273516   0.528    0.607
x_1          -0.040497   0.026368  -1.536    0.151
x_2          -0.007079   0.265375  -0.027    0.979
x_3          -0.004630   0.023903  -0.194    0.850
x1x2          0.172153   0.128263   1.342    0.204
x1x3                NA         NA      NA       NA
x2x3         -0.054589   0.112622  -0.485    0.637

Residual standard error: 3.037 on 12 degrees of freedom
Multiple R-squared:  0.9465,    Adjusted R-squared:  0.9109 
F-statistic: 26.55 on 8 and 12 DF,  p-value: 1.706e-06

anova(model3)

A anova: 9 × 5
	Df	Sum Sq	Mean Sq	F value	Pr(>F)
	<int>	<dbl>	<dbl>	<dbl>	<dbl>
x1	1	1750.121989	1750.121989	189.7977744	1.023932e-08
x2	1	130.320772	130.320772	14.1330676	2.723908e-03
x3	1	4.841619	4.841619	0.5250654	4.825789e-01
x_1	1	8.498097	8.498097	0.9216043	3.559986e-01
x_2	1	39.533776	39.533776	4.2873712	6.062395e-02
x_3	1	5.206534	5.206534	0.5646399	4.668786e-01
x1x2	1	17.897053	17.897053	1.9409052	1.888401e-01
x2x3	1	2.166462	2.166462	0.2349492	6.366054e-01
Residuals	12	110.651792	9.220983	NA	NA

null_model <- lm(y~1, data=dt2)  #H0
model3<- lm(y~., data=dt2)  #H1

anova(null_model, model3)

A anova: 2 × 6
	Res.Df	RSS	Df	Sum of Sq	F	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
1	20	2069.2381	NA	NA	NA	NA
2	12	110.6518	8	1958.586	26.55067	1.706475e-06

F=26.55067

(c)

상관계수들을 보고 변수를 하나만 선택할 때에는 어떤 변수가 선택될 것인지를 말하시오.

x1x3와 y의 상관계수가 0.9588371로 가장 높으므로 선택

(d)

상관계수들을 보고 변수를 하나만 제거시키려고 할 때, 어떤 변수가 제거될 것인가를 말할 수 있는가?

x3^2와 y의 상관계수가 0.3735385로 가장 작으므로 제거

(e)

add1/drop1 함수를 이용하여, $MSE_p$ 기준으로 단계적 전진선택법을 시행하여 가장 적절한 회귀모형을 구하여라.

model_step = step(
 m0,
 scope =y ~ x1 + x2 + x3+ I(x1^2)+I(x2^2)+I(x3^2)+I(x1*x2)+I(x1*x3)+I(x2*x3),
 direction = "both")
summary(model_step)

Start:  AIC=98.4
y ~ 1

             Df Sum of Sq     RSS    AIC
+ I(x1 * x2)  1   1902.39  166.85 47.523
+ I(x1^2)     1   1771.02  298.22 59.719
+ x1          1   1750.12  319.12 61.142
+ I(x2^2)     1   1651.50  417.74 66.797
+ x2          1   1586.09  483.15 69.852
+ I(x1 * x3)  1   1563.93  505.31 70.794
+ I(x2 * x3)  1   1556.43  512.81 71.103
+ x3          1    295.13 1774.11 97.167
+ I(x3^2)     1    288.72 1780.52 97.243
<none>                    2069.24 98.399

Step:  AIC=47.52
y ~ I(x1 * x2)

             Df Sum of Sq     RSS    AIC
<none>                     166.85 47.523
+ x2          1      9.63  157.22 48.275
+ I(x2 * x3)  1      8.34  158.51 48.447
+ I(x2^2)     1      6.42  160.42 48.699
+ I(x1^2)     1      4.28  162.56 48.977
+ x1          1      3.43  163.41 49.087
+ I(x1 * x3)  1      0.61  166.23 49.446
+ I(x3^2)     1      0.20  166.64 49.498
+ x3          1      0.10  166.75 49.511
- I(x1 * x2)  1   1902.39 2069.24 98.399


Call:
lm(formula = y ~ I(x1 * x2), data = dt2)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1481 -2.3759 -0.6042  2.6123  5.6241 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -15.29308    2.32149  -6.588 2.64e-06 ***
I(x1 * x2)    0.02532    0.00172  14.719 7.68e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.963 on 19 degrees of freedom
Multiple R-squared:  0.9194,    Adjusted R-squared:  0.9151 
F-statistic: 216.6 on 1 and 19 DF,  p-value: 7.675e-12

AIC로 확인해보니까 $y$~$I(x1*x2)$가 최종모형

m0 = lm(y ~ 1, data = dt2)
add1(m0,
 scope = y ~ x1 + x2 + x3+ I(x1^2)+I(x2^2)+I(x3^2)+I(x1*x2)+I(x1*x3)+I(x2*x3),
 test = "F")

A anova: 10 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	2069.2381	98.39868	NA	NA
x1	1	1750.1220	319.1161	61.14168	104.201315	3.774296e-09
x2	1	1586.0875	483.1506	69.85193	62.373224	2.028017e-07
x3	1	295.1321	1774.1060	97.16712	3.160752	9.143800e-02
I(x1^2)	1	1771.0198	298.2182	59.71937	112.834735	1.972872e-09
I(x2^2)	1	1651.4985	417.7396	66.79705	75.114902	5.001792e-08
I(x3^2)	1	288.7229	1780.5152	97.24285	3.080982	9.532366e-02
I(x1 * x2)	1	1902.3924	166.8457	47.52349	216.639964	7.675449e-12
I(x1 * x3)	1	1563.9280	505.3101	70.79365	58.804744	3.124530e-07
I(x2 * x3)	1	1556.4302	512.8079	71.10295	57.667163	3.601417e-07

qf(0.95,1,21-9-1)

4.84433567494361

RSS의 값이 가장 작은 x1*x2 변수를 선택

m1 <- update(m0, ~ . +I(x1*x2))
summary(m1)


Call:
lm(formula = y ~ I(x1 * x2), data = dt2)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1481 -2.3759 -0.6042  2.6123  5.6241 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -15.29308    2.32149  -6.588 2.64e-06 ***
I(x1 * x2)    0.02532    0.00172  14.719 7.68e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.963 on 19 degrees of freedom
Multiple R-squared:  0.9194,    Adjusted R-squared:  0.9151 
F-statistic: 216.6 on 1 and 19 DF,  p-value: 7.675e-12

add1(m1, 
     scope = y ~ x1 + x2 + x3+ I(x1^2)+I(x2^2)+I(x3^2)+I(x1*x2)+I(x1*x3)+I(x2*x3),
 test = "F")

A anova: 9 × 6
	Df	Sum of Sq	RSS	AIC	F value	Pr(>F)
	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>	<dbl>
<none>	NA	NA	166.8457	47.52349	NA	NA
x1	1	3.43425436	163.4115	49.08673	0.37828785	0.5462188
x2	1	9.62876230	157.2170	48.27519	1.10241098	0.3076309
x3	1	0.09886847	166.7469	49.51104	0.01067266	0.9188604
I(x1^2)	1	4.28275838	162.5630	48.97740	0.47421406	0.4998351
I(x2^2)	1	6.42241880	160.4233	48.69917	0.72061555	0.4070905
I(x3^2)	1	0.20188867	166.6438	49.49807	0.02180696	0.8842434
I(x1 * x3)	1	0.61479960	166.2309	49.44597	0.06657240	0.7993213
I(x2 * x3)	1	8.33654778	158.5092	48.44709	0.94668239	0.3434600

qf(0.95,1,21-8-1)

4.74722534672251

X2의 RSS값이 157.2170으로 가장 낮지만 pr값이 유의하지 않으므로 선택하지 않는다.

(f)

regsubsets 함수를 이용하여 전역탐색법을 시행하여라. $C_p, MSE_p, R^2_{adj,p}$ 기준으로 가장 적절한 모형을 선택한 후 위의 결과와 비교하시오

fit2 <- regsubsets(y~., data=dt2, nbest=20, method='exhaustive',)
summary(fit2)

Warning message in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in = force.in, :
“1  linear dependencies found”

Reordering variables and trying again:

Subset selection object
Call: regsubsets.formula(y ~ ., data = dt2, nbest = 20, method = "exhaustive", 
    )
9 Variables  (and intercept)
     Forced in Forced out
x1       FALSE      FALSE
x2       FALSE      FALSE
x3       FALSE      FALSE
x_1      FALSE      FALSE
x_2      FALSE      FALSE
x_3      FALSE      FALSE
x1x2     FALSE      FALSE
x2x3     FALSE      FALSE
x1x3     FALSE      FALSE
20 subsets of each size up to 8
Selection Algorithm: exhaustive
          x1  x2  x3  x_1 x_2 x_3 x1x2 x1x3 x2x3
1  ( 1 )  " " " " " " " " " " " " "*"  " "  " " 
1  ( 2 )  " " " " " " " " " " " " " "  "*"  " " 
1  ( 3 )  " " " " " " "*" " " " " " "  " "  " " 
1  ( 4 )  "*" " " " " " " " " " " " "  " "  " " 
1  ( 5 )  " " " " " " " " "*" " " " "  " "  " " 
1  ( 6 )  " " "*" " " " " " " " " " "  " "  " " 
1  ( 7 )  " " " " " " " " " " " " " "  " "  "*" 
1  ( 8 )  " " " " "*" " " " " " " " "  " "  " " 
1  ( 9 )  " " " " " " " " " " "*" " "  " "  " " 
2  ( 1 )  " " "*" " " " " " " " " "*"  " "  " " 
2  ( 2 )  " " " " " " " " " " " " "*"  " "  "*" 
2  ( 3 )  " " " " " " " " "*" " " "*"  " "  " " 
2  ( 4 )  " " " " " " " " " " " " "*"  "*"  " " 
2  ( 5 )  " " " " " " "*" " " " " "*"  " "  " " 
2  ( 6 )  "*" " " " " " " " " " " "*"  " "  " " 
2  ( 7 )  " " " " " " " " " " "*" "*"  " "  " " 
2  ( 8 )  " " " " "*" " " " " " " "*"  " "  " " 
2  ( 9 )  " " " " " " " " "*" " " " "  "*"  " " 
2  ( 10 ) " " " " " " "*" "*" " " " "  " "  " " 
2  ( 11 ) "*" " " " " " " "*" " " " "  " "  " " 
2  ( 12 ) " " "*" " " " " " " " " " "  "*"  " " 
2  ( 13 ) " " "*" " " "*" " " " " " "  " "  " " 
2  ( 14 ) "*" "*" " " " " " " " " " "  " "  " " 
2  ( 15 ) " " " " " " " " " " " " " "  "*"  "*" 
2  ( 16 ) " " " " " " "*" " " " " " "  " "  "*" 
2  ( 17 ) "*" " " " " " " " " " " " "  " "  "*" 
2  ( 18 ) " " "*" " " " " "*" " " " "  " "  " " 
2  ( 19 ) " " " " " " " " " " "*" " "  "*"  " " 
2  ( 20 ) " " " " " " "*" " " "*" " "  " "  " " 
3  ( 1 )  " " "*" " " "*" " " " " "*"  " "  " " 
3  ( 2 )  " " "*" " " " " " " " " "*"  "*"  " " 
3  ( 3 )  " " " " " " "*" "*" " " "*"  " "  " " 
3  ( 4 )  " " " " " " " " "*" " " "*"  "*"  " " 
3  ( 5 )  "*" "*" " " " " "*" " " " "  " "  " " 
3  ( 6 )  " " "*" " " " " "*" " " "*"  " "  " " 
3  ( 7 )  " " "*" " " "*" "*" " " " "  " "  " " 
3  ( 8 )  " " "*" " " " " "*" " " " "  "*"  " " 
3  ( 9 )  "*" "*" " " " " " " " " "*"  " "  " " 
3  ( 10 ) " " " " "*" " " " " " " "*"  " "  "*" 
3  ( 11 ) " " " " " " " " " " "*" "*"  " "  "*" 
3  ( 12 ) " " "*" " " " " " " " " "*"  " "  "*" 
3  ( 13 ) " " "*" " " " " " " "*" "*"  " "  " " 
3  ( 14 ) " " "*" "*" " " " " " " "*"  " "  " " 
3  ( 15 ) "*" " " " " " " "*" " " "*"  " "  " " 
3  ( 16 ) " " " " "*" " " " " "*" "*"  " "  " " 
3  ( 17 ) " " " " " " " " "*" " " "*"  " "  "*" 
3  ( 18 ) "*" " " " " " " " " " " "*"  " "  "*" 
3  ( 19 ) " " " " " " " " " " " " "*"  "*"  "*" 
3  ( 20 ) " " " " " " "*" " " " " "*"  " "  "*" 
4  ( 1 )  "*" "*" " " " " " " " " "*"  "*"  " " 
4  ( 2 )  "*" "*" " " "*" " " " " "*"  " "  " " 
4  ( 3 )  " " " " "*" "*" " " " " "*"  " "  "*" 
4  ( 4 )  " " " " "*" " " " " " " "*"  "*"  "*" 
4  ( 5 )  " " "*" " " "*" "*" " " "*"  " "  " " 
4  ( 6 )  " " "*" " " " " "*" " " "*"  "*"  " " 
4  ( 7 )  " " " " " " "*" " " "*" "*"  " "  "*" 
4  ( 8 )  " " " " " " " " " " "*" "*"  "*"  "*" 
4  ( 9 )  " " "*" "*" "*" " " " " "*"  " "  " " 
4  ( 10 ) " " "*" "*" " " " " " " "*"  "*"  " " 
4  ( 11 ) " " "*" " " "*" " " " " "*"  " "  "*" 
4  ( 12 ) " " "*" " " " " " " " " "*"  "*"  "*" 
4  ( 13 ) " " "*" " " " " " " "*" "*"  "*"  " " 
4  ( 14 ) " " "*" " " "*" " " "*" "*"  " "  " " 
4  ( 15 ) " " "*" " " "*" " " " " "*"  "*"  " " 
4  ( 16 ) "*" " " "*" " " "*" " " " "  " "  "*" 
4  ( 17 ) "*" " " " " " " "*" " " "*"  "*"  " " 
4  ( 18 ) "*" " " " " "*" "*" " " "*"  " "  " " 
4  ( 19 ) " " " " "*" " " "*" " " "*"  " "  "*" 
4  ( 20 ) " " " " " " "*" "*" " " "*"  " "  "*" 
5  ( 1 )  "*" " " "*" "*" " " " " "*"  " "  "*" 
5  ( 2 )  "*" " " "*" " " " " " " "*"  "*"  "*" 
5  ( 3 )  "*" " " " " " " " " "*" "*"  "*"  "*" 
5  ( 4 )  "*" " " " " "*" " " "*" "*"  " "  "*" 
5  ( 5 )  "*" "*" " " "*" " " " " "*"  " "  "*" 
5  ( 6 )  "*" "*" " " " " " " " " "*"  "*"  "*" 
5  ( 7 )  "*" "*" " " "*" " " "*" "*"  " "  " " 
5  ( 8 )  "*" "*" " " " " " " "*" "*"  "*"  " " 
5  ( 9 )  "*" "*" "*" " " " " " " "*"  "*"  " " 
5  ( 10 ) "*" "*" "*" "*" " " " " "*"  " "  " " 
5  ( 11 ) "*" "*" " " "*" "*" " " "*"  " "  " " 
5  ( 12 ) "*" "*" " " " " "*" " " "*"  "*"  " " 
5  ( 13 ) "*" "*" " " "*" " " " " "*"  "*"  " " 
5  ( 14 ) " " " " "*" " " " " "*" "*"  "*"  "*" 
5  ( 15 ) " " " " "*" "*" " " "*" "*"  " "  "*" 
5  ( 16 ) " " " " "*" " " "*" " " "*"  "*"  "*" 
5  ( 17 ) " " " " "*" "*" "*" " " "*"  " "  "*" 
5  ( 18 ) " " "*" "*" "*" " " " " "*"  " "  "*" 
5  ( 19 ) " " "*" "*" " " " " " " "*"  "*"  "*" 
5  ( 20 ) " " " " "*" "*" " " " " "*"  "*"  "*" 
6  ( 1 )  "*" "*" "*" " " " " " " "*"  "*"  "*" 
6  ( 2 )  "*" "*" "*" "*" " " " " "*"  " "  "*" 
6  ( 3 )  "*" " " "*" " " "*" " " "*"  "*"  "*" 
6  ( 4 )  "*" " " "*" "*" "*" " " "*"  " "  "*" 
6  ( 5 )  "*" " " "*" "*" " " "*" "*"  " "  "*" 
6  ( 6 )  "*" " " "*" " " " " "*" "*"  "*"  "*" 
6  ( 7 )  "*" " " "*" "*" " " " " "*"  "*"  "*" 
6  ( 8 )  "*" "*" "*" " " " " "*" "*"  "*"  " " 
6  ( 9 )  "*" "*" "*" "*" " " "*" "*"  " "  " " 
6  ( 10 ) "*" "*" " " "*" " " "*" "*"  " "  "*" 
6  ( 11 ) "*" "*" " " " " " " "*" "*"  "*"  "*" 
6  ( 12 ) "*" " " " " " " "*" "*" "*"  "*"  "*" 
6  ( 13 ) "*" " " " " "*" "*" "*" "*"  " "  "*" 
6  ( 14 ) "*" " " " " "*" " " "*" "*"  "*"  "*" 
6  ( 15 ) "*" "*" " " "*" "*" " " "*"  " "  "*" 
6  ( 16 ) "*" "*" " " " " "*" " " "*"  "*"  "*" 
6  ( 17 ) "*" "*" " " "*" " " " " "*"  "*"  "*" 
6  ( 18 ) "*" "*" " " "*" "*" "*" "*"  " "  " " 
6  ( 19 ) "*" "*" " " " " "*" "*" "*"  "*"  " " 
6  ( 20 ) "*" "*" " " "*" " " "*" "*"  "*"  " " 
7  ( 1 )  "*" "*" "*" " " " " "*" "*"  "*"  "*" 
7  ( 2 )  "*" "*" "*" "*" " " "*" "*"  " "  "*" 
7  ( 3 )  "*" "*" "*" " " "*" " " "*"  "*"  "*" 
7  ( 4 )  "*" "*" "*" "*" "*" " " "*"  " "  "*" 
7  ( 5 )  "*" "*" "*" "*" " " " " "*"  "*"  "*" 
7  ( 6 )  "*" " " "*" " " "*" "*" "*"  "*"  "*" 
7  ( 7 )  "*" " " "*" "*" "*" "*" "*"  " "  "*" 
7  ( 8 )  "*" " " "*" "*" "*" " " "*"  "*"  "*" 
7  ( 9 )  "*" " " "*" "*" " " "*" "*"  "*"  "*" 
7  ( 10 ) "*" "*" "*" " " "*" "*" "*"  "*"  " " 
7  ( 11 ) "*" "*" "*" "*" "*" "*" "*"  " "  " " 
7  ( 12 ) "*" "*" "*" "*" " " "*" "*"  "*"  " " 
7  ( 13 ) "*" "*" " " "*" "*" "*" "*"  " "  "*" 
7  ( 14 ) "*" "*" " " " " "*" "*" "*"  "*"  "*" 
7  ( 15 ) "*" "*" " " "*" " " "*" "*"  "*"  "*" 
7  ( 16 ) "*" " " " " "*" "*" "*" "*"  "*"  "*" 
7  ( 17 ) "*" "*" " " "*" "*" " " "*"  "*"  "*" 
7  ( 18 ) "*" "*" " " "*" "*" "*" "*"  "*"  " " 
7  ( 19 ) "*" "*" "*" "*" "*" " " "*"  "*"  " " 
7  ( 20 ) " " "*" "*" " " "*" "*" "*"  "*"  "*" 
8  ( 1 )  "*" "*" "*" " " "*" "*" "*"  "*"  "*" 
8  ( 2 )  "*" "*" "*" "*" "*" "*" "*"  " "  "*" 
8  ( 3 )  "*" "*" "*" "*" " " "*" "*"  "*"  "*" 
8  ( 4 )  "*" "*" "*" "*" "*" " " "*"  "*"  "*" 
8  ( 5 )  "*" " " "*" "*" "*" "*" "*"  "*"  "*" 
8  ( 6 )  "*" "*" " " "*" "*" "*" "*"  "*"  "*" 
8  ( 7 )  " " "*" "*" "*" "*" "*" "*"  "*"  "*" 
8  ( 8 )  "*" "*" "*" "*" "*" "*" " "  "*"  "*"

with(summary(fit2),
     round(cbind(which,rss,rsq,adjr2,cp,bic),3))

A matrix: 137 × 15 of type dbl
	(Intercept)	x1	x2	x3	x_1	x_2	x_3	x1x2	x1x3	x2x3	rss	rsq	adjr2	cp	bic
1	1	0	0	0	0	0	0	1	0	0	166.846	0.919	0.915	-0.414	-46.786
1	1	0	0	0	0	0	0	0	1	0	298.218	0.856	0.848	12.646	-34.590
1	1	0	0	0	1	0	0	0	0	0	298.218	0.856	0.848	12.646	-34.590
1	1	1	0	0	0	0	0	0	0	0	319.116	0.846	0.838	14.724	-33.168
1	1	0	0	0	0	1	0	0	0	0	417.740	0.798	0.787	24.528	-27.513
1	1	0	1	0	0	0	0	0	0	0	483.151	0.767	0.754	31.030	-24.458
1	1	0	0	0	0	0	0	0	0	1	512.808	0.752	0.739	33.979	-23.207
1	1	0	0	1	0	0	0	0	0	0	1774.106	0.143	0.098	159.366	2.857
1	1	0	0	0	0	0	1	0	0	0	1780.515	0.140	0.094	160.003	2.933
2	1	0	1	0	0	0	0	1	0	0	157.217	0.924	0.916	0.629	-44.990
2	1	0	0	0	0	0	0	1	0	1	158.509	0.923	0.915	0.758	-44.818
2	1	0	0	0	0	1	0	1	0	0	160.423	0.922	0.914	0.948	-44.566
2	1	0	0	0	0	0	0	1	1	0	162.563	0.921	0.913	1.161	-44.288
2	1	0	0	0	1	0	0	1	0	0	162.563	0.921	0.913	1.161	-44.288
2	1	1	0	0	0	0	0	1	0	0	163.411	0.921	0.912	1.245	-44.178
2	1	0	0	0	0	0	1	1	0	0	166.644	0.919	0.911	1.566	-43.767
2	1	0	0	1	0	0	0	1	0	0	166.747	0.919	0.910	1.576	-43.754
2	1	0	0	0	0	1	0	0	1	0	168.659	0.918	0.909	1.767	-43.515
2	1	0	0	0	1	1	0	0	0	0	168.659	0.918	0.909	1.767	-43.515
2	1	1	0	0	0	1	0	0	0	0	176.505	0.915	0.905	2.547	-42.560
2	1	0	1	0	0	0	0	0	1	0	177.768	0.914	0.905	2.672	-42.410
2	1	0	1	0	1	0	0	0	0	0	177.768	0.914	0.905	2.672	-42.410
2	1	1	1	0	0	0	0	0	0	0	188.795	0.909	0.899	3.768	-41.146
2	1	0	0	0	0	0	0	0	1	1	217.470	0.895	0.883	6.619	-38.177
2	1	0	0	0	1	0	0	0	0	1	217.470	0.895	0.883	6.619	-38.177
2	1	1	0	0	0	0	0	0	0	1	240.158	0.884	0.871	8.874	-36.093
2	1	0	1	0	0	1	0	0	0	0	257.547	0.876	0.862	10.603	-34.625
2	1	0	0	0	0	0	1	0	1	0	293.474	0.858	0.842	14.175	-31.883
2	1	0	0	0	1	0	1	0	0	0	293.474	0.858	0.842	14.175	-31.883
3	1	0	1	0	1	0	0	1	0	0	132.668	0.936	0.925	0.189	-45.511
⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮	⋮
6	1	1	1	0	0	1	1	1	1	0	115.569	0.944	0.920	4.489	-39.275
6	1	1	1	0	1	0	1	1	1	0	115.767	0.944	0.920	4.508	-39.239
7	1	1	1	1	0	0	1	1	1	1	110.658	0.947	0.918	6.001	-37.142
7	1	1	1	1	1	0	1	1	0	1	110.658	0.947	0.918	6.001	-37.142
7	1	1	1	1	0	1	0	1	1	1	110.998	0.946	0.917	6.034	-37.078
7	1	1	1	1	1	1	0	1	0	1	110.998	0.946	0.917	6.034	-37.078
7	1	1	1	1	1	0	0	1	1	1	110.998	0.946	0.917	6.034	-37.078
7	1	1	0	1	0	1	1	1	1	1	112.107	0.946	0.917	6.145	-36.869
7	1	1	0	1	1	1	1	1	0	1	112.107	0.946	0.917	6.145	-36.869
7	1	1	0	1	1	1	0	1	1	1	112.112	0.946	0.917	6.145	-36.868
7	1	1	0	1	1	0	1	1	1	1	112.246	0.946	0.917	6.158	-36.843
7	1	1	1	1	0	1	1	1	1	0	112.818	0.945	0.916	6.215	-36.736
7	1	1	1	1	1	1	1	1	0	0	112.818	0.945	0.916	6.215	-36.736
7	1	1	1	1	1	0	1	1	1	0	112.974	0.945	0.916	6.231	-36.707
7	1	1	1	0	1	1	1	1	0	1	113.220	0.945	0.916	6.255	-36.662
7	1	1	1	0	0	1	1	1	1	1	113.220	0.945	0.916	6.255	-36.662
7	1	1	1	0	1	0	1	1	1	1	113.233	0.945	0.916	6.257	-36.659
7	1	1	0	0	1	1	1	1	1	1	114.787	0.945	0.915	6.411	-36.373
7	1	1	1	0	1	1	0	1	1	1	115.049	0.944	0.914	6.437	-36.325
7	1	1	1	0	1	1	1	1	1	0	115.569	0.944	0.914	6.489	-36.230
7	1	1	1	1	1	1	0	1	1	0	115.969	0.944	0.914	6.529	-36.158
7	1	0	1	1	0	1	1	1	1	1	125.666	0.939	0.907	7.493	-34.471
8	1	1	1	1	0	1	1	1	1	1	110.652	0.947	0.911	8.000	-34.099
8	1	1	1	1	1	1	1	1	0	1	110.652	0.947	0.911	8.000	-34.099
8	1	1	1	1	1	0	1	1	1	1	110.658	0.947	0.911	8.001	-34.098
8	1	1	1	1	1	1	0	1	1	1	110.998	0.946	0.911	8.034	-34.033
8	1	1	0	1	1	1	1	1	1	1	112.107	0.946	0.910	8.145	-33.824
8	1	1	1	0	1	1	1	1	1	1	113.220	0.945	0.909	8.255	-33.617
8	1	0	1	1	1	1	1	1	1	1	125.666	0.939	0.899	9.493	-31.427
8	1	1	1	1	1	1	1	0	1	1	127.263	0.938	0.897	9.651	-31.162

cp기준으로는 x1x2를 선택한 변수 1개가 가장 적절하다.

fit3 <- regsubsets(y~., data=dt2, nbest=1, method='exhaustive',)
summary(fit3)
with(summary(fit3),
     round(cbind(which,rss,rsq,adjr2,cp,bic),3))

Warning message in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in = force.in, :
“1  linear dependencies found”

Reordering variables and trying again:

Subset selection object
Call: regsubsets.formula(y ~ ., data = dt2, nbest = 1, method = "exhaustive", 
    )
9 Variables  (and intercept)
     Forced in Forced out
x1       FALSE      FALSE
x2       FALSE      FALSE
x3       FALSE      FALSE
x_1      FALSE      FALSE
x_2      FALSE      FALSE
x_3      FALSE      FALSE
x1x2     FALSE      FALSE
x2x3     FALSE      FALSE
x1x3     FALSE      FALSE
1 subsets of each size up to 8
Selection Algorithm: exhaustive
         x1  x2  x3  x_1 x_2 x_3 x1x2 x1x3 x2x3
1  ( 1 ) " " " " " " " " " " " " "*"  " "  " " 
2  ( 1 ) " " "*" " " " " " " " " "*"  " "  " " 
3  ( 1 ) " " "*" " " " " " " " " "*"  "*"  " " 
4  ( 1 ) "*" "*" " " " " " " " " "*"  "*"  " " 
5  ( 1 ) "*" " " "*" "*" " " " " "*"  " "  "*" 
6  ( 1 ) "*" "*" "*" "*" " " " " "*"  " "  "*" 
7  ( 1 ) "*" "*" "*" "*" " " "*" "*"  " "  "*" 
8  ( 1 ) "*" "*" "*" "*" "*" "*" "*"  " "  "*"

A matrix: 8 × 15 of type dbl
	(Intercept)	x1	x2	x3	x_1	x_2	x_3	x1x2	x1x3	x2x3	rss	rsq	adjr2	cp	bic
1	1	0	0	0	0	0	0	1	0	0	166.846	0.919	0.915	-0.414	-46.786
2	1	0	1	0	0	0	0	1	0	0	157.217	0.924	0.916	0.629	-44.990
3	1	0	1	0	0	0	0	1	1	0	132.668	0.936	0.925	0.189	-45.511
4	1	1	1	0	0	0	0	1	1	0	120.807	0.942	0.927	1.009	-44.433
5	1	1	0	1	1	0	0	1	0	1	112.246	0.946	0.928	2.158	-42.932
6	1	1	1	1	1	0	0	1	0	1	110.998	0.946	0.923	4.034	-40.122
7	1	1	1	1	1	0	1	1	0	1	110.658	0.947	0.918	6.001	-37.142
8	1	1	1	1	1	1	1	1	0	1	110.652	0.947	0.911	8.000	-34.099

실험번호	\(x_1\)	\(x_2\)	\(x_3\)	\(y\)
1	80	27	89	42
2	80	27	88	37
3	75	25	90	37
4	62	24	87	28
5	62	22	87	18
6	62	23	87	18
7	62	24	93	19
8	62	24	93	20
9	58	23	87	15
10	58	18	90	14
11	58	18	89	14
12	58	17	88	13
13	58	18	82	11
14	58	19	93	12
15	50	18	89	8
16	50	18	86	7
17	50	19	72	8
18	50	19	79	8
19	50	20	80	9
20	56	20	82	15
21	70	20	91	15