응용통계학 과제 20220512

  • 202150754 최서연

    다중공선성

다중공선성이 존재하는 상황을 가정하고

다중공선성을 어느 정도 제거한 모형 (M1)과 다중공선성이 내재되어 있는 모형 (M2) 을 고려하여

두 모형의 예측력을 모의실험을 통해 비교하여라,

단, 실험은 여러 번 반복하여 평균적인 결과를 report하되 설명변수의 개수는 3개 이상으로 설정하여라.

이미 존재하는 문서들을 참고하거나 재현해도 무방함.

(첨부된 문서 참고)

Nrep = 200

n = 1000

hatb1 <- hatb2 <- c()
te1 <- te2 <- c()

for (k in 1:Nrep)
{
x1 = runif(n,-3,3)
x2 = x1 + rnorm(n,0,0.01)
#X = model.matrix(~ x1 + x2)
#solve(t(X)%*%X)
y = 2 + x1 + 2*x2 + rnorm(n)

ind = sample(1:n,500)
tx1 = x1[ind]
tx2 = x2[ind]
test_x1 = x1[-ind]
test_x2 = x2[-ind]
ty  = y[ind]
test_y = y[-ind]

fit1 = lm(ty~tx1+tx2)
fit2 = lm(ty~tx1)

hatb1[k] = fit1$coefficients[2]
hatb2[k] = fit2$coefficients[2]

te1[k] = mean((test_y - predict(fit1,newdata=data.frame(test_x1,test_x2)))^2)
te2[k] = mean((test_y - predict(fit2,newdata=data.frame(test_x1)))^2)
#summary(lm(y~x1+x2))
#print(k)
}

c(mean(te1),mean(te2))
c(mean((hatb1-1)^2),mean((hatb2-2)^2))
<ol class=list-inline>
  • 55.0383388997727
  • 55.0360914185699
  • </ol>
    <ol class=list-inline>
  • 23.3087271671364
  • 1.00460057563712
  • </ol>

    library(regclass)
    
    library(car)
    

    시도 1

    Nrep = 200
    
    n = 1000
    
    hatb1 <- hatb2 <- c()
    te1 <- te2 <- c()
    
    for (k in 1:Nrep)
    {
    x1 = runif(n,-3,3)
    x2 = x1 + rnorm(n,0,0.01)
    x3 = x1^2 + rnorm(n,0,0.01)
    y = 2 + x1 + 2*x2 + 3*x3 + rnorm(n)
    
    ind = sample(1:n,500)
    tx1 = x1[ind]
    tx2 = x2[ind]
    tx3 = x3[ind]
    test_x1 = x1[-ind]
    test_x2 = x2[-ind]
    test_x3 = x3[-ind]
    ty  = y[ind]
    test_y = y[-ind]
    
    M2 = lm(ty~tx1+tx2+tx3)
    M1 = lm(ty~tx1)
    
    hatb1[k] = M2$coefficients[2]
    hatb2[k] = M1$coefficients[2]
    
    te1[k] = mean((test_y - predict(M1,newdata=data.frame(test_x1,test_x2,test_x3)))^2)
    te2[k] = mean((test_y - predict(M1,newdata=data.frame(test_x1)))^2)
    }
    
    c(mean(te1),mean(te2))
    c(mean((hatb1-2)^2),mean((hatb2-1)^2))
    
    <ol class=list-inline>
  • 120.706759195379
  • 120.706759195379
  • </ol>
    <ol class=list-inline>
  • 19.0165546979099
  • 4.16295517171254
  • </ol>

    시도 2

    • $y_1=3+x_1+1.5x_2+3.5x_3+\epsilon$
      • $x_2=x_1+\epsilon$
      • $x_3=x_1+\epsilon$
    • 1000번 반복
    • 500개 데이터
    x1 = runif(500)
    x2 = x1 + rnorm(500,0,0.01)
    x3 = x1 + rnorm(500,0,0.1)
    y1= 3 + x1 + 1.5*x2 + 3.5*x3 + rnorm(500)
    
    cor(x1,x2)
    cor(x1,x3)
    
    0.999394817060633
    0.948251870423055

    높은 상관계수 확인

    M1 = lm(y1~x1)
    
    M2 = lm(y1~x1+x2+x3)
    
    VIF(M2)
    
    <dl class=dl-inline>
    x1
    833.634174212119
    x2
    826.540055814332
    x3
    9.91994775238591
    </dl>

    다중공선성 존재를 가정한 모형의 VIF 10 넘는 모습이었다.

    print(M1$coefficients)
    
    (Intercept)          x1 
       3.101744    5.896756 
    
    print(M2$coefficients)
    
    (Intercept)          x1          x2          x3 
      3.1254411   2.3697439  -0.3499881   3.8186590 
    
    mean((y1-predict(M1,data.frame(x1)))^2)
    
    1.20540745012315
    mean((y1-predict(M2,data.frame(x1,x2,x3)))^2)
    
    1.05650517407321

    다중공선성을 제거한 모형이 다중공선성이 있는 모형보다 제곱평균오차가 컸다.

    반복

    result1 = c()
    result2 = c()
    for (i in 1:1000){
        x1 = runif(500)
        x2 = x1 + rnorm(500,0,0.01)
        x3 = x1 + rnorm(500,0,0.01)
        y1= 3 + x1 + 1.5*x2 + 3.5*x3 + rnorm(500)
        train_x1 = x1[1:250]
        train_x2 = x2[1:250]
        train_x3 = x3[1:250]
        train_y1 = y1[1:250]
        test_x1 = x1[251:500]
        test_x2 = x2[251:500]
        test_x3 = x3[251:500]
        test_y1 = y1[251:500]
        M1 = lm(train_y1~train_x1)
        M2 = lm(train_y1~train_x1+train_x2+train_x3)
        result1[i]=mean((test_y1-predict(M1,data.frame(test_x1)))^2)
        result2[i]=mean((test_y1-predict(M2,data.frame(test_x1,test_x2,test_x3)))^2)
        }
    print(mean(result1));print(mean(result2))
    
    [1] 7.004371
    [1] 7.013314
    

    다중공선성이 있는 모형과 다중공선성이 없는 모형의 MSE가 비슷한 값이 나왔다.

    M2$coefficients
    
    <dl class=dl-inline>
    (Intercept)
    2.95166936850483
    train_x1
    6.31893566934747
    train_x2
    -7.57004919218059
    train_x3
    7.31093469844317
    </dl>
    VIF(M2)
    
    <dl class=dl-inline>
    train_x1
    1532.26129565099
    train_x2
    752.809687640852
    train_x3
    810.576270200647
    </dl>

    시도 3

    • $y_1=3+x_1+2x_2+3x_3+\epsilon$
      • $x_2=x_1^2+\epsilon$
      • $x_3=x_1^3+\epsilon$
    • 1000번 반복
    • 1000개 데이터
    result1 = c()
    result2 = c()
    for (i in 1:1000){
        x1 = runif(1000)
        x2 = x1^2 + rnorm(1000,0,0.01)
        x3 = x1^3 + rnorm(1000,0,0.01)
        y1= 3 + x1 + 2*x2 + 3*x3 + rnorm(1000)
        train_x1 = x1[1:500]
        train_x2 = x2[1:500]
        train_x3 = x3[1:500]
        train_y1 = y1[1:500]
        test_x1 = x1[501:1000]
        test_x2 = x2[501:1000]
        test_x3 = x3[501:1000]
        test_y1 = y1[501:1000]
        M1 = lm(train_y1~train_x1)
        M2 = lm(train_y1~train_x1+train_x2+train_x3)
        result1[i]=mean((test_y1-predict(M1,data.frame(test_x1)))^2)
        result2[i]=mean((test_y1-predict(M2,data.frame(test_x1,test_x2,test_x3)))^2)
        }
    print(mean(result1));print(mean(result2))
    
    [1] 6.655706
    [1] 6.899516
    
    M2$coefficients
    
    <dl class=dl-inline>
    (Intercept)
    3.1722807655406
    train_x1
    -0.803885135696428
    train_x2
    5.33385899535763
    train_x3
    1.27000093939837
    </dl>
    VIF(M2)
    
    <dl class=dl-inline>
    train_x1
    52.3160125363476
    train_x2
    285.051700873521
    train_x3
    114.224043489052
    </dl>

    시도4

    • $y_1=3+x_1+2x_2+3x_3+\epsilon$
      • 시도 3과 다른 점: $\epsilon$의 $mean=0$, $sd=0.1$ 가정
        • $x_2=x_1^2+\epsilon$
        • $x_3=x_2^3+\epsilon$
    • 1000번 반복
    • 1000개 데이터
    result1 = c()
    result2 = c()
    for (i in 1:1000){
        x1 = runif(1000)
        x2 = x1^2 + rnorm(1000,0,0.01)
        x3 = x2^3 + rnorm(1000,0,0.01)
        y1= 3 + x1 + 2*x2 + 3*x3 + rnorm(1000,0,0.1)
        train_x1 = x1[1:500]
        train_x2 = x2[1:500]
        train_x3 = x3[1:500]
        train_y1 = y1[1:500]
        test_x1 = x1[501:1000]
        test_x2 = x2[501:1000]
        test_x3 = x3[501:1000]
        test_y1 = y1[501:1000]
        M1 = lm(train_y1~train_x1)
        M2 = lm(train_y1~train_x1+train_x2+train_x3)
        result1[i]=mean((test_y1-predict(M1,data.frame(test_x1)))^2)
        result2[i]=mean((test_y1-predict(M2,data.frame(test_x1,test_x2,test_x3)))^2)
        }
    print(mean(result1));print(mean(result2))
    
    [1] 4.407255
    [1] 4.750254
    
    M2$coefficients
    
    <dl class=dl-inline>
    (Intercept)
    2.98443080143881
    train_x1
    1.10397880526529
    train_x2
    1.89042910272871
    train_x3
    3.03856902277434
    </dl>
    VIF(M2)
    
    <dl class=dl-inline>
    train_x1
    41.1249486526823
    train_x2
    83.4135981154547
    train_x3
    13.854722775645
    </dl>

    시도 5

    • $y_1=3+x_1+5x_2+10x_3+\epsilon$
      • $x_2=2x_1+\epsilon$
      • $x_3=3x_1+\epsilon$
    • 1000번 반복
    • 1000개 데이터
    result1 = c()
    result2 = c()
    for (i in 1:1000){
        x1 = runif(1000)
        x2 = x1*2 + rnorm(1000,0,0.01)
        x3 = x1*3 + rnorm(1000,0,0.01)
        y1= 3 + x1 + 5*x2 + 10*x3 + rnorm(1000)
        train_x1 = x1[1:500]
        train_x2 = x2[1:500]
        train_x3 = x3[1:500]
        train_y1 = y1[1:500]
        test_x1 = x1[501:1000]
        test_x2 = x2[501:1000]
        test_x3 = x3[501:1000]
        test_y1 = y1[501:1000]
        M1 = lm(train_y1~train_x1)
        M2 = lm(train_y1~train_x1+train_x2+train_x3)
        result1[i]=mean((test_y1-predict(M1,data.frame(test_x1)))^2)
        result2[i]=mean((test_y1-predict(M2,data.frame(test_x1,test_x2,test_x3)))^2)
        }
    print(mean(result1));print(mean(result2))
    
    [1] 280.8567
    [1] 280.8766
    
    M2$coefficients
    
    <dl class=dl-inline>
    (Intercept)
    2.94499782543414
    train_x1
    -0.851627852341095
    train_x2
    -2.05771910014527
    train_x3
    15.3323394415922
    </dl>
    VIF(M2)
    
    <dl class=dl-inline>
    train_x1
    12207.8561089719
    train_x2
    3438.30708249541
    train_x3
    8106.02210331097
    </dl>
    • $x_1$이랑 제곱관계였던 시도 4까지의 결과와 조금 다르게 거의 비슷한 모습

    시도 6

    • $y_1=3+x_1+5x_2+10x_3+\epsilon$
      • $x_2=2x_1+\epsilon$
      • $x_3=3x_1+\epsilon$
      • $x_4=4x_2+\epsilon$
    • 1000번 반복
    • 1000개 데이터
    result1 = c()
    result2 = c()
    for (i in 1:1000){
        x1 = runif(1000)
        x2 = x1*2 + rnorm(1000,0,0.01)
        x3 = x1*3 + rnorm(1000,0,0.01)
        x4 = x2*4 + rnorm(1000,0,0.01)
        y1= 3 + x1 + 5*x2 + 10*x3 + 15*x4 + rnorm(1000)
        train_x1 = x1[1:500]
        train_x2 = x2[1:500]
        train_x3 = x3[1:500]
        train_x4 = x4[1:500]
        train_y1 = y1[1:500]
        test_x1 = x1[501:1000]
        test_x2 = x2[501:1000]
        test_x3 = x3[501:1000]
        test_x4 = x4[501:1000]
        test_y1 = y1[501:1000]
        M1 = lm(train_y1~train_x1)
        M2 = lm(train_y1~train_x1+train_x2+train_x3++train_x4)
        result1[i]=mean((test_y1-predict(M1,data.frame(test_x1)))^2)
        result2[i]=mean((test_y1-predict(M2,data.frame(test_x1,test_x2,test_x3,test_x4)))^2)
        }
    print(mean(result1));print(mean(result2))
    
    [1] 4317.344
    [1] 4317.734
    
    M2$coefficients
    
    <dl class=dl-inline>
    (Intercept)
    3.06497938655941
    train_x1
    27.569754309646
    train_x2
    6.00445989616462
    train_x3
    0.271289346076251
    train_x4
    15.0658253783155
    </dl>
    VIF(M2)
    
    <dl class=dl-inline>
    train_x1
    10184.1166864687
    train_x2
    51761.1715395813
    train_x3
    7606.48940304079
    train_x4
    50649.4885768582
    </dl>

    시도 7

    • $y_1=3+x_1+5x_2+10x_3+15x_4+20x_5+\epsilon$
      • $x_2=2x_1+\epsilon$
      • $x_3=3x_1+\epsilon$
      • $x_4=4x_1+\epsilon$
      • $x_5=5x_1+\epsilon$
    • 1000번 반복
    • 1000개 데이터
    • 설명변수를 하나씩 제거해나가며 값 확인
    result1 = c()
    result2 = c()
    result2_2 = c()
    result2_3 = c()
    result2_4 = c()
    for (i in 1:1000){
        x1 = runif(1000)
        x2 = x1*2 + rnorm(1000,0,0.01)
        x3 = x1*3 + rnorm(1000,0,0.01)
        x4 = x1*4 + rnorm(1000,0,0.01)
        x5 = x1*5 + rnorm(1000,0,0.01)
        y1= 3 + x1 + 5*x2 + 10*x3 + 15*x4 + 20*x5 + rnorm(1000)
        train_x1 = x1[1:500]
        train_x2 = x2[1:500]
        train_x3 = x3[1:500]
        train_x4 = x4[1:500]
        train_x5 = x5[1:500]
        train_y1 = y1[1:500]
        test_x1 = x1[501:1000]
        test_x2 = x2[501:1000]
        test_x3 = x3[501:1000]
        test_x4 = x4[501:1000]
        test_x5 = x5[501:1000]
        test_y1 = y1[501:1000]
        M1 = lm(train_y1~train_x1)
        M2 = lm(train_y1~train_x1+train_x2)
        M2_2 = lm(train_y1~train_x1+train_x2+train_x3)
        M2_3 = lm(train_y1~train_x1+train_x2+train_x3+train_x4)
        M2_4 = lm(train_y1~train_x1+train_x2+train_x3+train_x4+train_x5)
        result1[i]=mean((test_y1-predict(M1,data.frame(test_x1)))^2)
        result2[i]=mean((test_y1-predict(M2,data.frame(test_x1,test_x2,test_x3)))^2)
        result2_2[i]=mean((test_y1-predict(M2_2,data.frame(test_x1,test_x2,test_x3)))^2)
        result2_3[i]=mean((test_y1-predict(M2_3,data.frame(test_x1,test_x2,test_x3,test_x4)))^2)
        result2_4[i]=mean((test_y1-predict(M2_4,data.frame(test_x1,test_x2,test_x3,test_x4,test_x5)))^2)
        }
    print(mean(result1));print(mean(result2));print(mean(result2_2));print(mean(result2_3));print(mean(result2_4))
    
    [1] 6743.22
    [1] 6743.23
    [1] 6743.228
    [1] 6743.258
    [1] 6743.326
    
    M2$coefficients;M2_2$coefficients;M2_3$coefficients;M2_4$coefficients
    
    <dl class=dl-inline>
    (Intercept)
    3.14229057776291
    train_x1
    191.592156934206
    train_x2
    4.592358164603
    </dl>
    <dl class=dl-inline>
    (Intercept)
    3.13946828384026
    train_x1
    152.101047376151
    train_x2
    5.48457032032642
    train_x3
    12.5700253588448
    </dl>
    <dl class=dl-inline>
    (Intercept)
    3.15352997428773
    train_x1
    101.847117641019
    train_x2
    5.39865705788164
    train_x3
    12.5031156664608
    train_x4
    12.6523453161946
    </dl>
    <dl class=dl-inline>
    (Intercept)
    3.12999846572166
    train_x1
    12.5911240468077
    train_x2
    5.34451976152817
    train_x3
    12.6847839220765
    train_x4
    12.6165982386795
    train_x5
    17.7968532012727
    </dl>
    VIF(M2);VIF(M2_2);VIF(M2_3);VIF(M2_4)
    
    <dl class=dl-inline>
    train_x1
    3199.50596330676
    train_x2
    3199.50596330676
    </dl>
    <dl class=dl-inline>
    train_x1
    11028.5645796285
    train_x2
    3215.46470823126
    train_x3
    7139.1852762786
    </dl>
    <dl class=dl-inline>
    train_x1
    23792.4840684325
    train_x2
    3215.61368413446
    train_x3
    7139.38892775602
    train_x4
    12954.7807912569
    </dl>
    <dl class=dl-inline>
    train_x1
    45574.8755824188
    train_x2
    3215.64568601484
    train_x3
    7140.20111160112
    train_x4
    12954.8367355594
    train_x5
    21640.2394284721
    </dl>

    시도 8

    • $y_1=3+x_1+1.5x_2+2x_3+2.5x_4+3x_5+\epsilon$
      • $x_2=x^2_1+\epsilon$
      • $x_3=x^3_1+\epsilon$
      • $x_4=x^4_1+\epsilon$
      • $x_5=x^5_1+\epsilon$
    • 1000번 반복
    • 1000개 데이터
    • 설명변수를 하나씩 제거해나가며 값 확인
    result1 = c()
    result2 = c()
    result2_2 = c()
    result2_3 = c()
    result2_4 = c()
    for (i in 1:1000){
        x1 = runif(1000)
        x2 = x1^2 + rnorm(1000,0,0.01)
        x3 = x1^3 + rnorm(1000,0,0.01)
        x4 = x1^4 + rnorm(1000,0,0.01)
        x5 = x1^5 + rnorm(1000,0,0.01)
        y1= 3 + x1 + 1.5*x2 + 2*x3 + 2.5*x4 + 3*x5 + rnorm(1000)
        train_x1 = x1[1:500]
        train_x2 = x2[1:500]
        train_x3 = x3[1:500]
        train_x4 = x4[1:500]
        train_x5 = x5[1:500]
        train_y1 = y1[1:500]
        test_x1 = x1[501:1000]
        test_x2 = x2[501:1000]
        test_x3 = x3[501:1000]
        test_x4 = x4[501:1000]
        test_x5 = x5[501:1000]
        test_y1 = y1[501:1000]
        M1 = lm(train_y1~train_x1)
        M2 = lm(train_y1~train_x1+train_x2)
        M2_2 = lm(train_y1~train_x1+train_x2+train_x3)
        M2_3 = lm(train_y1~train_x1+train_x2+train_x3+train_x4)
        M2_4 = lm(train_y1~train_x1+train_x2+train_x3+train_x4+train_x5)
        result1[i]=mean((test_y1-predict(M1,data.frame(test_x1)))^2)
        result2[i]=mean((test_y1-predict(M2,data.frame(test_x1,test_x2)))^2)
        result2_2[i]=mean((test_y1-predict(M2_2,data.frame(test_x1,test_x2,test_x3)))^2)
        result2_3[i]=mean((test_y1-predict(M2_3,data.frame(test_x1,test_x2,test_x3,test_x4)))^2)
        result2_4[i]=mean((test_y1-predict(M2_4,data.frame(test_x1,test_x2,test_x3,test_x4,test_x5)))^2)  
        }
    print(mean(result1));print(mean(result2));print(mean(result2_2));print(mean(result2_3));print(mean(result2_4))
    
    [1] 14.147
    [1] 15.23608
    [1] 15.31943
    [1] 15.33841
    [1] 15.34419
    
    M2$coefficients;M2_2$coefficients;M2_3$coefficients;M2_4$coefficients
    
    <dl class=dl-inline>
    (Intercept)
    3.33742167691734
    train_x1
    -4.4218260738446
    train_x2
    13.0062453144602
    </dl>
    <dl class=dl-inline>
    (Intercept)
    2.79738940769783
    train_x1
    2.92350133078144
    train_x2
    -6.31809961451126
    train_x3
    13.268899752175
    </dl>
    <dl class=dl-inline>
    (Intercept)
    2.67631232157832
    train_x1
    3.53810018129673
    train_x2
    -4.20729411947289
    train_x3
    4.72887135993403
    train_x4
    6.18767167407493
    </dl>
    <dl class=dl-inline>
    (Intercept)
    2.6944260166692
    train_x1
    2.9164772419162
    train_x2
    -1.6408374248753
    train_x3
    3.84299551629256
    train_x4
    0.495275734059115
    train_x5
    4.76860416637478
    </dl>
    VIF(M2);VIF(M2_2);VIF(M2_3);VIF(M2_4)
    
    <dl class=dl-inline>
    train_x1
    15.3582739877788
    train_x2
    15.3582739877788
    </dl>
    <dl class=dl-inline>
    train_x1
    56.311258430317
    train_x2
    329.079676685842
    train_x3
    136.482356227786
    </dl>
    <dl class=dl-inline>
    train_x1
    58.1468087888072
    train_x2
    353.043195799331
    train_x3
    498.429281610092
    train_x4
    170.495203086306
    </dl>
    <dl class=dl-inline>
    train_x1
    62.3638143924368
    train_x2
    432.602200437012
    train_x3
    507.175881850585
    train_x4
    494.547525661941
    train_x5
    202.355701338419
    </dl>

    다중공선성을 어느정도 제거한 모델(M1)과 제거하지 않은 모델들(M2,M2_2,M2_3,M2_4)을 비교해보았다.

    • 설명변수($x_1,x_2,x_3,x_4,x_5$)끼리 배수 관계에 있던 모델들과 다르게 거듭제곱 관계에 있던 모델들은 결과(MSE)가 비슷하긴 하지만 설명변수가 배수관계에 있던 모델만큼 비슷하진 않았다.
      • 배수 관계 가정: 시도2, 시도5, 시도6, 시도7
      • 거듭제곱 관계 가정: 시도3, 시도4, 시도8
    • 다중공선성을 가정하여 모델을 만들었기 때문에 당연하게 VIF;분산팽창요인이 모든 시도에서 크게 나온 결과를 확인할 수 있었다.
    • 시도2, 시도3을 비교해보니 데이터 500개를 사용할 때보다 1000개를 사용할때 다중공선성을 가정한 MSE가 조금 더 컸다. 하지만 분산퍙창계수의 차이는 1000개의 데이터를 사용했을때 더 작았다.
      • 시도2보다 시도3에서 M1과 M2의 차이가 크다고 했지만 그 크기는 0.3정도의 차이이긴 하다.
      • 분산팽창요인은 그래도 시도2와 시도3이 10을 충분히 넘긴 값이긴 헀다.
    • 시도3시도4의 차이는 반응변수 $y$의 $\epsilon$ 의 $rnorm$ 지정을 해줄때 시도4에 $mean=0, sd=0.1$을 지정해준 것이다.
      • 결과를 보니 분산팽창계수가 낮아졌다. 하지만 여전히 모두 10을 넘었다.
    • 시도7,시도8 을 미루어 보아 설명변수끼리 관계가 있고, 또 여러개 존재했을때 설명변수가 추가될수록 모형의 평균제곱오차가 조금씩 커지는 경향을 보였다.
      • 즉, 예측력이 조금씩 줄어드는 경향을 보였다.