DATA

Let's begin with a descriptive analysis of our dataset.

# Load the insurance dataset and take a first look at its variables.
data<-read.csv("insurance.csv")
head(data)
##   age    sex    bmi children smoker    region   charges
## 1  19 female 27.900        0    yes southwest 16884.924
## 2  18   male 33.770        1     no southeast  1725.552
## 3  28   male 33.000        3     no southeast  4449.462
## 4  33   male 22.705        0     no northwest 21984.471
## 5  32   male 28.880        0     no northwest  3866.855
## 6  31 female 25.740        0     no southeast  3756.622
# Numeric columns get five-number summaries; sex/smoker/region are read as
# character, so only their length/class is reported.
summary(data)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

We want to predict charges (the insurance price) using 6 variables: age, sex, BMI, children, smoker and region.

# Frequency tables for the two binary categorical variables.
table(data$sex)
## 
## female   male 
##    662    676
table(data$smoker)
## 
##   no  yes 
## 1064  274
# Scatter plots of charges against the two main numeric predictors (ggplot2).
plot.age <- ggplot(data, aes(x = age, y = charges)) +
 geom_point()

plot.bmi <- ggplot(data, aes(x = bmi, y = charges)) +
 geom_point()

# Show both scatter plots side by side (gridExtra).
grid.arrange(plot.age, plot.bmi, ncol=2)

In the first plot we see a trend that charges increase with older age. There are also three groups/lines visible. In the second plot we see some sort of trend that charges increase with higher BMI, although this is not very clear. Here there might also be two different groups.

# Boxplots of charges by each categorical predictor.
plot.sex <- ggplot(data, aes(x = sex, y = charges)) +
 geom_boxplot()

plot.smoker <- ggplot(data, aes(x = smoker, y = charges)) +
 geom_boxplot()

# children is numeric, so coerce it to a factor to get one box per count.
plot.child <- ggplot(data, aes(x = as.factor(children), y = charges)) +
 geom_boxplot()

plot.region <- ggplot(data, aes(x = region, y = charges)) +
 geom_boxplot()

grid.arrange(plot.sex, plot.smoker, plot.child, plot.region, ncol=2, nrow=2)

The first boxplot (upper left corner) shows us that females and males pay on average the same charges. Looking at the second boxplot (upper right corner), we see that smokers pay higher charges compared to non-smokers. Also, people with more children pay higher charges, and it seems that the region does not have an influence on the charges. In all instances the charges have a skewed distribution.

MULTIVARIATE REGRESSION

Let’s eliminate the column charges to create our X matrix

# Response vector: individual medical charges.
y<- data$charges
# Dummy-code the categorical predictors (reference levels: female,
# non-smoker, and region northeast).
 data$sexdummy<- ifelse(data$sex == "male", 1, 0)
data$smokerdummy<- ifelse(data$smoker == "yes", 1, 0)
data$regionsouthwest<- ifelse(data$region == "southwest", 1, 0)
data$regionsoutheast<-ifelse(data$region == "southeast", 1, 0)    
data$regionnorthwest<-ifelse(data$region == "northwest", 1, 0) 
# Predictor matrix: drop the response and the original categorical columns.
X <- subset(data, select = -c(charges, sex, smoker, region))

Now we create the correlation matrix and its graphic representation.

# Correlation matrix of the predictors. Note bmi correlates with the
# southeast region dummy (~0.27); the region dummies are mutually negatively
# correlated by construction.
cor(X)
##                           age          bmi    children     sexdummy
## age              1.0000000000  0.109271882  0.04246900 -0.020855872
## bmi              0.1092718815  1.000000000  0.01275890  0.046371151
## children         0.0424689986  0.012758901  1.00000000  0.017162978
## sexdummy        -0.0208558722  0.046371151  0.01716298  1.000000000
## smokerdummy     -0.0250187515  0.003750426  0.00767312  0.076184817
## regionsouthwest  0.0100162342 -0.006205183  0.02191358 -0.004184049
## regionsoutheast -0.0116419406  0.270024649 -0.02306575  0.017116875
## regionnorthwest -0.0004074234 -0.135995524  0.02480613 -0.011155728
##                  smokerdummy regionsouthwest regionsoutheast regionnorthwest
## age             -0.025018752     0.010016234     -0.01164194   -0.0004074234
## bmi              0.003750426    -0.006205183      0.27002465   -0.1359955237
## children         0.007673120     0.021913576     -0.02306575    0.0248061293
## sexdummy         0.076184817    -0.004184049      0.01711688   -0.0111557280
## smokerdummy      1.000000000    -0.036945474      0.06849841   -0.0369454740
## regionsouthwest -0.036945474     1.000000000     -0.34626466   -0.3208292201
## regionsoutheast  0.068498410    -0.346264661      1.00000000   -0.3462646614
## regionnorthwest -0.036945474    -0.320829220     -0.34626466    1.0000000000
# Graphical representation, variables ordered by hierarchical clustering.
# Fixed: the title previously read "Covariance matrix of residuals", but what
# is plotted is the correlation matrix of the predictors.
corrplot(cor(X), mar = c(0, 0, 1, 0),
         order = "hclust",
         main = "Correlation matrix of predictors")

Subsequently we estimate the OLS coefficients.

# Baseline OLS with all predictors (northeast is the omitted region category).
fit <- lm(charges ~ age + bmi + children + sexdummy + regionsouthwest + regionsoutheast+ regionnorthwest+ smokerdummy, data= data)

summary(fit)
## 
## Call:
## lm(formula = charges ~ age + bmi + children + sexdummy + regionsouthwest + 
##     regionsoutheast + regionnorthwest + smokerdummy, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.9  -2848.1   -982.1   1393.9  29992.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11938.5      987.8 -12.086  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## bmi                339.2       28.6  11.860  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## sexdummy          -131.3      332.9  -0.394 0.693348    
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## smokerdummy      23848.5      413.1  57.723  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7494 
## F-statistic: 500.8 on 8 and 1329 DF,  p-value: < 2.2e-16
# Relative importance of each regressor (relaimpo); type "last" measures the
# contribution when the variable is added last, rescaled to sum to one.
a<-calc.relimp(fit,type=("last"),
            rela=TRUE)
plot(a)

# Stepwise model selection by AIC (MASS::stepAIC); sexdummy and
# regionnorthwest are dropped in two steps.
fit2 <- stepAIC(fit, direction="both") #Forward and backward applied (=both)
## Start:  AIC=23316.43
## charges ~ age + bmi + children + sexdummy + regionsouthwest + 
##     regionsoutheast + regionnorthwest + smokerdummy
## 
##                   Df  Sum of Sq        RSS   AIC
## - sexdummy         1 5.7164e+06 4.8845e+10 23315
## - regionnorthwest  1 2.0183e+07 4.8860e+10 23315
## <none>                          4.8840e+10 23316
## - regionsouthwest  1 1.4829e+08 4.8988e+10 23318
## - regionsoutheast  1 1.7180e+08 4.9011e+10 23319
## - children         1 4.3755e+08 4.9277e+10 23326
## - bmi              1 5.1692e+09 5.4009e+10 23449
## - age              1 1.7124e+10 6.5964e+10 23717
## - smokerdummy      1 1.2245e+11 1.7129e+11 24993
## 
## Step:  AIC=23314.58
## charges ~ age + bmi + children + regionsouthwest + regionsoutheast + 
##     regionnorthwest + smokerdummy
## 
##                   Df  Sum of Sq        RSS   AIC
## - regionnorthwest  1 2.0094e+07 4.8865e+10 23313
## <none>                          4.8845e+10 23315
## + sexdummy         1 5.7164e+06 4.8840e+10 23316
## - regionsouthwest  1 1.4808e+08 4.8993e+10 23317
## - regionsoutheast  1 1.7159e+08 4.9017e+10 23317
## - children         1 4.3596e+08 4.9281e+10 23324
## - bmi              1 5.1645e+09 5.4010e+10 23447
## - age              1 1.7151e+10 6.5996e+10 23715
## - smokerdummy      1 1.2301e+11 1.7186e+11 24996
## 
## Step:  AIC=23313.13
## charges ~ age + bmi + children + regionsouthwest + regionsoutheast + 
##     smokerdummy
## 
##                   Df  Sum of Sq        RSS   AIC
## <none>                          4.8865e+10 23313
## + regionnorthwest  1 2.0094e+07 4.8845e+10 23315
## - regionsouthwest  1 1.3139e+08 4.8997e+10 23315
## + sexdummy         1 5.6275e+06 4.8860e+10 23315
## - regionsoutheast  1 1.5694e+08 4.9022e+10 23315
## - children         1 4.3080e+08 4.9296e+10 23323
## - bmi              1 5.1638e+09 5.4029e+10 23446
## - age              1 1.7155e+10 6.6021e+10 23714
## - smokerdummy      1 1.2317e+11 1.7203e+11 24995
# Summary of the stepwise path (initial model, final model, steps taken).
fit2$anova 
## Stepwise Model Path 
## Analysis of Deviance Table
## 
## Initial Model:
## charges ~ age + bmi + children + sexdummy + regionsouthwest + 
##     regionsoutheast + regionnorthwest + smokerdummy
## 
## Final Model:
## charges ~ age + bmi + children + regionsouthwest + regionsoutheast + 
##     smokerdummy
## 
## 
##                Step Df Deviance Resid. Df  Resid. Dev      AIC
## 1                                    1329 48839532844 23316.43
## 2        - sexdummy  1  5716429      1330 48845249273 23314.58
## 3 - regionnorthwest  1 20094242      1331 48865343515 23313.13

VALIDATION OF THE MODEL

Outliers and influential cases

# Standardized residuals of the stepwise-selected model.
data$standardized.residuals <- rstandard(fit2)

# Flag cases whose standardized residual exceeds 2 in absolute value.
data$large.residual2 <- abs(data$standardized.residuals) > 2

# Flag cases whose standardized residual exceeds 2.5 in absolute value.
data$large.residual2.5 <- abs(data$standardized.residuals) > 2.5

# Flag potential outliers: standardized residual beyond +/- 3.
data$outlier.residual <- abs(data$standardized.residuals) > 3

# Share of cases with |standardized residual| > 2 (should be 5% or lower).
paste("The percentage of cases with a standardized residual > 2 is", 
      round(100 * mean(data$large.residual2), 2), "percent.")
## [1] "The percentage of cases with a standardized residual > 2 is 5.01 percent."
# Share of cases with |standardized residual| > 2.5 (should be 1% or lower).
paste("The percentage of cases with a standardized residual > 2.5 is", 
      round(100 * mean(data$large.residual2.5), 2), "percent.")
## [1] "The percentage of cases with a standardized residual > 2.5 is 3.29 percent."
# Share of cases with |standardized residual| > 3 (should be 1% or lower).
paste("The percentage of cases with a standardized residual > 3 is", 
      round(100 * mean(data$outlier.residual), 2), "percent.")
## [1] "The percentage of cases with a standardized residual > 3 is 2.09 percent."
# Number of potential outliers.
sum(data$outlier.residual, na.rm = TRUE)
## [1] 28
Residuals <- fit2$residuals

# Standardize a numeric vector to z-scores: subtract the mean and divide by
# the sample standard deviation.
#
# Args:
#   x: numeric vector (no NA handling; NA in x yields NA output).
# Returns: numeric vector of the same length with mean 0 and sd 1.
stand <- function(x) {
  m <- mean(x)
  s <- sd(x)  # idiomatic replacement for var(x)^0.5
  (x - m) / s
}
# Standardized residuals computed with the stand() helper above.
s.resid <- stand(Residuals)

# Plot raw and standardized residuals side by side; the dashed lines mark the
# +/- 2.57 band (roughly 99% coverage under normality).
par(mfrow=c(1,2),fg = "black",bg = "white",mar=c(0.6,0.4,0.4,0.1),mai=c(0.6,0.4,0.4,0.1))
plot(fit2$residuals,xlab="",ylab="",main="Residuals",pch=21,cex.main=2,bg="grey")
grid(nx = 10, ny = 10, col = "gray", lty = "dotted")
abline(h=mean(fit2$residuals), col="red")
#abline(h=0)
plot(s.resid,xlab="",ylab="",main="Standardized Residuals",pch=21,cex.main=2,bg="grey")
grid(nx = 10, ny = 10, col = "gray", lty = "dotted")
abline(h=+2.57, col="red",lty=2)
abline(h=-2.57, col="red",lty=2)

Normality

# Normal Q-Q plot of the residuals.
qqnorm(Residuals, pch = 1, frame = FALSE)
qqline(Residuals, col = "steelblue", lwd = 2)

par(mfrow=c(1,1),fg = "black",bg = "white",mar=c(0.9,0.9,0.8,0.8),mai=c(0.9,0.9,0.8,0.8))
# car::qqPlot adds a confidence envelope and prints the indices of the two
# most extreme observations (shown below).
qqPlot(Residuals,xlab="Theoretical Quantiles",ylab="Sample Quantiles")
## [1] 1301  578
title("Normal Q-Q Plot",cex.main=2)

# Jarque-Bera test (tseries): H0 = residuals are normally distributed.
jarque.bera.test(Residuals)
## 
##  Jarque Bera Test
## 
## data:  Residuals
## X-squared = 716.59, df = 2, p-value < 2.2e-16

We reject H0, which means the residuals are not normally distributed.

Heteroskedasticity

# Breusch-Pagan test (lmtest): H0 = homoskedastic errors.
bptest(fit2)  
## 
##  studentized Breusch-Pagan test
## 
## data:  fit2
## BP = 120.83, df = 6, p-value < 2.2e-16

We reject H0, which means the residuals are heteroskedastic.

Independence

# Durbin-Watson test (car): H0 = no first-order autocorrelation of residuals.
dwt(fit2)  
##  lag Autocorrelation D-W Statistic p-value
##    1     -0.04733657       2.09192   0.098
##  Alternative hypothesis: rho != 0

We cannot reject H0 (p = 0.098), so the residuals can be considered independent.

Multicollinearity

# Variance inflation factors; values close to 1 indicate no multicollinearity.
vif(fit2)    
##             age             bmi        children regionsouthwest regionsoutheast 
##        1.016174        1.104196        1.002831        1.147367        1.244250 
##     smokerdummy 
##        1.005747

We can observe there is no multicollinearity

plot(fit2)

We have plotted four graphs. The first graph shows the fitted values against the observed residuals. In this plot the dots should be randomly placed around the horizontal zero line, which is clearly not the case. It seems that there are three groups present. Thus per group there is difference in variance across the residuals.

There is increasing variance across the residuals (heteroscedasticity), and there might also be a non-linear relationship between the outcome and the predictor. This non-linear relationship is also reflected by the second plot (Q-Q plot), since the dots deviate from the dotted line.

SOLVE PROBLEMS

Nonlinear Model

# Check whether charges look linear in age^2 rather than in age.
plot.age2 <- ggplot(data, aes(x = age^2, y = charges)) +
 geom_point()

grid.arrange(plot.age, plot.age2, ncol=2)

In the second plot we see a roughly straight line. Thus we should use age squared in our model.

# Add a squared-age term; this model uses the original factor columns
# (smoker, region), so lm() builds the dummies automatically.
data <- data %>% 
mutate(age.square = age^2) 

fit3 <- lm(charges ~ smoker + age + age.square + bmi + children + region, data = data)
summary(fit3)
## 
## Call:
## lm(formula = charges ~ smoker + age + age.square + bmi + children + 
##     region, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11726.0  -2905.4   -926.3   1306.3  30765.1 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -6654.371   1683.270  -3.953 8.12e-05 ***
## smokeryes       23846.842    409.699  58.206  < 2e-16 ***
## age               -54.266     80.962  -0.670 0.502807    
## age.square          3.925      1.010   3.886 0.000107 ***
## bmi               334.656     28.427  11.772  < 2e-16 ***
## children          640.940    143.549   4.465 8.69e-06 ***
## regionnorthwest  -366.979    473.632  -0.775 0.438584    
## regionsoutheast -1030.808    476.021  -2.165 0.030530 *  
## regionsouthwest  -956.835    475.266  -2.013 0.044288 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6028 on 1329 degrees of freedom
## Multiple R-squared:  0.7537, Adjusted R-squared:  0.7522 
## F-statistic: 508.3 on 8 and 1329 DF,  p-value: < 2.2e-16
# Color the bmi scatter by smoking status to inspect a possible interaction.
ggplot(data, aes(x = bmi, y = charges, col = smoker)) +
 geom_point()

There seems to be an interaction between smoker and bmi. Let's put this interaction in the model.

# Add the smoker x bmi interaction to the quadratic-age model.
fit4 <- lm(charges ~ smoker + age + age.square + bmi + children + region + smoker*bmi, data = data)
summary(fit4)
## 
## Call:
## lm(formula = charges ~ smoker + age + age.square + bmi + children + 
##     region + smoker * bmi, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14960.9  -1641.5  -1307.5   -861.8  31122.0 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.611e+03  1.386e+03   1.884  0.05981 .  
## smokeryes       -2.024e+04  1.636e+03 -12.366  < 2e-16 ***
## age             -3.216e+01  6.466e+01  -0.497  0.61900    
## age.square       3.735e+00  8.066e-01   4.631 4.00e-06 ***
## bmi              1.925e+01  2.544e+01   0.757  0.44919    
## children         6.710e+02  1.146e+02   5.853 6.08e-09 ***
## regionnorthwest -5.955e+02  3.783e+02  -1.574  0.11575    
## regionsoutheast -1.203e+03  3.802e+02  -3.165  0.00159 ** 
## regionsouthwest -1.225e+03  3.797e+02  -3.226  0.00129 ** 
## smokeryes:bmi    1.436e+03  5.223e+01  27.494  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4814 on 1328 degrees of freedom
## Multiple R-squared:  0.843,  Adjusted R-squared:  0.842 
## F-statistic: 792.5 on 9 and 1328 DF,  p-value: < 2.2e-16
# Diagnostic plots for the interaction model.
plot(fit4)

OLS Robust to heteroskedasticy

# Heteroskedasticity-robust (White, HC0) covariance matrix of the estimates
# (sandwich package).
vcovHC(fit4,type = "HC0")
##                 (Intercept)    smokeryes           age  age.square
## (Intercept)     1768383.787 -536385.3712 -7.349312e+04 883.9489818
## smokeryes       -536385.371 3631727.0348  4.024108e+03 -38.8390986
## age              -73493.123    4024.1081  4.302408e+03 -52.7221741
## age.square          883.949     -38.8391 -5.272217e+01   0.6597736
## bmi              -13644.507   12638.0682 -3.969268e-01  -0.3654558
## children          54160.143   -3280.3631 -3.923476e+03  49.1047956
## regionnorthwest  -76243.117   93373.3653 -8.857206e+02  11.2314136
## regionsoutheast   21020.457   33452.5943 -9.691857e+02  18.2637281
## regionsouthwest  -42854.321   45112.7451 -1.766360e+02   6.7998733
## smokeryes:bmi     14336.052 -115080.8206  1.354871e+00  -0.4054245
##                           bmi   children regionnorthwest regionsoutheast
## (Intercept)     -1.364451e+04 54160.1432    -76243.11743     21020.45717
## smokeryes        1.263807e+04 -3280.3631     93373.36533     33452.59427
## age             -3.969268e-01 -3923.4758      -885.72065      -969.18569
## age.square      -3.654558e-01    49.1048        11.23141        18.26373
## bmi              4.978710e+02   153.8422       351.80164     -3134.32032
## children         1.538422e+02 13341.8665      -508.46426     -3705.30067
## regionnorthwest  3.518016e+02  -508.4643    160651.55074     77455.27879
## regionsoutheast -3.134320e+03 -3705.3007     77455.27879    157673.39244
## regionsouthwest -1.268835e+03 -5756.1126     78694.05179     84687.33755
## smokeryes:bmi   -4.054559e+02  -119.0861     -2464.19791     -1012.35788
##                 regionsouthwest smokeryes:bmi
## (Intercept)       -42854.320528  1.433605e+04
## smokeryes          45112.745089 -1.150808e+05
## age                 -176.636008  1.354871e+00
## age.square             6.799873 -4.054245e-01
## bmi                -1268.835011 -4.054559e+02
## children           -5756.112603 -1.190861e+02
## regionnorthwest    78694.051785 -2.464198e+03
## regionsoutheast    84687.337548 -1.012358e+03
## regionsouthwest   135853.534113 -1.057782e+03
## smokeryes:bmi      -1057.781672  3.792014e+03
# Coefficient tests (lmtest::coeftest) using the robust standard errors.
coeftest(fit4, vcov = vcovHC(fit4,type = "HC0"))  #HC
## 
## t test of coefficients:
## 
##                    Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept)      2.6107e+03  1.3298e+03   1.9632 0.0498323 *  
## smokeryes       -2.0236e+04  1.9057e+03 -10.6184 < 2.2e-16 ***
## age             -3.2162e+01  6.5593e+01  -0.4903 0.6239868    
## age.square       3.7351e+00  8.1226e-01   4.5984 4.663e-06 ***
## bmi              1.9255e+01  2.2313e+01   0.8629 0.3883313    
## children         6.7099e+02  1.1551e+02   5.8091 7.853e-09 ***
## regionnorthwest -5.9546e+02  4.0081e+02  -1.4856 0.1376176    
## regionsoutheast -1.2034e+03  3.9708e+02  -3.0306 0.0024881 ** 
## regionsouthwest -1.2248e+03  3.6858e+02  -3.3230 0.0009148 ***
## smokeryes:bmi    1.4360e+03  6.1579e+01  23.3201 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

BOX and COX

# Log-transform the response (Box-Cox with lambda = 0) and refit the
# interaction model.
fitN <- lm(log(charges) ~ smoker + age + age.square + bmi + children + region + smoker*bmi, data = data)
summary(fitN)
## 
## Call:
## lm(formula = log(charges) ~ smoker + age + age.square + bmi + 
##     children + region + smoker * bmi, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.89022 -0.18281 -0.06178  0.04568  2.22354 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.9750692  0.1237014  56.386  < 2e-16 ***
## smokeryes        0.1702813  0.1460671   1.166 0.243915    
## age              0.0537165  0.0057717   9.307  < 2e-16 ***
## age.square      -0.0002377  0.0000720  -3.301 0.000989 ***
## bmi              0.0034600  0.0022704   1.524 0.127761    
## children         0.0924349  0.0102331   9.033  < 2e-16 ***
## regionnorthwest -0.0695986  0.0337700  -2.061 0.039501 *  
## regionsoutheast -0.1624147  0.0339368  -4.786 1.89e-06 ***
## regionsouthwest -0.1370804  0.0338895  -4.045 5.54e-05 ***
## smokeryes:bmi    0.0448377  0.0046622   9.617  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4297 on 1328 degrees of freedom
## Multiple R-squared:  0.7831, Adjusted R-squared:  0.7816 
## F-statistic: 532.7 on 9 and 1328 DF,  p-value: < 2.2e-16
# Diagnostic plots for the log-response model.
plot(fitN)

# Compare the distribution of the original charges with the fitted values of
# the log-transformed model, each overlaid with a normal density curve.
y_p<-fitN$fitted.values
library(car)
par(mfrow=c(1,2), fg = "black",bg = "white",mar=c(0.6,0.6,0.6,0.1),mai=c(0.6,0.6,0.6,0.1))
# Fixed plot-title typo: "Orginal" -> "Original".
hist(y,freq=F,main="Original charges distribution",
     xlab="",ylab="",las=0, cex.main=1)
curve(dnorm(x,mean(y),sd(y)),add=T)
box()
hist(y_p,freq=F,main="Charges distribution \n after the Box and Cox Transformation",
     xlab="",ylab="",las=0, cex.main=1)
curve(dnorm(x,mean(y_p),sd(y_p)),add=T)
box()

ResidualsN<-fitN$residuals
# Jarque-Bera test on the log-model residuals.
jarque.bera.test(ResidualsN)
## 
##  Jarque Bera Test
## 
## data:  ResidualsN
## X-squared = 2332.4, df = 2, p-value < 2.2e-16
# Alternative Box-Cox style transformation with lambda = 2:
# y -> (y^2 - 1) / 2.
fitX <- lm((((charges^2)-1)/2) ~ smoker + age + age.square + bmi + children + region + smoker*bmi, data = data)
# Fixed: the original called summary(fitN) here a second time; the model just
# fitted is fitX, so summarize that one.
summary(fitX)
## NOTE(review): the output below was rendered from the original (incorrect)
## summary(fitN) call and must be regenerated for fitX.
## 
## Call:
## lm(formula = log(charges) ~ smoker + age + age.square + bmi + 
##     children + region + smoker * bmi, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.89022 -0.18281 -0.06178  0.04568  2.22354 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.9750692  0.1237014  56.386  < 2e-16 ***
## smokeryes        0.1702813  0.1460671   1.166 0.243915    
## age              0.0537165  0.0057717   9.307  < 2e-16 ***
## age.square      -0.0002377  0.0000720  -3.301 0.000989 ***
## bmi              0.0034600  0.0022704   1.524 0.127761    
## children         0.0924349  0.0102331   9.033  < 2e-16 ***
## regionnorthwest -0.0695986  0.0337700  -2.061 0.039501 *  
## regionsoutheast -0.1624147  0.0339368  -4.786 1.89e-06 ***
## regionsouthwest -0.1370804  0.0338895  -4.045 5.54e-05 ***
## smokeryes:bmi    0.0448377  0.0046622   9.617  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4297 on 1328 degrees of freedom
## Multiple R-squared:  0.7831, Adjusted R-squared:  0.7816 
## F-statistic: 532.7 on 9 and 1328 DF,  p-value: < 2.2e-16
ResidualsX<-fitX$residuals
# The huge Jarque-Bera statistic shows lambda = 2 makes the residuals far
# less normal than the log transformation.
jarque.bera.test(ResidualsX)
## 
##  Jarque Bera Test
## 
## data:  ResidualsX
## X-squared = 31063, df = 2, p-value < 2.2e-16

EVALUATION OF PREDICTIONS

Fit

Let’s use a 10 fold cross validation

# 10-fold CV (caret) for the full dummy-coded linear model.
ctrl <- trainControl(method = "cv", number = 10)
cv_results <- train(charges ~ age + bmi + children + sexdummy +regionnorthwest + regionsoutheast+ regionsouthwest+ smokerdummy, data = data, method = "lm", trControl = ctrl)

print(cv_results)
## Linear Regression 
## 
## 1338 samples
##    8 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1204, 1205, 1203, 1205, 1203, 1203, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   6069.809  0.7479912  4205.872
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV for the reduced model (age, bmi, children, smoker only).
ctrl <- trainControl(method = "cv", number = 10)
cv_results <- train(charges ~ age + bmi + children + smokerdummy, data = data, method = "lm", trControl = ctrl)

print(cv_results)
## Linear Regression 
## 
## 1338 samples
##    4 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1204, 1203, 1205, 1204, 1206, 1204, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   6068.695  0.7487517  4197.798
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV for the quadratic-age model (fit3's specification).
ctrl <- trainControl(method = "cv", number = 10)
cv_results <- train(charges ~ smoker + age + age.square + bmi + children + region, data = data, method = "lm", trControl = ctrl)

print(cv_results)
## Linear Regression 
## 
## 1338 samples
##    6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1205, 1205, 1205, 1204, 1204, 1203, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   6020.295  0.7529543  4145.761
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV with the smoker x bmi interaction (fit4's specification);
# clearly the best RMSE/Rsquared of the charges-scale models.
ctrl <- trainControl(method = "cv", number = 10)
cv_results <- train(charges ~ smoker + age + age.square + bmi + children + region + bmi*smoker, data = data, method = "lm", trControl = ctrl)

print(cv_results)
## Linear Regression 
## 
## 1338 samples
##    6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1203, 1204, 1205, 1205, 1203, 1203, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   4822.885  0.8381332  2907.501
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# 10-fold CV for the log-response model; note its RMSE/MAE are on the log
# scale and are not directly comparable with the models above.
ctrl <- trainControl(method = "cv", number = 10)
cv_results <- train(log(charges) ~ smoker + age + age.square + bmi + children + region + bmi*smoker, data = data, method = "lm", trControl = ctrl)

print(cv_results)
## Linear Regression 
## 
## 1338 samples
##    6 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1203, 1203, 1204, 1206, 1204, 1204, ... 
## Resampling results:
## 
##   RMSE       Rsquared  MAE      
##   0.4277992  0.782956  0.2671705
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

LASSO REGRESSION

# Design matrix for the LASSO.
# NOTE(review): the chained assignment `X.l <- X <- ...` also overwrites the
# earlier X (different columns and order); the nonzero-coefficient column
# selection further below relies on this overwritten X -- confirm intentional.
X.l <- X <- subset(data, select = c(smokerdummy, age, age.square, bmi, children, regionsoutheast,regionsouthwest,regionnorthwest))
X.l <- as.matrix(X.l)
# LASSO with lambda chosen by plain cross-validation (minimum-CV-error rule).
fit.lasso <- Lasso(x = X.l, y = y, fix.lambda=F, cv.method = "cv", 
                   parallel = F, intercept = T) 
fit.lasso
## $beta0
## [1] -7515.049
## 
## $beta
## [1] 23793.928513     0.000000     3.239569   330.172639   594.247121
## [6]  -860.311194  -800.699683  -215.361770
## 
## $lambda
## [1] 20.53189
## 
## $meanx
##     smokerdummy             age      age.square             bmi        children 
##       0.2047833      39.2070254    1734.4446936      30.6633969       1.0949178 
## regionsoutheast regionsouthwest regionnorthwest 
##       0.2720478       0.2428999       0.2428999 
## 
## $mu
## [1] 13270.42

Let's use cross-validation with the one-standard-error rule.

# LASSO with lambda chosen by the one-standard-error rule: a larger lambda,
# hence a sparser model (the region dummies are shrunk to zero).
fit.lasso2 <- Lasso(x = X.l, y = y, fix.lambda=F, cv.method = "cv1se", 
                    parallel = F, intercept = T)
fit.lasso2
## $beta0
## [1] -3529.093
## 
## $beta
## [1] 22209.621284    27.820443     2.392107   226.080084    72.369103
## [6]     0.000000     0.000000     0.000000
## 
## $lambda
## [1] 641.7686
## 
## $meanx
##     smokerdummy             age      age.square             bmi        children 
##       0.2047833      39.2070254    1734.4446936      30.6633969       1.0949178 
## regionsoutheast regionsouthwest regionnorthwest 
##       0.2720478       0.2428999       0.2428999 
## 
## $mu
## [1] 13270.42

And then a ten fold cross validation

# Keep only the predictors with nonzero LASSO coefficients and cross-validate
# an OLS refit on that reduced set.
X.lasso <- as.matrix(X[,which(fit.lasso$beta!=0)])
ctrl_lasso <- trainControl(method = "cv", number = 10)
cv_results_lasso <- train(y ~ ., data = data.frame(y = y, X.lasso), method = "lm", trControl = ctrl_lasso)

print(cv_results_lasso)
## Linear Regression 
## 
## 1338 samples
##    7 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1204, 1204, 1205, 1203, 1203, 1205, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   6024.971  0.7505328  4147.351
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

RIDGE REGRESSION

fit.R <- glmnet(x = X.l, y = y, alpha = 0)

First we perform a k-fold cross-validation to find the optimal lambda value that minimizes the MSE.

# k-fold cross-validation to pick the lambda that minimizes the CV MSE.
cv_model <- cv.glmnet(x = X.l, y = y, alpha = 0)

best_lambda <- cv_model$lambda.min
best_lambda
## [1] 953.006

Let’s do the plot of MSE as a function of the lambda’s value

plot(cv_model)

The coefficients of the best model are the following:

# Refit the ridge model at the CV-optimal lambda and inspect its coefficients.
best_model <- glmnet(x = X.l, y = y, alpha = 0, lambda = best_lambda)
coef(best_model)
## 9 x 1 sparse Matrix of class "dgCMatrix"
##                           s0
## (Intercept)     -8123.093850
## smokerdummy     22081.680556
## age               105.336930
## age.square          1.812914
## bmi               309.122218
## children          521.561554
## regionsoutheast  -693.982439
## regionsouthwest  -806.627975
## regionnorthwest  -279.400542

Now we use the fitted best model to make predictions.

# In-sample predictions of the ridge model at the CV-optimal lambda.
y_predicted <- predict(best_model, s = best_lambda, newx = X.l)

# Calculate Mean Squared Error (MSE)
mse <- mean((y_predicted - y)^2)
print(paste("Mean Squared Error (MSE):", mse))
## [1] "Mean Squared Error (MSE): 36758052.3937664"
# Calculate Root Mean Squared Error (RMSE)
rmse <- sqrt(mse)
print(paste("Root Mean Squared Error (RMSE):", rmse))
## [1] "Root Mean Squared Error (RMSE): 6062.8419403582"
# Calculate R-squared as 1 - SSE/SST. Fixed: the original used SSR/SST with
# SSR = sum((y_hat - mean(y))^2); that equals 1 - SSE/SST only for an OLS fit
# with intercept, not for shrunken ridge predictions, so it understated the
# fit here.
y_mean <- mean(y)
sst <- sum((y - y_mean)^2)
sse <- sum((y - y_predicted)^2)
r_squared <- 1 - sse / sst
print(paste("R-squared:", r_squared))
## NOTE(review): the output below was rendered with the old SSR/SST formula
## and must be regenerated.
## [1] "R-squared: 0.652592619035881"