Lab 3B - Regression (Hard)

1. Explore the Data

Set the working directory to “C:/Workshop/Data”.

setwd("C:/Workshop/Data")

Load the Rates.csv file into a data frame called “policies”.

policies <- read.csv("Rates.csv")

Inspect the data.

head(policies)

##   Gender State State.Rate Height Weight      BMI Age       Rate
## 1   Male    MA 0.10043368    184   67.8 20.02599  77 0.33200000
## 2   Male    VA 0.14172319    163   89.4 33.64824  82 0.86914779
## 3   Male    NY 0.09080315    170   81.2 28.09689  31 0.01000000
## 4   Male    TN 0.11997276    175   99.7 32.55510  39 0.02153204
## 5   Male    FL 0.11034460    184   72.1 21.29608  68 0.14975000
## 6   Male    WA 0.16292470    166   98.4 35.70910  64 0.21123703

Summarize the data.

summary(policies)

##     Gender        State        State.Rate         Height     
##  Female:956   CA     : 191   Min.   :0.0010   Min.   :150.0  
##  Male  :986   TX     : 169   1st Qu.:0.1103   1st Qu.:162.0  
##               FL     : 104   Median :0.1276   Median :170.0  
##               NY     :  94   Mean   :0.1381   Mean   :169.7  
##               IL     :  80   3rd Qu.:0.1443   3rd Qu.:176.0  
##               OH     :  77   Max.   :0.3181   Max.   :190.0  
##               (Other):1227                                   
##      Weight            BMI             Age             Rate        
##  Min.   : 44.10   Min.   :16.02   Min.   :18.00   Min.   :0.00100  
##  1st Qu.: 68.60   1st Qu.:23.74   1st Qu.:34.00   1st Qu.:0.01475  
##  Median : 81.30   Median :28.06   Median :51.00   Median :0.04628  
##  Mean   : 81.16   Mean   :28.29   Mean   :50.84   Mean   :0.13806  
##  3rd Qu.: 93.80   3rd Qu.:32.46   3rd Qu.:68.00   3rd Qu.:0.17269  
##  Max.   :116.50   Max.   :46.80   Max.   :84.00   Max.   :0.99900  
##

Load the RColorBrewer package.

library(RColorBrewer)

Create a “Reds” color palette with 9 colors.

palette <- brewer.pal(9, "Reds")

Create a scatterplot matrix using the following code.

plot(
  x = policies,
  col = palette[cut(
    x = policies$Rate, 
    breaks = 9)])

Load the corrgram package.

library(corrgram)

Create a correlogram of policies.

corrgram(policies)

Inspect the correlation coefficients.

cor(policies[3:8])

##              State.Rate      Height      Weight         BMI         Age
## State.Rate  1.000000000 -0.01652294 0.009233267  0.01924141  0.11234748
## Height     -0.016522938  1.00000000 0.238085304 -0.31696110 -0.16478131
## Weight      0.009233267  0.23808530 1.000000000  0.83962760  0.01167918
## BMI         0.019241409 -0.31696110 0.839627602  1.00000000  0.10231657
## Age         0.112347476 -0.16478131 0.011679178  0.10231657  1.00000000
## Rate        0.226852143 -0.12858150 0.060939196  0.14050657  0.78007905
##                  Rate
## State.Rate  0.2268521
## Height     -0.1285815
## Weight      0.0609392
## BMI         0.1405066
## Age         0.7800790
## Rate        1.0000000

Question: Which variable is most strongly correlated with mortality rate?
Get the correlation for age and rate.

cor(
  x = policies$Age, 
  y = policies$Rate)

## [1] 0.780079

Create a scatterplot of Age and Rate.

plot(
  x = policies$Age, 
  y = policies$Rate)

Question: Does this seem like a good relationship to model with linear regression? Why or why not?

2. Split the Data into Test and Training Sets

Set the seed to 42 to make randomness reproducable.

set.seed(42)

Load the caret package

library(caret)

Create training set indexes using the following code.

indexes <- createDataPartition(
  y = policies$Rate,
  p = 0.80,
  list = FALSE)

Create the training set using the row indexes.

train <- policies[indexes, ]

Create a test set using the remaining rows.

test <- policies[-indexes, ]

Verify the number of rows in the training set.

print(nrow(train))

## [1] 1555

Verify the number of rows in the test set.

print(nrow(test))

## [1] 387

3. Predict with Simple Linear Regression

Create a simple linear regression model.

simpleModel <- lm(
  formula = Rate ~ Age,
  data = train)

Draw a regression line on the plot.

plot(
  x = policies$Age, 
  y = policies$Rate)
  
lines(
  x = train$Age,
  y = simpleModel$fitted, 
  col = "red",
  lwd = 3)

Summarize the model.

summary(simpleModel)

## 
## Call:
## lm(formula = Rate ~ Age, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18237 -0.09092 -0.02208  0.06002  0.62697 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.2650115  0.0087679  -30.23   <2e-16 ***
## Age          0.0079630  0.0001609   49.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1236 on 1553 degrees of freedom
## Multiple R-squared:  0.6121, Adjusted R-squared:  0.6118 
## F-statistic:  2450 on 1 and 1553 DF,  p-value: < 2.2e-16

Create predictions with the test set.

simplePredictions <- predict(
  object = simpleModel,
  newdata = test)

Plot the predictions on a scatterplot of Age vs Rate.

plot(
  x = policies$Age, 
  y = policies$Rate)


points(
  x = test$Age,
  y = simplePredictions,
  col = "blue",
  pch = 4,
  lwd = 2)

Compute the Root Mean Squared Error (RMSE) for the predictions.

simpleRMSE <- sqrt(mean((test$Rate - simplePredictions)^2))

Inspect the RMSE.

print(simpleRMSE)

## [1] 0.1148266

Question: Why was simple linear regression a poor choice for modeling this relationship?

4. Predict with Multiple Linear Regression

Create multiple linear regression model.

multipleModel <- lm(
  formula = Rate ~ Age + Gender + State.Rate + BMI,
  data = train)

Inspect the model.

summary(multipleModel)

## 
## Call:
## lm(formula = Rate ~ Age + Gender + State.Rate + BMI, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.24620 -0.08738 -0.02936  0.05979  0.60437 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.4233471  0.0189843 -22.300  < 2e-16 ***
## Age          0.0077309  0.0001561  49.538  < 2e-16 ***
## GenderMale   0.0355968  0.0060606   5.873 5.21e-09 ***
## State.Rate   0.6258740  0.0686779   9.113  < 2e-16 ***
## BMI          0.0023340  0.0005240   4.454 9.03e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1188 on 1550 degrees of freedom
## Multiple R-squared:  0.6424, Adjusted R-squared:  0.6415 
## F-statistic: 696.3 on 4 and 1550 DF,  p-value: < 2.2e-16

Create predictions with test set.

multiplePredictions <- predict(
  object = multipleModel,
  newdata = test)

Plot the predictions on a scatterplot of Age vs Rate.

plot(
  x = policies$Age, 
  y = policies$Rate)

points(
  x = test$Age,
  y = multiplePredictions,
  col = "blue",
  pch = 4,
  lwd = 2)

Compute the RMSE for the predictions.

multipleRMSE <- sqrt(mean((test$Rate - multiplePredictions)^2))

Inspect the RMSE.

print(multipleRMSE)

## [1] 0.1102457

Question: What problems are there with using multiple linear regression to model this relationship?

6. Predict with Neural Network Regression

Create a function to scale and center.

normalize <- function(x) {
  (x - min(x)) / (max(x) - min(x)) - 0.5
}

Create a function to undo scale and center.

denormalize <- function(x, y) {
  ((x + 0.5) * (max(y) - min(y))) + min(y)
}

Scale and center the policies data set.

scaledPolicies <- data.frame(
  Gender = policies$Gender,
  State.Rate = normalize(policies$State.Rate),
  BMI = normalize(policies$BMI),
  Age = normalize(policies$Age),
  Rate = normalize(policies$Rate))

Create a scaled training set using the row indexes.

scaledTrain <- scaledPolicies[indexes, ]

Create a scaled test set from remaining indexes.

scaledTest <- scaledPolicies[-indexes, ]

Load the nnet package.

library(nnet)

Create a neural network regressor.

neuralRegressor <- nnet(
  formula = Rate ~ .,
  data = scaledTrain,
  linout = TRUE,
  size = 5,
  decay = 0.0001,
  maxit = 1000)

Predict new values with the model.

scaledPredictions <- predict(
  object = neuralRegressor, 
  newdata = scaledTest)

Denormalize the predictions.

neuralPredictions <- denormalize(
  x = scaledPredictions, 
  y = policies$Rate)

Plot the predictions on scatterplot of Age vs Rate.

plot(
  x = train$Age, 
  y = train$Rate)

points(
  x = test$Age,
  y = neuralPredictions,
  col = "blue",
  pch = 4,
  lwd = 2)

Load the NeuralNetTools package.

library(NeuralNetTools)

Plot the neural network.

plotnet(neuralRegressor)

Compute the RMSE for the predictions.

neuralRMSE <- sqrt(mean((test$Rate - neuralPredictions)^2))

Inspect the RMSE

print(neuralRMSE)

## [1] 0.039926

Question: Why was a neural network able to more accurately model this relationship?

7. Evaluate the Regression Models

Compare the RMSE for all three models.

print(simpleRMSE)

## [1] 0.1148266

print(multipleRMSE)

## [1] 0.1102457

print(neuralRMSE)

## [1] 0.039926

Question: Which model would you choose? Why?