1. Explore the Data

  1. Set working directory
# Point the session at the workshop data folder so the CSV below resolves.
# NOTE(review): setwd() with an absolute path makes the script machine-
# specific; prefer relative paths (or here::here()) for portability.
setwd("C:/Workshop/Data")
  1. Read CSV file
# Load the insurance risk data set.
# NOTE(review): the summary output below shows Gender/State/Risk as factors,
# so this was run with stringsAsFactors = TRUE (the pre-R-4.0 default); on
# R >= 4.0 these columns would load as character — confirm the R version.
raw <- read.csv("Risk.csv")
  1. Peek at the data
# Peek at the first six rows to sanity-check column names and value ranges.
head(raw)
##   Gender State State.Rate Height Weight      BMI Age Risk
## 1   Male    MA 0.10043368    184   67.8 20.02599  77 High
## 2   Male    VA 0.14172319    163   89.4 33.64824  82 High
## 3   Male    NY 0.09080315    170   81.2 28.09689  31  Low
## 4   Male    TN 0.11997276    175   99.7 32.55510  39  Low
## 5   Male    FL 0.11034460    184   72.1 21.29608  68 High
## 6   Male    WA 0.16292470    166   98.4 35.70910  64 High
  1. Summarize the data set
# Per-column summary: level counts for factors, five-number stats for
# numeric columns. Note the Risk target is imbalanced (576 High / 1366 Low).
summary(raw)
##     Gender        State        State.Rate         Height     
##  Female:956   CA     : 191   Min.   :0.0010   Min.   :150.0  
##  Male  :986   TX     : 169   1st Qu.:0.1103   1st Qu.:162.0  
##               FL     : 104   Median :0.1276   Median :170.0  
##               NY     :  94   Mean   :0.1381   Mean   :169.7  
##               IL     :  80   3rd Qu.:0.1443   3rd Qu.:176.0  
##               OH     :  77   Max.   :0.3181   Max.   :190.0  
##               (Other):1227                                   
##      Weight            BMI             Age          Risk     
##  Min.   : 44.10   Min.   :16.02   Min.   :18.00   High: 576  
##  1st Qu.: 68.60   1st Qu.:23.74   1st Qu.:34.00   Low :1366  
##  Median : 81.30   Median :28.06   Median :51.00              
##  Mean   : 81.16   Mean   :28.29   Mean   :50.84              
##  3rd Qu.: 93.80   3rd Qu.:32.46   3rd Qu.:68.00              
##  Max.   :116.50   Max.   :46.80   Max.   :84.00              
## 
  1. Visualize the data set
# Pairwise scatterplot matrix of all columns (base plot() on a data frame).
plot(raw)

  1. Load the corrgram package
library(corrgram)
  1. Visualize the correlations
# Correlogram: shaded grid of pairwise correlations between the columns.
corrgram(raw)

  1. Count the rows with missing values
# Count the rows that contain at least one missing value.
# NOTE(review): the original sum(is.na(raw)) counts missing CELLS, not
# rows, which contradicts the step description; complete.cases() flags
# rows with no NAs, so its negation counts rows with any NA. Both give
# 0 on this data set, so the recorded output is unchanged.
sum(!complete.cases(raw))
## [1] 0
  1. Load the dplyr package
library(dplyr)
  1. Transform, clean, engineer, and select features
# Feature selection: drop the State column and keep every other predictor
# plus the Risk target, as a plain data frame.
clean <- raw %>%
  select(-State) %>%
  as.data.frame()

2. Create Training and Test Set

  1. Load the caret package
library(caret)
  1. Set the seed to 42 to make the randomness reproducible.
# Fix the RNG seed so the data partition and CV folds are reproducible.
set.seed(42)
  1. Create row indexes for the training set
# Stratified 80/20 split on the Risk target; returns a matrix of training
# row indexes (list = FALSE) for a single partition (times = 1).
indexes <- createDataPartition(
  clean$Risk, 
  p = .8, 
  list = FALSE, 
  times = 1)
  1. Create the training set using the row indexes
# Training set: the ~80% of rows selected by createDataPartition().
train <- clean[indexes, ]
  1. Create the test set using the remaining rows
# Hold-out test set: every row not in the training indexes.
test <- clean[-indexes, ]
  1. Specify center and scale as preprocessing steps
# Preprocessing steps applied inside train(): center, then scale predictors.
# NOTE(review): this name shadows caret::preProcess(); harmless here since
# it is only passed as the preProcess argument, but a rename (e.g.
# preprocess_steps) would avoid confusion.
preProcess <- c("center", "scale")
  1. Specify training control parameters
# Resampling scheme for all model tuning below: 10-fold cross-validation.
control <- trainControl(
  method = "cv",
  number = 10)

3. Train KNN Models

  1. Train k-nearest neighbor models
# Tune k-nearest neighbors over 5 candidate values of k (tuneLength = 5),
# selecting the best k by cross-validated accuracy.
knnModel <- train(
  form = Risk ~ .,
  data = train,
  method = "knn",
  preProcess = preProcess,
  trControl = control,
  tuneLength = 5,
  metric = "Accuracy")
  1. Display model details
# Show per-k CV accuracy/kappa; k = 13 was selected on accuracy (see below).
print(knnModel)
## k-Nearest Neighbors 
## 
## 1554 samples
##    6 predictor
##    2 classes: 'High', 'Low' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1398, 1399, 1398, 1399, 1399, 1399, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9562448  0.8953109
##    7  0.9568983  0.8958922
##    9  0.9562531  0.8942799
##   11  0.9549586  0.8916590
##   13  0.9575352  0.8978399
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
  1. Plot model accuracy
# CV accuracy as a function of k.
plot(knnModel)

4. Train Decision Tree Models

  1. Train decision tree models
# Tune a CART decision tree over 5 complexity-parameter (cp) values,
# selecting by cross-validated accuracy.
treeModel <- train(
  form = Risk ~ .,
  data = train,
  method = "rpart",
  preProcess = preProcess,
  trControl = control,
  tuneLength = 5,
  metric = "Accuracy")
  1. Display model summary
# Show per-cp CV results; cp = 0 (an unpruned tree) won on accuracy here.
print(treeModel)
## CART 
## 
## 1554 samples
##    6 predictor
##    2 classes: 'High', 'Low' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1399, 1399, 1399, 1398, 1399, 1399, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.000000000  0.9768401  0.9438743
##   0.006507592  0.9742636  0.9375723
##   0.016268980  0.9632999  0.9129280
##   0.023861171  0.9549623  0.8924622
##   0.843817787  0.8211078  0.4256705
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
  1. Plot model accuracy
# CV accuracy as a function of the complexity parameter.
plot(treeModel)

5. Train Neural Network Models

  1. Specify hyperparameter-tuning grid
# Hyperparameter grid for nnet: every combination of hidden-layer size
# (3, 4, 5) and weight decay (0.1, 0.01, 0.001). expand.grid() produces
# exactly the same 9 rows (size varies fastest) as the hand-written
# data.frame it replaces, without the duplication.
neuralTuneGrid <- expand.grid(
  size = c(3, 4, 5),
  decay = c(0.1, 0.01, 0.001))
  1. Train neural network models
# Train single-hidden-layer neural networks (nnet) over the explicit
# size/decay tuning grid.
# NOTE(review): metric = "Accuracy" added for consistency with the knn and
# rpart calls above; it is also caret's selection metric here (the printed
# output says "Accuracy was used to select the optimal model"), so the
# chosen model is unchanged.
neuralModel <- train(
  form = Risk ~ .,
  data = train,
  method = "nnet",
  preProcess = preProcess,
  trControl = control,
  tuneGrid = neuralTuneGrid,
  metric = "Accuracy")
  1. Display model summary
# CV results for each size/decay pair; size = 5, decay = 0.001 selected.
print(neuralModel)
## Neural Network 
## 
## 1554 samples
##    6 predictor
##    2 classes: 'High', 'Low' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1399, 1398, 1399, 1399, 1398, 1399, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  Accuracy   Kappa    
##   3     0.001  0.9742514  0.9382290
##   3     0.010  0.9774690  0.9457605
##   3     0.100  0.9742680  0.9383723
##   4     0.001  0.9703846  0.9289725
##   4     0.010  0.9691232  0.9261795
##   4     0.100  0.9729777  0.9354895
##   5     0.001  0.9774731  0.9458424
##   5     0.010  0.9729735  0.9348942
##   5     0.100  0.9749090  0.9401572
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.001.

  1. Plot model accuracy

# CV accuracy across the size/decay tuning grid.
plot(neuralModel)

6. Evaluate the Models

  1. Combine results
# Collect the three models' CV resampling results for side-by-side comparison.
results <- resamples(list(
  knn = knnModel,
  tree = treeModel,
  nnet = neuralModel))
  1. Summarize results
# Distribution of accuracy and kappa per model across the 10 CV folds.
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: knn, tree, nnet 
## Number of resamples: 10 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## knn  0.9419355 0.9454404 0.9583333 0.9575352 0.9661911 0.9741935    0
## tree 0.9548387 0.9696630 0.9806452 0.9768401 0.9807692 1.0000000    0
## nnet 0.9612903 0.9741935 0.9774814 0.9774731 0.9855149 0.9871795    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## knn  0.8563781 0.8707380 0.8994570 0.8978399 0.9192812 0.9389403    0
## tree 0.8897246 0.9256184 0.9533986 0.9438743 0.9538047 1.0000000    0
## nnet 0.9072597 0.9367811 0.9462005 0.9458424 0.9653263 0.9691700    0
  1. Create a dot plot to compare top models
# Dot plot comparing the models' resampled accuracy and kappa.
dotplot(results)

  1. Create a box plot to compare models
# Box-and-whisker comparison of the per-fold metric distributions.
bwplot(results)

  1. Create a density plot to compare models
# Overlaid density curves of the resampled metrics, with a legend (auto.key).
densityplot(results, auto.key = TRUE)

  1. Question: Which model would you choose? Why?

7. Evaluate the Final Model

  1. Make final predictions using hold-out test set
# Score the unseen 20% hold-out rows with the chosen model (nnet).
final_predictions <- predict(
  object = neuralModel,
  newdata = test)
  1. Determine final prediction accuracy
# Cross-tabulate predictions against the true hold-out labels; yields
# accuracy, kappa, sensitivity/specificity, and related statistics.
finalMatrix <- confusionMatrix(
  data = final_predictions,
  reference = test$Risk)
  1. Inspect final prediction accuracy
# Hold-out performance; note 'High' is treated as the positive class.
print(finalMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High  110   4
##       Low     5 269
##                                           
##                Accuracy : 0.9768          
##                  95% CI : (0.9564, 0.9893)
##     No Information Rate : 0.7036          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9442          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9565          
##             Specificity : 0.9853          
##          Pos Pred Value : 0.9649          
##          Neg Pred Value : 0.9818          
##              Prevalence : 0.2964          
##          Detection Rate : 0.2835          
##    Detection Prevalence : 0.2938          
##       Balanced Accuracy : 0.9709          
##                                           
##        'Positive' Class : High            
## 

8. Deploy the Model

  1. Is Jack (from the Titanic) a high or low risk client?
# Score a single hypothetical applicant ("Jack") with the final model,
# returning class probabilities for High vs. Low risk.
jack <- data.frame(
  Gender = "Male",
  State.Rate = 0.09080315,
  Height = 183,
  Weight = 75,
  BMI = 22.4,
  Age = 20)
predict(object = neuralModel, newdata = jack, type = "prob")
##   High Low
## 1    0   1
  1. Question: Would you offer life insurance to Jack?