1. Explore the Data

  1. Set working directory
# Point the session at the workshop data folder so the CSV below resolves.
# NOTE(review): setwd() with an absolute path makes the script machine-
# specific; prefer relative paths (or here::here()) for portability.
setwd("C:/Workshop/Data")
  1. Read CSV file
# Load the insurance risk data set.
# NOTE(review): the summary output below shows Gender/State/Risk as factors,
# so this was run with stringsAsFactors = TRUE (the pre-R-4.0 default); on
# R >= 4.0 these columns would load as character — confirm the R version.
raw <- read.csv("Risk.csv")
  1. Peek at the data
# Peek at the first six rows to sanity-check column names and value ranges.
head(raw)
##   Gender State State.Rate Height Weight      BMI Age Risk
## 1   Male    MA 0.10043368    184   67.8 20.02599  77 High
## 2   Male    VA 0.14172319    163   89.4 33.64824  82 High
## 3   Male    NY 0.09080315    170   81.2 28.09689  31  Low
## 4   Male    TN 0.11997276    175   99.7 32.55510  39  Low
## 5   Male    FL 0.11034460    184   72.1 21.29608  68 High
## 6   Male    WA 0.16292470    166   98.4 35.70910  64 High
  1. Summarize the data set
# Per-column summary: level counts for factors, five-number stats for
# numeric columns. Note the Risk target is imbalanced (576 High / 1366 Low).
summary(raw)
##     Gender        State        State.Rate         Height     
##  Female:956   CA     : 191   Min.   :0.0010   Min.   :150.0  
##  Male  :986   TX     : 169   1st Qu.:0.1103   1st Qu.:162.0  
##               FL     : 104   Median :0.1276   Median :170.0  
##               NY     :  94   Mean   :0.1381   Mean   :169.7  
##               IL     :  80   3rd Qu.:0.1443   3rd Qu.:176.0  
##               OH     :  77   Max.   :0.3181   Max.   :190.0  
##               (Other):1227                                   
##      Weight            BMI             Age          Risk     
##  Min.   : 44.10   Min.   :16.02   Min.   :18.00   High: 576  
##  1st Qu.: 68.60   1st Qu.:23.74   1st Qu.:34.00   Low :1366  
##  Median : 81.30   Median :28.06   Median :51.00              
##  Mean   : 81.16   Mean   :28.29   Mean   :50.84              
##  3rd Qu.: 93.80   3rd Qu.:32.46   3rd Qu.:68.00              
##  Max.   :116.50   Max.   :46.80   Max.   :84.00              
## 
  1. Visualize the data set
# Pairwise scatterplot matrix of all columns (base plot() on a data frame).
plot(raw)

  1. Load the corrgram package
library(corrgram)
  1. Visualize the correlations
# Correlogram: shaded grid of pairwise correlations between the columns.
corrgram(raw)

  1. Count the rows with missing values
# Count the rows that contain at least one missing value.
# NOTE(review): the original sum(is.na(raw)) counts missing CELLS, not
# rows, which contradicts the step description; complete.cases() flags
# rows with no NAs, so its negation counts rows with any NA. Both give
# 0 on this data set, so the recorded output is unchanged.
sum(!complete.cases(raw))
## [1] 0
  1. Load the dplyr package
library(dplyr)
  1. Transform, clean, engineer, and select features
# Feature selection: drop the State column and keep every other predictor
# plus the Risk target, as a plain data frame.
clean <- raw %>%
  select(-State) %>%
  as.data.frame()

2. Create Training and Test Set

  1. Load the caret package
library(caret)
  1. Set the seed to 42 to make the randomness reproducible.
# Fix the RNG seed so the data partition and CV folds are reproducible.
set.seed(42)
  1. Create row indexes for the training set
# Stratified 80/20 split on the Risk target; returns a matrix of training
# row indexes (list = FALSE) for a single partition (times = 1).
indexes <- createDataPartition(
  clean$Risk, 
  p = .8, 
  list = FALSE, 
  times = 1)
  1. Create the training set using the row indexes
# Training set: the ~80% of rows selected by createDataPartition().
train <- clean[indexes, ]
  1. Create the test set using the remaining rows
# Hold-out test set: every row not in the training indexes.
test <- clean[-indexes, ]
  1. Specify center and scale as preprocessing steps
# Preprocessing steps applied inside train(): center, then scale predictors.
# NOTE(review): this name shadows caret::preProcess(); harmless here since
# it is only passed as the preProcess argument, but a rename (e.g.
# preprocess_steps) would avoid confusion.
preProcess <- c("center", "scale")
  1. Specify training control parameters
# Resampling scheme for all model tuning below: 10-fold cross-validation.
control <- trainControl(
  method = "cv",
  number = 10)

3. Train KNN Models

  1. Train k-nearest neighbor models
# Tune k-nearest neighbors over 5 candidate values of k (tuneLength = 5),
# selecting the best k by cross-validated accuracy.
knnModel <- train(
  form = Risk ~ .,
  data = train,
  method = "knn",
  preProcess = preProcess,
  trControl = control,
  tuneLength = 5,
  metric = "Accuracy")
  1. Display model details
# Show per-k CV accuracy/kappa; k = 13 was selected on accuracy (see below).
print(knnModel)
## k-Nearest Neighbors 
## 
## 1554 samples
##    6 predictor
##    2 classes: 'High', 'Low' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1398, 1399, 1398, 1399, 1399, 1399, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9562448  0.8953109
##    7  0.9568983  0.8958922
##    9  0.9562531  0.8942799
##   11  0.9549586  0.8916590
##   13  0.9575352  0.8978399
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
  1. Plot model accuracy
# CV accuracy as a function of k.
plot(knnModel)

4. Train Decision Tree Models

  1. Train decision tree models
# Tune a CART decision tree over 5 complexity-parameter (cp) values,
# selecting by cross-validated accuracy.
treeModel <- train(
  form = Risk ~ .,
  data = train,
  method = "rpart",
  preProcess = preProcess,
  trControl = control,
  tuneLength = 5,
  metric = "Accuracy")
  1. Display model summary
# Show per-cp CV results; cp = 0 (an unpruned tree) won on accuracy here.
print(treeModel)
## CART 
## 
## 1554 samples
##    6 predictor
##    2 classes: 'High', 'Low' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1399, 1399, 1399, 1398, 1399, 1399, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.000000000  0.9768401  0.9438743
##   0.006507592  0.9742636  0.9375723
##   0.016268980  0.9632999  0.9129280
##   0.023861171  0.9549623  0.8924622
##   0.843817787  0.8211078  0.4256705
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
  1. Plot model accuracy
# CV accuracy as a function of the complexity parameter.
plot(treeModel)

5. Train Neural Network Models

  1. Specify hyperparameter-tuning grid
# Hyperparameter grid for nnet: every combination of hidden-layer size
# (3, 4, 5) and weight decay (0.1, 0.01, 0.001). expand.grid() produces
# exactly the same 9 rows (size varies fastest) as the hand-written
# data.frame it replaces, without the duplication.
neuralTuneGrid <- expand.grid(
  size = c(3, 4, 5),
  decay = c(0.1, 0.01, 0.001))
  1. Train neural network models
# Train single-hidden-layer neural networks (nnet) over the explicit
# size/decay tuning grid.
# NOTE(review): metric = "Accuracy" added for consistency with the knn and
# rpart calls above; it is also caret's selection metric here (the printed
# output says "Accuracy was used to select the optimal model"), so the
# chosen model is unchanged.
neuralModel <- train(
  form = Risk ~ .,
  data = train,
  method = "nnet",
  preProcess = preProcess,
  trControl = control,
  tuneGrid = neuralTuneGrid,
  metric = "Accuracy")
  1. Display model summary
# CV results for each size/decay pair; size = 5, decay = 0.001 selected.
print(neuralModel)
## Neural Network 
## 
## 1554 samples
##    6 predictor
##    2 classes: 'High', 'Low' 
## 
## Pre-processing: centered (6), scaled (6) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 1399, 1398, 1399, 1399, 1398, 1399, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  Accuracy   Kappa    
##   3     0.001  0.9742514  0.9382290
##   3     0.010  0.9774690  0.9457605
##   3     0.100  0.9742680  0.9383723
##   4     0.001  0.9703846  0.9289725
##   4     0.010  0.9691232  0.9261795
##   4     0.100  0.9729777  0.9354895
##   5     0.001  0.9774731  0.9458424
##   5     0.010  0.9729735  0.9348942
##   5     0.100  0.9749090  0.9401572
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.001.

  1. Plot model accuracy

# CV accuracy across the size/decay tuning grid.
plot(neuralModel)

6. Evaluate the Models

  1. Combine results
# Collect the three models' CV resampling results for side-by-side comparison.
results <- resamples(list(
  knn = knnModel,
  tree = treeModel,
  nnet = neuralModel))
  1. Summarize results
# Distribution of accuracy and kappa per model across the 10 CV folds.
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: knn, tree, nnet 
## Number of resamples: 10 
## 
## Accuracy 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## knn  0.9419355 0.9454404 0.9583333 0.9575352 0.9661911 0.9741935    0
## tree 0.9548387 0.9696630 0.9806452 0.9768401 0.9807692 1.0000000    0
## nnet 0.9612903 0.9741935 0.9774814 0.9774731 0.9855149 0.9871795    0
## 
## Kappa 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## knn  0.8563781 0.8707380 0.8994570 0.8978399 0.9192812 0.9389403    0
## tree 0.8897246 0.9256184 0.9533986 0.9438743 0.9538047 1.0000000    0
## nnet 0.9072597 0.9367811 0.9462005 0.9458424 0.9653263 0.9691700    0
  1. Create a dot plot to compare top models
# Dot plot comparing the models' resampled accuracy and kappa.
dotplot(results)

  1. Create a box plot to compare models
# Box-and-whisker comparison of the per-fold metric distributions.
bwplot(results)

  1. Create a density plot to compare models
# Overlaid density curves of the resampled metrics, with a legend (auto.key).
densityplot(results, auto.key = TRUE)

  1. Question: Which model would you choose? Why?

7. Evaluate the Final Model

  1. Make final predictions using hold-out test set
# Score the unseen 20% hold-out rows with the chosen model (nnet).
final_predictions <- predict(
  object = neuralModel,
  newdata = test)
  1. Determine final prediction accuracy
# Cross-tabulate predictions against the true hold-out labels; yields
# accuracy, kappa, sensitivity/specificity, and related statistics.
finalMatrix <- confusionMatrix(
  data = final_predictions,
  reference = test$Risk)
  1. Inspect final prediction accuracy
# Hold-out performance; note 'High' is treated as the positive class.
print(finalMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High  110   4
##       Low     5 269
##                                           
##                Accuracy : 0.9768          
##                  95% CI : (0.9564, 0.9893)
##     No Information Rate : 0.7036          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9442          
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9565          
##             Specificity : 0.9853          
##          Pos Pred Value : 0.9649          
##          Neg Pred Value : 0.9818          
##              Prevalence : 0.2964          
##          Detection Rate : 0.2835          
##    Detection Prevalence : 0.2938          
##       Balanced Accuracy : 0.9709          
##                                           
##        'Positive' Class : High            
## 

8. Deploy the Model

  1. Is Jack (from the Titanic) a high or low risk client?
# Score a single hypothetical applicant ("Jack") with the final model,
# returning class probabilities for High vs. Low risk.
jack <- data.frame(
  Gender = "Male",
  State.Rate = 0.09080315,
  Height = 183,
  Weight = 75,
  BMI = 22.4,
  Age = 20)
predict(object = neuralModel, newdata = jack, type = "prob")
##   High Low
## 1    0   1
  1. Question: Would you offer life insurance to Jack?