1. Explore the Data

  1. Set the working directory to “C:/Workshop/Data”
# NOTE(review): setwd() inside a script hurts portability; acceptable for a
# guided workshop, but prefer relative paths or a project-root helper in
# production code.
setwd("C:/Workshop/Data")
  1. Load the “Risk.csv” file into a data frame called “policies”
# Load the policy data. stringsAsFactors = TRUE ensures Gender, State, and
# Risk are read as factors, which the rest of the document relies on
# (palette[as.numeric(policies$Risk)] and confusionMatrix() both need
# factors). On R >= 4.0 the default changed to FALSE, which would silently
# produce NA colors and break the confusion matrices; on older R this
# argument is a no-op.
policies <- read.csv("Risk.csv", stringsAsFactors = TRUE)
  1. Inspect the data with the head function
head(policies)
##   Gender State State.Rate Height Weight      BMI Age Risk
## 1   Male    MA 0.10043368    184   67.8 20.02599  77 High
## 2   Male    VA 0.14172319    163   89.4 33.64824  82 High
## 3   Male    NY 0.09080315    170   81.2 28.09689  31  Low
## 4   Male    TN 0.11997276    175   99.7 32.55510  39  Low
## 5   Male    FL 0.11034460    184   72.1 21.29608  68 High
## 6   Male    WA 0.16292470    166   98.4 35.70910  64 High
  1. Load the RColorBrewer package
library(RColorBrewer)
  1. Create a Set2 color palette with 3 colors
# Set2 is a qualitative ColorBrewer palette; 3 is the minimum number of
# colors brewer.pal() accepts. Only the first two are used here (Risk has
# two levels: High/Low).
palette <- brewer.pal(3, "Set2")
  1. Create a scatterplot matrix colored by risk
# Scatterplot matrix of every pair of columns, one point per policy.
# as.numeric() on the Risk factor gives 1/2, which indexes into the
# palette so the two risk classes get distinct colors; pch = 19 draws
# solid dots.
plot(
  x = policies, 
  col = palette[as.numeric(policies$Risk)],
  pch = 19)

  1. Create a scatterplot of Age vs BMI colored by risk
# Single scatterplot of Age (x) vs BMI (y), colored by Risk class the
# same way as the matrix above -- used to eyeball linear separability.
plot(
  x = policies$Age, 
  y = policies$BMI, 
  col = palette[as.numeric(policies$Risk)],
  pch = 19)

  1. Question: Does Risk appear to be linearly separable (i.e. could you draw a straight line that perfectly divides the two colors)?

2. Create Training and Test Set

  1. Set seed to 42 to make randomness reproducible
set.seed(42)
  1. Load the caret package
library(caret)
  1. Create the training set indexes
# caret::createDataPartition performs a *stratified* split: the High/Low
# proportions of Risk are preserved in the 80% training sample.
# list = FALSE returns a matrix of row indexes rather than a list.
indexes <- createDataPartition(
  y = policies$Risk,
  p = 0.80,
  list = FALSE)
  1. Create the training set from the indexes
train <- policies[indexes, ]  # rows sampled for training (~80%)
  1. Create the test set from the remaining rows
test <- policies[-indexes, ]  # negative indexing keeps the held-out ~20%
  1. Verify the number of rows in the training set
nrow(train)
## [1] 1554
  1. Verify the number of rows in the test set
nrow(test)
## [1] 388

3. Predict with a k-Nearest Neighbors Classifier

  1. Train a knn model to predict risk with k = 3
    Note: Use the formula “Risk ~ Age + BMI + Gender + State.Rate”
# k-nearest neighbors (caret::knn3) with k = 3 neighbors per vote.
# NOTE(review): knn3 measures distance on the raw predictor scales; Age,
# BMI, and State.Rate are not standardized here, so wider-ranged variables
# dominate the distance -- acceptable for the workshop, but worth knowing.
knnModel <- knn3(
  formula = Risk ~ Age + BMI + Gender + State.Rate,
  data = train,
  k = 3)
  1. Predict the test set with the model
# type = "class" returns the predicted factor levels (High/Low) instead
# of per-class probabilities
knnPredictions <- predict(
  object = knnModel,
  newdata = test,
  type = "class")
  1. Summarize the prediction results
# Cross-tabulate predictions against actual labels. The names "x" and "y"
# are not formal arguments of table(); named inputs are simply used as
# dimension labels, which is why the output below is headed "x"/"y".
table(
  x = knnPredictions, 
  y = test$Risk)
##       y
## x      High Low
##   High  112   6
##   Low     3 267
  1. Create a confusion matrix
# caret::confusionMatrix adds accuracy, kappa, sensitivity, etc. on top of
# the raw cross-tab; data = predictions, reference = ground-truth labels
knnMatrix <- confusionMatrix(
  data = knnPredictions, 
  reference = test$Risk)
  1. Inspect the results
print(knnMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High  112   6
##       Low     3 267
##                                           
##                Accuracy : 0.9768          
##                  95% CI : (0.9564, 0.9893)
##     No Information Rate : 0.7036          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9448          
##  Mcnemar's Test P-Value : 0.505           
##                                           
##             Sensitivity : 0.9739          
##             Specificity : 0.9780          
##          Pos Pred Value : 0.9492          
##          Neg Pred Value : 0.9889          
##              Prevalence : 0.2964          
##          Detection Rate : 0.2887          
##    Detection Prevalence : 0.3041          
##       Balanced Accuracy : 0.9760          
##                                           
##        'Positive' Class : High            
## 
  1. Question: What is the accuracy of the KNN classifier?

4. Predict with a Decision Tree Classifier

  1. Load the tree package
library(tree)
  1. Train a tree model to predict Risk
    Note: Use the formula “Risk ~ Age + BMI + Gender + State.Rate”
# Fit a binary decision tree (package "tree"); because Risk is a factor,
# tree() automatically builds a classification (not regression) tree.
treeModel <- tree(
  formula = Risk ~ Age + BMI + Gender + State.Rate,
  data = train)
  1. Inspect the model
summary(treeModel)
## 
## Classification tree:
## tree(formula = Risk ~ Age + BMI + Gender + State.Rate, data = train)
## Variables actually used in tree construction:
## [1] "Age"    "BMI"    "Gender"
## Number of terminal nodes:  7 
## Residual mean deviance:  0.09994 = 154.6 / 1547 
## Misclassification error rate: 0.01673 = 26 / 1554
  1. Plot the tree model and add text labels
plot(treeModel)
text(treeModel)

  1. Predict test set with the model
# type = "class" returns predicted factor levels, matching the knn step
treePredictions <- predict(
  object = treeModel,
  newdata = test,
  type = "class")
  1. Create a confusion matrix
# Confusion matrix for the tree model: data = predictions,
# reference = ground-truth labels
treeMatrix <- confusionMatrix(
  data = treePredictions, 
  reference = test$Risk)
  1. Inspect the results
print(treeMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High  112   5
##       Low     3 268
##                                           
##                Accuracy : 0.9794          
##                  95% CI : (0.9598, 0.9911)
##     No Information Rate : 0.7036          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9508          
##  Mcnemar's Test P-Value : 0.7237          
##                                           
##             Sensitivity : 0.9739          
##             Specificity : 0.9817          
##          Pos Pred Value : 0.9573          
##          Neg Pred Value : 0.9889          
##              Prevalence : 0.2964          
##          Detection Rate : 0.2887          
##    Detection Prevalence : 0.3015          
##       Balanced Accuracy : 0.9778          
##                                           
##        'Positive' Class : High            
## 
  1. Question: What is the accuracy of the decision tree classifier?

5. Predict with a Neural Network Classifier

  1. Load nnet package
library(nnet)
  1. Train a neural network model
    Note: Use the formula “Risk ~ Age + BMI + Gender + State.Rate”
    Note: Use size of 10, decay of 0.0001, and maxit of 500
# Single-hidden-layer network: 10 hidden units, a small L2 weight penalty
# (decay = 0.0001) for regularization, and up to 500 optimizer iterations.
# With a two-level factor response, nnet fits one logistic output unit via
# entropy -- hence the "a 4-10-1 network" line in the summary below.
neuralModel <- nnet(
  formula = Risk ~ Age + BMI + Gender + State.Rate,
  data = train,
  size = 10,
  decay = 0.0001,
  maxit = 500)
  1. Inspect the model
summary(neuralModel)
## a 4-10-1 network with 61 weights
## options were - entropy fitting  decay=1e-04
##  b->h1 i1->h1 i2->h1 i3->h1 i4->h1 
##  19.84  -0.09  -0.44   0.05 -19.30 
##  b->h2 i1->h2 i2->h2 i3->h2 i4->h2 
##  -0.04   0.18   0.19   0.01   0.04 
##  b->h3 i1->h3 i2->h3 i3->h3 i4->h3 
##   0.10   2.53   3.33   0.07   0.05 
##  b->h4 i1->h4 i2->h4 i3->h4 i4->h4 
##  -0.04   0.23   0.19  -0.04   0.03 
##  b->h5 i1->h5 i2->h5 i3->h5 i4->h5 
##   0.13  -4.76   6.15   2.21   0.01 
##  b->h6 i1->h6 i2->h6 i3->h6 i4->h6 
##  13.92  -0.12  -0.16  -1.52  -7.19 
##  b->h7 i1->h7 i2->h7 i3->h7 i4->h7 
##  -0.02   0.18   0.04   0.02   0.02 
##  b->h8 i1->h8 i2->h8 i3->h8 i4->h8 
##  75.30  -0.98  -0.18 -20.59  70.02 
##  b->h9 i1->h9 i2->h9 i3->h9 i4->h9 
##   0.24   6.11  -8.92  11.50   4.71 
##  b->h10 i1->h10 i2->h10 i3->h10 i4->h10 
##   -5.56    0.16    0.07   -3.28  -13.27 
##   b->o  h1->o  h2->o  h3->o  h4->o  h5->o  h6->o  h7->o  h8->o  h9->o 
##   6.65 -43.26   6.77   6.00   6.59 -11.45  98.18   6.65 -19.41 -12.89 
## h10->o 
## -42.81
  1. Load neural net tools
library(NeuralNetTools)
  1. Plot the neural network
plotnet(neuralModel, alpha=0.5)

  1. Predict test set with the model
# NOTE: unlike knn3/tree, predict() on an nnet model with type = "class"
# returns a *character* vector rather than a factor, which is why the
# confusion-matrix step wraps it in as.factor().
neuralPredictions <- predict(
  object = neuralModel,
  newdata = test,
  type = "class")
  1. Create a confusion matrix
# as.factor() is required here because predict.nnet returned characters;
# confusionMatrix needs both inputs to be factors with matching levels
neuralMatrix <- confusionMatrix(
  data = as.factor(neuralPredictions), 
  reference = test$Risk)
  1. Inspect results
print(neuralMatrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction High Low
##       High  113   7
##       Low     2 266
##                                           
##                Accuracy : 0.9768          
##                  95% CI : (0.9564, 0.9893)
##     No Information Rate : 0.7036          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9451          
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.9826          
##             Specificity : 0.9744          
##          Pos Pred Value : 0.9417          
##          Neg Pred Value : 0.9925          
##              Prevalence : 0.2964          
##          Detection Rate : 0.2912          
##    Detection Prevalence : 0.3093          
##       Balanced Accuracy : 0.9785          
##                                           
##        'Positive' Class : High            
## 
  1. Question: What is the accuracy of the neural network classifier?

6. Evaluate the Classifiers

  1. Compare the accuracy of all 3 classifiers
# overall[1] extracts the named "Accuracy" element from each
# confusionMatrix result's overall-statistics vector
print(knnMatrix$overall[1])
##  Accuracy 
## 0.9768041
print(treeMatrix$overall[1])
##  Accuracy 
## 0.9793814
print(neuralMatrix$overall[1])
##  Accuracy 
## 0.9768041
  1. Question: Which model would you choose? Why?