1. Explore the Data
- Set the working directory to “C:/Workshop/Data”
setwd("C:/Workshop/Data")
- Load the “Risk.csv” file into a data frame called “policies”
Note: In R 4.0 and later, read.csv no longer converts strings to factors by default; stringsAsFactors = TRUE keeps Gender, State, and Risk as factors, which the models below expect
policies <- read.csv("Risk.csv", stringsAsFactors = TRUE)
- Inspect the data with the head function
head(policies)
## Gender State State.Rate Height Weight BMI Age Risk
## 1 Male MA 0.10043368 184 67.8 20.02599 77 High
## 2 Male VA 0.14172319 163 89.4 33.64824 82 High
## 3 Male NY 0.09080315 170 81.2 28.09689 31 Low
## 4 Male TN 0.11997276 175 99.7 32.55510 39 Low
## 5 Male FL 0.11034460 184 72.1 21.29608 68 High
## 6 Male WA 0.16292470 166 98.4 35.70910 64 High
- Load the RColorBrewer package
library(RColorBrewer)
- Create a Set2 color palette with 3 colors
palette <- brewer.pal(3, "Set2")
- Create a scatterplot matrix colored by risk
plot(
  x = policies,
  col = palette[as.numeric(policies$Risk)],
  pch = 19)
- Create a scatterplot of Age vs BMI colored by risk
plot(
  x = policies$Age,
  y = policies$BMI,
  col = palette[as.numeric(policies$Risk)],
  pch = 19)
- Question: Does Risk appear to be linearly separable (i.e. could you draw a straight line that perfectly divides the two colors)?
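When the two colors overlap, a legend makes the plot much easier to read. A minimal sketch, assuming Risk is a factor whose level order matches the palette:

```r
# Add a legend mapping each palette color to a Risk level
# (assumes Risk was read as a factor, e.g. via stringsAsFactors = TRUE)
legend(
  x = "topright",
  legend = levels(policies$Risk),
  col = palette,
  pch = 19)
```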
2. Create Training and Test Sets
- Set the seed to 42 to make the randomness reproducible
set.seed(42)
- Load the caret package
library(caret)
- Create the training set indexes
indexes <- createDataPartition(
  y = policies$Risk,
  p = 0.80,
  list = FALSE)
- Create the training set from the indexes
train <- policies[indexes, ]
- Create the test set from the remaining rows
test <- policies[-indexes, ]
- Verify the number of rows in the training set
nrow(train)
## [1] 1554
- Verify the number of rows in the test set
nrow(test)
## [1] 388
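Because createDataPartition samples within each class, the High/Low mix should be roughly the same in both sets. A quick sketch to confirm this, reusing the train and test objects created above:

```r
# Compare class proportions in the training and test sets;
# stratified sampling should make these nearly identical
round(prop.table(table(train$Risk)), 3)
round(prop.table(table(test$Risk)), 3)
```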
3. Predict with a k-Nearest Neighbors Classifier
- Train a k-NN model to predict Risk with k = 3
Note: Use the formula “Risk ~ Age + BMI + Gender + State.Rate”
knnModel <- knn3(
  formula = Risk ~ Age + BMI + Gender + State.Rate,
  data = train,
  k = 3)
- Predict the test set with the model
knnPredictions <- predict(
  object = knnModel,
  newdata = test,
  type = "class")
- Summarize the prediction results
table(
  x = knnPredictions,
  y = test$Risk)
## y
## x High Low
## High 112 6
## Low 3 267
- Create a confusion matrix
knnMatrix <- confusionMatrix(
  data = knnPredictions,
  reference = test$Risk)
- Inspect the results
print(knnMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 112 6
## Low 3 267
##
## Accuracy : 0.9768
## 95% CI : (0.9564, 0.9893)
## No Information Rate : 0.7036
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9448
## Mcnemar's Test P-Value : 0.505
##
## Sensitivity : 0.9739
## Specificity : 0.9780
## Pos Pred Value : 0.9492
## Neg Pred Value : 0.9889
## Prevalence : 0.2964
## Detection Rate : 0.2887
## Detection Prevalence : 0.3041
## Balanced Accuracy : 0.9760
##
## 'Positive' Class : High
##
- Question: What is the accuracy of the KNN classifier?
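The choice of k = 3 is a starting guess. A quick way to sanity-check it is to score a few odd values of k, reusing the same formula and the train/test objects above. This sketch scores on the test set for simplicity; in practice you would tune k with cross-validation on the training set instead:

```r
# Try several odd values of k and print the test-set accuracy of each
for (k in c(1, 3, 5, 7, 9)) {
  model <- knn3(Risk ~ Age + BMI + Gender + State.Rate, data = train, k = k)
  preds <- predict(model, newdata = test, type = "class")
  cat("k =", k, "accuracy =", mean(preds == test$Risk), "\n")
}
```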
4. Predict with a Decision Tree Classifier
- Load the tree package
library(tree)
- Train a tree model to predict Risk
Note: Use the formula “Risk ~ Age + BMI + Gender + State.Rate”
treeModel <- tree(
  formula = Risk ~ Age + BMI + Gender + State.Rate,
  data = train)
- Inspect the model
summary(treeModel)
##
## Classification tree:
## tree(formula = Risk ~ Age + BMI + Gender + State.Rate, data = train)
## Variables actually used in tree construction:
## [1] "Age" "BMI" "Gender"
## Number of terminal nodes: 7
## Residual mean deviance: 0.09994 = 154.6 / 1547
## Misclassification error rate: 0.01673 = 26 / 1554
- Plot the tree model and add text labels
plot(treeModel)
text(treeModel)
- Predict test set with the model
treePredictions <- predict(
  object = treeModel,
  newdata = test,
  type = "class")
- Create a confusion matrix
treeMatrix <- confusionMatrix(
  data = treePredictions,
  reference = test$Risk)
- Inspect the results
print(treeMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 112 5
## Low 3 268
##
## Accuracy : 0.9794
## 95% CI : (0.9598, 0.9911)
## No Information Rate : 0.7036
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9508
## Mcnemar's Test P-Value : 0.7237
##
## Sensitivity : 0.9739
## Specificity : 0.9817
## Pos Pred Value : 0.9573
## Neg Pred Value : 0.9889
## Prevalence : 0.2964
## Detection Rate : 0.2887
## Detection Prevalence : 0.3015
## Balanced Accuracy : 0.9778
##
## 'Positive' Class : High
##
- Question: What is the accuracy of the decision tree classifier?
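The tree package can also estimate, via cross-validation, whether a smaller tree would generalize just as well. A sketch using cv.tree and prune.misclass from the same package (object names here are hypothetical):

```r
# Cross-validate tree size using misclassification error
cvResults <- cv.tree(treeModel, FUN = prune.misclass)

# Prune to the size with the lowest cross-validated error and re-plot
bestSize <- cvResults$size[which.min(cvResults$dev)]
prunedModel <- prune.misclass(treeModel, best = bestSize)
plot(prunedModel)
text(prunedModel)
```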
5. Predict with a Neural Network Classifier
- Load nnet package
library(nnet)
- Train a neural network model
Note: Use the formula “Risk ~ Age + BMI + Gender + State.Rate”
Note: Use size of 10, decay of 0.0001, and maxit of 500
neuralModel <- nnet(
  formula = Risk ~ Age + BMI + Gender + State.Rate,
  data = train,
  size = 10,
  decay = 0.0001,
  maxit = 500)
- Inspect the model
summary(neuralModel)
## a 4-10-1 network with 61 weights
## options were - entropy fitting decay=1e-04
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1
## 19.84 -0.09 -0.44 0.05 -19.30
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2
## -0.04 0.18 0.19 0.01 0.04
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3
## 0.10 2.53 3.33 0.07 0.05
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4
## -0.04 0.23 0.19 -0.04 0.03
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5
## 0.13 -4.76 6.15 2.21 0.01
## b->h6 i1->h6 i2->h6 i3->h6 i4->h6
## 13.92 -0.12 -0.16 -1.52 -7.19
## b->h7 i1->h7 i2->h7 i3->h7 i4->h7
## -0.02 0.18 0.04 0.02 0.02
## b->h8 i1->h8 i2->h8 i3->h8 i4->h8
## 75.30 -0.98 -0.18 -20.59 70.02
## b->h9 i1->h9 i2->h9 i3->h9 i4->h9
## 0.24 6.11 -8.92 11.50 4.71
## b->h10 i1->h10 i2->h10 i3->h10 i4->h10
## -5.56 0.16 0.07 -3.28 -13.27
## b->o h1->o h2->o h3->o h4->o h5->o h6->o h7->o h8->o h9->o
## 6.65 -43.26 6.77 6.00 6.59 -11.45 98.18 6.65 -19.41 -12.89
## h10->o
## -42.81
- Load the NeuralNetTools package
library(NeuralNetTools)
- Plot the neural network
plotnet(neuralModel, alpha = 0.5)
- Predict test set with the model
neuralPredictions <- predict(
  object = neuralModel,
  newdata = test,
  type = "class")
- Create a confusion matrix
Note: nnet returns class predictions as character strings, so they must be converted to a factor before calling confusionMatrix
neuralMatrix <- confusionMatrix(
  data = as.factor(neuralPredictions),
  reference = test$Risk)
- Inspect results
print(neuralMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 113 7
## Low 2 266
##
## Accuracy : 0.9768
## 95% CI : (0.9564, 0.9893)
## No Information Rate : 0.7036
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9451
## Mcnemar's Test P-Value : 0.1824
##
## Sensitivity : 0.9826
## Specificity : 0.9744
## Pos Pred Value : 0.9417
## Neg Pred Value : 0.9925
## Prevalence : 0.2964
## Detection Rate : 0.2912
## Detection Prevalence : 0.3093
## Balanced Accuracy : 0.9785
##
## 'Positive' Class : High
##
- Question: What is the accuracy of the neural network classifier?
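Neural networks are sensitive to the scale of their inputs, and Age, BMI, and State.Rate live on very different scales, which can slow or destabilize training. caret's preProcess can standardize the numeric predictors using training-set statistics. A sketch with hypothetical object names (the nnet call above would then be trained on trainScaled instead of train):

```r
# Learn centering/scaling parameters from the training set only
numCols <- c("Age", "BMI", "State.Rate")
scaler <- preProcess(train[, numCols], method = c("center", "scale"))

# Apply the same transformation to both sets
trainScaled <- train
testScaled <- test
trainScaled[, numCols] <- predict(scaler, train[, numCols])
testScaled[, numCols] <- predict(scaler, test[, numCols])
```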
6. Evaluate the Classifiers
- Compare the accuracy of all 3 classifiers
print(knnMatrix$overall[1])
## Accuracy
## 0.9768041
print(treeMatrix$overall[1])
## Accuracy
## 0.9793814
print(neuralMatrix$overall[1])
## Accuracy
## 0.9768041
- Question: Which model would you choose? Why?
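To compare the three models side by side, the accuracies printed above can be collected into one named vector (a sketch using the confusion-matrix objects created earlier):

```r
# Gather test-set accuracies into a single named vector and sort them
accuracies <- c(
  knn  = unname(knnMatrix$overall["Accuracy"]),
  tree = unname(treeMatrix$overall["Accuracy"]),
  nnet = unname(neuralMatrix$overall["Accuracy"]))
sort(accuracies, decreasing = TRUE)
```

Accuracy alone may not decide the question; when the cost of missing a High-risk policy differs from the cost of a false alarm, the sensitivity and specificity reported by confusionMatrix matter as well.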