Lab 7A: Machine Learning (Easy)

Predict with K-Means Cluster Analysis

  1. Load the Iris data set.
# Make the built-in iris data frame available in the workspace.
data(iris)
  1. Create a scatterplot matrix colored by species.
library(RColorBrewer)

# Three colours from the ColorBrewer "Set2" scheme; indexed by species code.
palette <- brewer.pal(n = 3, name = "Set2")

# Scatterplot matrix of the first four iris columns. Each point takes the
# palette colour at the integer code of its Species factor level.
plot(iris[, 1:4], pch = 19, col = palette[as.integer(iris$Species)])

  1. View scatterplot of petal length vs width.
# Plain scatterplot: petal length on the x axis, petal width on the y axis.
# The argument expressions are kept as written because the default axis
# labels are taken from them.
plot(iris$Petal.Length, iris$Petal.Width)

  1. Color scatterplot by species.
# Same petal scatterplot, drawn with solid points coloured by species.
plot(
  iris$Petal.Length,
  iris$Petal.Width,
  pch = 19,
  col = palette[as.integer(iris$Species)])

  1. Create K-means clusters.
# Partition the four measurement columns into three clusters, trying ten
# random starting configurations.
# k-means picks its starting centres at random, so the generator is seeded
# first; without this the cluster numbering (and therefore the plot symbols
# and the cluster/species table that follow) can differ from run to run.
set.seed(42)
clusters <- kmeans(
  x = iris[, 1:4],
  centers = 3,
  nstart = 10)
  1. Plot each cluster as a shape and plot centroid of clusters.
# Petal scatterplot where colour encodes the true species and the plotting
# symbol encodes the k-means cluster id.
plot(
  iris$Petal.Length,
  iris$Petal.Width,
  pch = clusters$cluster,
  col = palette[as.integer(iris$Species)])

# Overlay the cluster centres as thick blue crosses. The two-column matrix
# supplies x (petal length) and y (petal width) in that order.
points(
  clusters$centers[, c("Petal.Length", "Petal.Width")],
  col = "blue",
  lwd = 4,
  pch = 4)

  1. View a matrix of the actual vs. predicted clusters.
# Cross-tabulate cluster id (rows, labelled "x") against the true species
# (columns, labelled "y").
table(clusters$cluster, iris$Species, dnn = c("x", "y"))
##    y
## x   setosa versicolor virginica
##   1      0          2        36
##   2      0         48        14
##   3     50          0         0

Split Data into Test and Training Set

  1. Set the seed to make randomness reproducible
# Fix the random number generator state so the sample() draw that follows
# returns the same row indexes on every run.
set.seed(42)
  1. Randomly sample 100 of 150 row indexes
# Draw 100 distinct row numbers (sampling without replacement) for the
# training set. The candidate pool is derived from nrow(iris) instead of a
# literal 150, so it always matches the data frame that is indexed next.
indexes <- sample(
    x = seq_len(nrow(iris)),
    size = 100)
  1. Create a training set from indexes
# Training set: the rows of iris selected by the sampled indexes.
train <- iris[indexes, ]
  1. Create a test set from remaining indexes
# Test set: negative indexing drops the sampled rows and keeps the rest.
test <- iris[-indexes, ]

Predict using Decision Tree

  1. Load the decision tree package
library(tree)
  1. Train tree model
# Fit a classification tree for Species, offering every other column of
# the training set as a candidate predictor.
treeModel <- tree(formula = Species ~ ., data = train)
  1. Inspect the model
# Print the fitted tree's summary: variables used, number of terminal
# nodes, residual mean deviance and misclassification error rate.
summary(treeModel)
## 
## Classification tree:
## tree(formula = Species ~ ., data = train)
## Variables actually used in tree construction:
## [1] "Petal.Length" "Petal.Width" 
## Number of terminal nodes:  4 
## Residual mean deviance:  0.05213 = 5.004 / 96 
## Misclassification error rate: 0.01 = 1 / 100
  1. Plot the model
# Draw the tree, then add its text labels to that same plot.
plot(treeModel)
text(treeModel)

  1. Plot the decision boundaries
# Species-coloured petal scatterplot with an explicit title and axis labels.
with(iris, plot(
    Petal.Length,
    Petal.Width,
    main = "Iris Petal Length vs. Width",
    xlab = "Petal Length (cm)",
    ylab = "Petal Width (cm)",
    col = palette[as.integer(Species)],
    pch = 19))

# Overlay the fitted tree's partition on the existing plot (add = TRUE)
# instead of starting a new one.
partition.tree(treeModel, label = "Species", add = TRUE)

  1. Predict with the model
# Predicted class label from the tree for every row of the test set.
treePredictions <- predict(treeModel, newdata = test, type = "class")
  1. Inspect the prediction accuracy
# Cross-tabulate predicted species (rows, labelled "x") against the actual
# species in the test set (columns, labelled "y").
table(treePredictions, test$Species, dnn = c("x", "y"))
##             y
## x            setosa versicolor virginica
##   setosa         17          0         0
##   versicolor      0         16         0
##   virginica       0          2        15
  1. Load the caret package
library(caret)
  1. Evaluate the prediction results
# Confusion matrix and accuracy statistics for the tree predictions,
# scored against the true test-set species.
confusionMatrix(treePredictions, reference = test$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         17          0         0
##   versicolor      0         16         0
##   virginica       0          2        15
## 
## Overall Statistics
##                                           
##                Accuracy : 0.96            
##                  95% CI : (0.8629, 0.9951)
##     No Information Rate : 0.36            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.94            
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                   1.00            0.8889           1.0000
## Specificity                   1.00            1.0000           0.9429
## Pos Pred Value                1.00            1.0000           0.8824
## Neg Pred Value                1.00            0.9412           1.0000
## Prevalence                    0.34            0.3600           0.3000
## Detection Rate                0.34            0.3200           0.3000
## Detection Prevalence          0.34            0.3200           0.3400
## Balanced Accuracy             1.00            0.9444           0.9714

Predict using Naive Bayes Classifier

  1. Load the e1071 package
library(e1071)
  1. Train the model
# Fit a naive Bayes classifier for Species using every other column of the
# training set.
bayesModel <- naiveBayes(formula = Species ~ ., data = train)
  1. Inspect the model
# List the components of the fitted object with their length, class and mode.
summary(bayesModel)
##         Length Class  Mode     
## apriori 3      table  numeric  
## tables  4      -none- list     
## levels  3      -none- character
## call    4      -none- call
  1. Predict with the model
# Predict a species for each test row from its first four columns only
# (the Species column itself is left out of newdata).
bayesPredictions <- predict(bayesModel, newdata = test[1:4])
  1. Evaluate the prediction results
# Confusion matrix and accuracy statistics for the naive Bayes predictions,
# scored against the true test-set species.
confusionMatrix(bayesPredictions, reference = test$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         17          0         0
##   versicolor      0         16         1
##   virginica       0          2        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.94            
##                  95% CI : (0.8345, 0.9875)
##     No Information Rate : 0.36            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9099          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                   1.00            0.8889           0.9333
## Specificity                   1.00            0.9688           0.9429
## Pos Pred Value                1.00            0.9412           0.8750
## Neg Pred Value                1.00            0.9394           0.9706
## Prevalence                    0.34            0.3600           0.3000
## Detection Rate                0.34            0.3200           0.2800
## Detection Prevalence          0.34            0.3400           0.3200
## Balanced Accuracy             1.00            0.9288           0.9381

Predict with Neural Network

  1. Load the Neural Network package
library(nnet)
  1. Train the model
# Train a neural network classifier for Species on the training set:
# 4 hidden units, weight decay of 0.0001, and at most 500 iterations.
neuralModel <- nnet(
    formula = Species ~ ., data = train,
    size = 4, decay = 0.0001, maxit = 500)
  1. Inspect the model
# Print the network layout and every fitted weight (bias, input->hidden and
# hidden->output connections).
summary(neuralModel)
## a 4-4-3 network with 35 weights
## options were - softmax modelling  decay=1e-04
##  b->h1 i1->h1 i2->h1 i3->h1 i4->h1 
##  -1.88  -6.20  12.82   1.37   0.09 
##  b->h2 i1->h2 i2->h2 i3->h2 i4->h2 
##   6.61   5.95   8.32 -10.54  -9.20 
##  b->h3 i1->h3 i2->h3 i3->h3 i4->h3 
##   0.42   0.92   1.12  -2.72  -1.33 
##  b->h4 i1->h4 i2->h4 i3->h4 i4->h4 
##  -0.41  -0.81  -1.10   2.40   1.15 
##  b->o1 h1->o1 h2->o1 h3->o1 h4->o1 
##   0.43   0.49   2.17   6.38  -6.22 
##  b->o2 h1->o2 h2->o2 h3->o2 h4->o2 
##  -3.42  -9.28  17.97  -6.43   3.07 
##  b->o3 h1->o3 h2->o3 h3->o3 h4->o3 
##   2.98   8.78 -20.14   0.05   3.15
  1. Load the NeuralNetTools package
library(NeuralNetTools)
  1. Visualize the neural network
# Draw a diagram of the fitted network.
plotnet(neuralModel)

  1. Predict with the model
# Predicted class label from the network for each test row, using only the
# first four columns as inputs.
neuralPredictions <- predict(
    neuralModel,
    type = "class",
    newdata = test[1:4])
  1. Evaluate the prediction results
# Confusion matrix and accuracy statistics for the neural network
# predictions, scored against the true test-set species.
# predict() on an nnet model with type = "class" returns a character vector,
# while confusionMatrix() requires `data` and `reference` to be factors with
# the same levels. Coerce the predictions using the reference's levels so
# the call works even when a class is never predicted.
confusionMatrix(
    data = factor(neuralPredictions, levels = levels(test$Species)),
    reference = test$Species)
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         17          0         0
##   versicolor      0         17         0
##   virginica       0          1        15
## 
## Overall Statistics
##                                           
##                Accuracy : 0.98            
##                  95% CI : (0.8935, 0.9995)
##     No Information Rate : 0.36            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.97            
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                   1.00            0.9444           1.0000
## Specificity                   1.00            1.0000           0.9714
## Pos Pred Value                1.00            1.0000           0.9375
## Neg Pred Value                1.00            0.9697           1.0000
## Prevalence                    0.34            0.3600           0.3000
## Detection Rate                0.34            0.3400           0.3000
## Detection Prevalence          0.34            0.3400           0.3200
## Balanced Accuracy             1.00            0.9722           0.9857
  1. Set working directory
# NOTE(review): hard-coded absolute path (looks Windows-specific); this call
# errors if the directory does not exist on the machine running the lab.
# The relative file name given to save() afterwards resolves against it.
setwd("C:/Workshop/Data")
  1. Save the tree model
# Write the fitted tree object to Tree.RData in the current working directory.
save(treeModel, file = "Tree.RData")