Lab 7A: Machine Learning (Easy)
Predict with K-Means Cluster Analysis
- Load the Iris data set.
# Load the bundled iris data frame into the workspace; the plots and models
# in the rest of this lab all read from it.
data("iris")
- Create a scatterplot matrix colored by species.
library(RColorBrewer)
# Three-colour "Set2" palette. Indexing it by the integer codes of the Species
# factor gives each species its own colour in the scatterplot matrix of the
# first four columns.
palette <- brewer.pal(n = 3, name = "Set2")
plot(iris[1:4], col = palette[as.integer(iris$Species)], pch = 19)
- View scatterplot of petal length vs width.
# Petal length against petal width. Explicit axis titles replace the default
# deparsed expressions ("iris$Petal.Length") and match the labels used by the
# decision-boundary plot later in this lab.
plot(
  x = iris$Petal.Length,
  y = iris$Petal.Width,
  xlab = "Petal Length (cm)",
  ylab = "Petal Width (cm)")
- Color scatterplot by species.
# Same petal scatterplot, with solid points (pch = 19) coloured by species.
# Explicit axis titles replace the default deparsed expressions and match the
# decision-boundary plot later in this lab.
plot(
  x = iris$Petal.Length,
  y = iris$Petal.Width,
  col = palette[as.numeric(iris$Species)],
  pch = 19,
  xlab = "Petal Length (cm)",
  ylab = "Petal Width (cm)")
- Create K-means clusters.
# Group the rows into three clusters using the first four columns of iris.
# nstart = 10 makes kmeans() try ten random starting configurations, so the
# cluster numbering can differ from run to run.
clusters <- kmeans(iris[, 1:4], centers = 3, nstart = 10)
- Plot each cluster as a shape and plot centroid of clusters.
# Colour encodes the true species, plotting symbol encodes the assigned
# cluster, so disagreements show up as a colour/symbol mismatch. Explicit axis
# titles replace the default deparsed expressions and match the
# decision-boundary plot later in this lab.
plot(
  x = iris$Petal.Length,
  y = iris$Petal.Width,
  col = palette[as.numeric(iris$Species)],
  pch = clusters$cluster,
  xlab = "Petal Length (cm)",
  ylab = "Petal Width (cm)")
# Overlay the three cluster centroids as heavy blue crosses (pch = 4).
points(
  x = clusters$centers[, "Petal.Length"],
  y = clusters$centers[, "Petal.Width"],
  pch = 4,
  lwd = 4,
  col = "blue")
- View a matrix of the actual vs. predicted clusters.
# Cross-tabulate cluster assignment (rows, labelled "x") against the true
# species (columns, labelled "y"). Cluster numbers are just labels, so the
# rows need not line up with the species order.
table(clusters$cluster, iris$Species, dnn = c("x", "y"))
## y
## x setosa versicolor virginica
## 1 0 2 36
## 2 0 48 14
## 3 50 0 0
Split Data into Test and Training Set
- Set the seed to make randomness reproducible
# Fix the random number generator state so the sample() call that follows
# selects the same rows on every run.
set.seed(42)
- Randomly sample 100 of 150 row indexes
# Draw 100 distinct row numbers (sampling without replacement) for the
# training set. The candidate pool is derived from the data rather than a
# hard-coded 1:150, so it cannot drift out of step with the number of rows in
# iris; for the 150-row data set the draw is unchanged.
indexes <- sample(
  x = seq_len(nrow(iris)),
  size = 100)
- Create a training set from indexes
# Keep the sampled rows (all columns) as the training set.
train <- iris[indexes, ]
- Create a test set from remaining indexes
# Negative indexing drops the sampled rows, leaving the rows that were not
# drawn for training as the test set.
test <- iris[-indexes, ]
Predict using Decision Tree
- Load the decision tree package
library(tree)
- Train tree model
# Fit a classification tree that predicts Species from every other column of
# the training set.
treeModel <- tree(Species ~ ., data = train)
- Inspect the model
# Print the fitted tree's summary: variables used, number of terminal nodes,
# residual deviance and training misclassification rate.
summary(treeModel)
##
## Classification tree:
## tree(formula = Species ~ ., data = train)
## Variables actually used in tree construction:
## [1] "Petal.Length" "Petal.Width"
## Number of terminal nodes: 4
## Residual mean deviance: 0.05213 = 5.004 / 96
## Misclassification error rate: 0.01 = 1 / 100
- Plot the model
# Draw the tree's branch structure first; text() then adds the labels onto
# the plot that is already open.
plot(treeModel)
text(treeModel)
- Plot the decision boundaries
# Petal scatterplot coloured by species, titled and labelled for presentation.
plot(
  iris$Petal.Length,
  iris$Petal.Width,
  col = palette[as.integer(iris$Species)],
  pch = 19,
  xlab = "Petal Length (cm)",
  ylab = "Petal Width (cm)",
  main = "Iris Petal Length vs. Width")
# add = TRUE draws the tree's partition on top of the scatterplot above
# instead of starting a new figure.
partition.tree(treeModel, label = "Species", add = TRUE)
- Predict with the model
# Classify the held-out rows; type = "class" asks for a predicted class label
# per row.
treePredictions <- predict(treeModel, newdata = test, type = "class")
- Inspect the prediction accuracy
# Predicted species (rows, labelled "x") against actual species (columns,
# labelled "y"); off-diagonal counts are misclassified test rows.
table(treePredictions, test$Species, dnn = c("x", "y"))
## y
## x setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 16 0
## virginica 0 2 15
- Load the caret package
library(caret)
- Evaluate the prediction results
# Confusion matrix plus overall and per-class statistics for the tree's
# test-set predictions, measured against the true species.
confusionMatrix(treePredictions, reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 16 0
## virginica 0 2 15
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.8629, 0.9951)
## No Information Rate : 0.36
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.00 0.8889 1.0000
## Specificity 1.00 1.0000 0.9429
## Pos Pred Value 1.00 1.0000 0.8824
## Neg Pred Value 1.00 0.9412 1.0000
## Prevalence 0.34 0.3600 0.3000
## Detection Rate 0.34 0.3200 0.3000
## Detection Prevalence 0.34 0.3200 0.3400
## Balanced Accuracy 1.00 0.9444 0.9714
Predict using Naive Bayes Classifier
- Load the e1071 package
library(e1071)
- Train the model
# Fit a naive Bayes classifier that predicts Species from every other column
# of the training set.
bayesModel <- naiveBayes(
formula = Species ~ .,
data = train)
- Inspect the model
# For this model summary() only lists the components of the fitted object
# (apriori, tables, levels, call) with their length, class and mode.
summary(bayesModel)
## Length Class Mode
## apriori 3 table numeric
## tables 4 -none- list
## levels 3 -none- character
## call 4 -none- call
- Predict with the model
# Predict for the held-out rows, passing only the first four columns of the
# test set to the model.
bayesPredictions <- predict(bayesModel, newdata = test[1:4])
- Evaluate the prediction results
# Confusion matrix plus overall and per-class statistics for the naive Bayes
# test-set predictions, measured against the true species.
confusionMatrix(bayesPredictions, reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 16 1
## virginica 0 2 14
##
## Overall Statistics
##
## Accuracy : 0.94
## 95% CI : (0.8345, 0.9875)
## No Information Rate : 0.36
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9099
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.00 0.8889 0.9333
## Specificity 1.00 0.9688 0.9429
## Pos Pred Value 1.00 0.9412 0.8750
## Neg Pred Value 1.00 0.9394 0.9706
## Prevalence 0.34 0.3600 0.3000
## Detection Rate 0.34 0.3200 0.2800
## Detection Prevalence 0.34 0.3400 0.3200
## Balanced Accuracy 1.00 0.9288 0.9381
Predict with Neural Network
- Load the Neural Network package
library(nnet)
- Train the model
# Fit a single-hidden-layer network that predicts Species from every other
# column of the training set. size = 4 gives the four hidden units of the
# 4-4-3 network reported by summary(); decay is the weight-decay setting and
# maxit presumably caps the number of training iterations (confirm in ?nnet).
# No seed is set here, so the starting weights depend on the RNG state left
# by the earlier set.seed()/sample() steps.
neuralModel <- nnet(
formula = Species ~ .,
data = train,
size = 4,
decay = 0.0001,
maxit = 500)
- Inspect the model
# Print the network layout and fitted weights: b = bias, i = input,
# h = hidden unit, o = output unit.
summary(neuralModel)
## a 4-4-3 network with 35 weights
## options were - softmax modelling decay=1e-04
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1
## -1.88 -6.20 12.82 1.37 0.09
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2
## 6.61 5.95 8.32 -10.54 -9.20
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3
## 0.42 0.92 1.12 -2.72 -1.33
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4
## -0.41 -0.81 -1.10 2.40 1.15
## b->o1 h1->o1 h2->o1 h3->o1 h4->o1
## 0.43 0.49 2.17 6.38 -6.22
## b->o2 h1->o2 h2->o2 h3->o2 h4->o2
## -3.42 -9.28 17.97 -6.43 3.07
## b->o3 h1->o3 h2->o3 h3->o3 h4->o3
## 2.98 8.78 -20.14 0.05 3.15
- Load the NeuralNetTools package
library(NeuralNetTools)
- Visualize the neural network
# Draw a diagram of the fitted network using NeuralNetTools.
plotnet(neuralModel)
- Predict with the model
# Predict for the held-out rows from the first four columns of the test set;
# type = "class" asks for class labels rather than per-class scores.
neuralPredictions <- predict(neuralModel, newdata = test[1:4], type = "class")
- Evaluate the prediction results
# predict(..., type = "class") on an nnet model yields a character vector,
# while current caret releases make confusionMatrix() stop unless `data` and
# `reference` are both factors with the same levels. Coercing with the
# reference's levels makes the call work and keeps the species order, even if
# some species is never predicted.
confusionMatrix(
  data = factor(neuralPredictions, levels = levels(test$Species)),
  reference = test$Species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 17 0 0
## versicolor 0 17 0
## virginica 0 1 15
##
## Overall Statistics
##
## Accuracy : 0.98
## 95% CI : (0.8935, 0.9995)
## No Information Rate : 0.36
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.97
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.00 0.9444 1.0000
## Specificity 1.00 1.0000 0.9714
## Pos Pred Value 1.00 1.0000 0.9375
## Neg Pred Value 1.00 0.9697 1.0000
## Prevalence 0.34 0.3600 0.3000
## Detection Rate 0.34 0.3400 0.3000
## Detection Prevalence 0.34 0.3400 0.3200
## Balanced Accuracy 1.00 0.9722 0.9857
- Set working directory
# NOTE(review): hard-coded absolute Windows path. setwd() raises an error if
# the directory does not exist, and it changes the working directory for the
# rest of the session; consider passing a full path to save() instead.
setwd("C:/Workshop/Data")
- Save the tree model
# Write the fitted tree to Tree.RData, resolved against the current working
# directory, so it can be restored later with load().
save(list = "treeModel", file = "Tree.RData")