# Load and explore the insurance risk data set.
# NOTE(review): setwd() plus a relative path ties this script to one machine;
# prefer passing the full path to read.csv() (or using here::here()) so the
# script does not mutate the caller's working directory.
setwd("C:/Workshop/Data")
raw <- read.csv("Risk.csv")
# Peek at the first rows to confirm the expected columns loaded.
head(raw)
## Gender State State.Rate Height Weight BMI Age Risk
## 1 Male MA 0.10043368 184 67.8 20.02599 77 High
## 2 Male VA 0.14172319 163 89.4 33.64824 82 High
## 3 Male NY 0.09080315 170 81.2 28.09689 31 Low
## 4 Male TN 0.11997276 175 99.7 32.55510 39 Low
## 5 Male FL 0.11034460 184 72.1 21.29608 68 High
## 6 Male WA 0.16292470 166 98.4 35.70910 64 High
# Five-number summaries; note the two-level Risk target (High/Low).
summary(raw)
## Gender State State.Rate Height
## Female:956 CA : 191 Min. :0.0010 Min. :150.0
## Male :986 TX : 169 1st Qu.:0.1103 1st Qu.:162.0
## FL : 104 Median :0.1276 Median :170.0
## NY : 94 Mean :0.1381 Mean :169.7
## IL : 80 3rd Qu.:0.1443 3rd Qu.:176.0
## OH : 77 Max. :0.3181 Max. :190.0
## (Other):1227
## Weight BMI Age Risk
## Min. : 44.10 Min. :16.02 Min. :18.00 High: 576
## 1st Qu.: 68.60 1st Qu.:23.74 1st Qu.:34.00 Low :1366
## Median : 81.30 Median :28.06 Median :51.00
## Mean : 81.16 Mean :28.29 Mean :50.84
## 3rd Qu.: 93.80 3rd Qu.:32.46 3rd Qu.:68.00
## Max. :116.50 Max. :46.80 Max. :84.00
##
# Pairwise scatter-plot matrix of all columns.
plot(raw)
# Correlogram of the variables (corrgram package).
library(corrgram)
corrgram(raw)
# Confirm there are no missing values before modeling.
sum(is.na(raw))
## [1] 0
library(dplyr)
# Keep only the modeling features plus the Risk label; the raw State factor
# is dropped because State.Rate already encodes it numerically.
feature_cols <- c(
  "Gender", "State.Rate", "Height", "Weight", "BMI", "Age", "Risk")
clean <- raw %>%
  select(all_of(feature_cols)) %>%
  as.data.frame()
library(caret)

# Reproducible, stratified 80/20 split on the Risk label.
set.seed(42)
indexes <- createDataPartition(clean$Risk, p = 0.8, list = FALSE, times = 1)
train <- clean[indexes, ]
test <- clean[-indexes, ]

# Shared settings reused by every train() call below:
# center/scale the predictors and score with 10-fold cross-validation.
preProcess <- c("center", "scale")
control <- trainControl(method = "cv", number = 10)
# k-nearest neighbors classifier.
# tuneLength = 5 lets caret pick five candidate k values (here 5, 7, ..., 13).
knnModel <- train(
  Risk ~ .,
  data = train,
  method = "knn",
  metric = "Accuracy",
  preProcess = preProcess,
  trControl = control,
  tuneLength = 5)
print(knnModel)
## k-Nearest Neighbors
##
## 1554 samples
## 6 predictor
## 2 classes: 'High', 'Low'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1398, 1399, 1398, 1399, 1399, 1399, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9562448 0.8953109
## 7 0.9568983 0.8958922
## 9 0.9562531 0.8942799
## 11 0.9549586 0.8916590
## 13 0.9575352 0.8978399
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.
# Cross-validated accuracy as a function of k.
plot(knnModel)
# CART decision tree (rpart).
# tuneLength = 5 lets caret choose five complexity-parameter (cp) candidates.
treeModel <- train(
  Risk ~ .,
  data = train,
  method = "rpart",
  metric = "Accuracy",
  preProcess = preProcess,
  trControl = control,
  tuneLength = 5)
print(treeModel)
## CART
##
## 1554 samples
## 6 predictor
## 2 classes: 'High', 'Low'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1399, 1399, 1399, 1398, 1399, 1399, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.000000000 0.9768401 0.9438743
## 0.006507592 0.9742636 0.9375723
## 0.016268980 0.9632999 0.9129280
## 0.023861171 0.9549623 0.8924622
## 0.843817787 0.8211078 0.4256705
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
# Cross-validated accuracy as a function of cp.
plot(treeModel)
# Single-hidden-layer neural network (nnet).
# Tuning grid: every combination of hidden-layer size (3, 4, 5) and weight
# decay (0.1, 0.01, 0.001) — expand.grid produces the same nine rows, in the
# same order, as the hand-typed vectors did.
neuralTuneGrid <- expand.grid(
  size = c(3, 4, 5),
  decay = c(0.1, 0.01, 0.001))
neuralModel <- train(
  form = Risk ~ .,
  data = train,
  method = "nnet",
  preProcess = preProcess,
  trControl = control,
  tuneGrid = neuralTuneGrid,
  # Accuracy is caret's default metric for classification; stated explicitly
  # here for consistency with the knn and rpart calls above.
  metric = "Accuracy")
print(neuralModel)
## Neural Network
##
## 1554 samples
## 6 predictor
## 2 classes: 'High', 'Low'
##
## Pre-processing: centered (6), scaled (6)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1399, 1398, 1399, 1399, 1398, 1399, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 3 0.001 0.9742514 0.9382290
## 3 0.010 0.9774690 0.9457605
## 3 0.100 0.9742680 0.9383723
## 4 0.001 0.9703846 0.9289725
## 4 0.010 0.9691232 0.9261795
## 4 0.100 0.9729777 0.9354895
## 5 0.001 0.9774731 0.9458424
## 5 0.010 0.9729735 0.9348942
## 5 0.100 0.9749090 0.9401572
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 5 and decay = 0.001.
# 4 Plot model accuracy
# (the line above was a bare "4 Plot model accuracy", which is a parse error
# in R; it is now a comment)
plot(neuralModel)
# Compare the three models on their cross-validation resamples.
cvModels <- list(knn = knnModel, tree = treeModel, nnet = neuralModel)
results <- resamples(cvModels)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: knn, tree, nnet
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 0.9419355 0.9454404 0.9583333 0.9575352 0.9661911 0.9741935 0
## tree 0.9548387 0.9696630 0.9806452 0.9768401 0.9807692 1.0000000 0
## nnet 0.9612903 0.9741935 0.9774814 0.9774731 0.9855149 0.9871795 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## knn 0.8563781 0.8707380 0.8994570 0.8978399 0.9192812 0.9389403 0
## tree 0.8897246 0.9256184 0.9533986 0.9438743 0.9538047 1.0000000 0
## nnet 0.9072597 0.9367811 0.9462005 0.9458424 0.9653263 0.9691700 0
# Visual comparison of the resampled accuracy/kappa distributions.
dotplot(results)
bwplot(results)
densityplot(results, auto.key = TRUE)
# Score the held-out test set with the selected model (the neural net).
final_predictions <- predict(neuralModel, newdata = test)
finalMatrix <- confusionMatrix(
  data = final_predictions,
  reference = test$Risk)
print(finalMatrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction High Low
## High 110 4
## Low 5 269
##
## Accuracy : 0.9768
## 95% CI : (0.9564, 0.9893)
## No Information Rate : 0.7036
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9442
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9565
## Specificity : 0.9853
## Pos Pred Value : 0.9649
## Neg Pred Value : 0.9818
## Prevalence : 0.2964
## Detection Rate : 0.2835
## Detection Prevalence : 0.2938
## Balanced Accuracy : 0.9709
##
## 'Positive' Class : High
##
# Class probabilities for one hypothetical new individual.
new_person <- data.frame(
  Gender = "Male",
  State.Rate = 0.09080315,
  Height = 183,
  Weight = 75,
  BMI = 22.4,
  Age = 20)
predict(neuralModel, newdata = new_person, type = "prob")
## High Low
## 1 0 1