1. Load all required libraries.
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
2. Set the working directory to "C:\Workshop\Data".
# Use a raw string for the Windows path so backslashes are never treated as
# escape sequences ("\W" only works by accident; "\t" or "\n" would break).
os.chdir(r"C:\Workshop\Data")
3. Read the Risk.csv file into a data frame called policies.
# Load the policy records from Risk.csv (in the working directory set above).
policies = pd.read_csv("Risk.csv")
1. Inspect the policies data using the head
function.
# Peek at the first five rows to sanity-check the load.
policies.head()
2. Summarize the columns in the data frame using the info
function.
# Column names, non-null counts, and dtypes.
policies.info()
3. Summarize the data in the data frame using the describe
function.
# Summary statistics for every column, categorical columns included.
policies.describe(include = "all")
4. Create a correlation matrix using the corr
function.
# Restrict to numeric columns: pandas >= 2.0 raises a TypeError when corr()
# meets non-numeric data (e.g. the string Gender values in this frame).
correlations = policies.corr(numeric_only = True)
5. Create a correlogram using the seaborn heatmap
function.
# Correlogram: diverging palette so negative and positive correlations
# read as opposite hues. Trailing semicolon suppresses the notebook repr.
correlation_palette = sns.diverging_palette(
    h_neg = 10,
    h_pos = 220,
    as_cmap = True)
sns.heatmap(correlations, cmap = correlation_palette);
6. Inspect missing values with the isnull
and sum
functions.
# Count missing values per column (isna is the modern alias of isnull).
policies.isna().sum()
1. Assign the following features to a data frame named X: Gender, State Rate, Height, Weight, BMI, and Age.
# Select the model features. .copy() makes X an independent frame so the
# in-place Gender encoding on the next code line cannot raise a
# SettingWithCopyWarning (or silently fail) by writing through a view.
X = policies[["Gender", "State_Rate", "Height", "Weight", "BMI", "Age"]].copy()
2. Encode the categorical Gender variable {Female, Male} as an integer {0, 1}.
# Encode Gender {Female, Male} -> {0, 1}. Reassign rather than mutating via
# chained attribute access: `X.Gender.replace(..., inplace=True)` writes to
# a possibly-temporary Series (SettingWithCopyWarning, may not update X),
# and Series.replace(inplace=True) through a chain is deprecated in pandas 2.x.
X = X.replace({"Gender": {"Female": 0, "Male": 1}})
3. Inspect the transformed data with the head
function.
# Confirm Gender is now numeric (0 = Female, 1 = Male).
X.head()
4. Create a new series for the labels named y.
# Target labels: the Risk class of each policy.
y = policies["Risk"]
5. Scale the feature data using the standard scaler.
# Standardize every feature to zero mean and unit variance.
# fit_transform combines the separate fit / transform calls.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
1. Set the random number seed to 42.
# Fix the global NumPy seed so downstream random operations are repeatable.
np.random.seed(42)
2. Create stratified training and test sets (80/20).
# Split the *scaled* features: X_scaled was computed above but never used,
# so distance-based KNN and the neural network were training on raw units.
# random_state pins the split regardless of the global NumPy seed.
# NOTE(review): any new observation (e.g. X_jack below) must now pass
# through scaler.transform() before predict() — confirm downstream usage.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    stratify = y,
    train_size = 0.80,
    test_size = 0.20,
    random_state = 42)
1. Create a KNN model.
# Baseline KNN classifier; n_neighbors is tuned by the grid search below.
knn_model = KNeighborsClassifier()
2. Define the KNN hyperparameters to test (i.e. k = {5, 7, 9, 11, 13})
# Candidate neighbor counts for the grid search; keep the bare list for
# the per-parameter reporting further down.
knn_param_grid = {"n_neighbors": [5, 7, 9, 11, 13]}
knn_params = knn_param_grid["n_neighbors"]
3. Create 10 KNN models for each of the five hyper-parameters using 10-fold cross validation.
# Grid search: 10-fold cross-validation over each candidate k, accuracy scored.
knn_models = GridSearchCV(
    knn_model,
    knn_param_grid,
    scoring = "accuracy",
    cv = 10,
    verbose = 1)
4. Train all 50 models using the training set.
# Fit every (k, fold) combination on the training partition.
knn_models.fit(X_train, y_train)
5. Get the average accuracy for each of the five hyperparameters.
# Mean cross-validated accuracy for each candidate k, in grid order.
knn_avg_scores = knn_models.cv_results_["mean_test_score"]
6. Display the average accuracy for each hyper-parameter.
# Report mean accuracy per candidate k. Iterate the (param, score) pairs
# directly instead of a hard-coded range(0, 5), so the loop stays correct
# if the parameter grid changes size.
for k, score in zip(knn_params, knn_avg_scores):
    print("{:>3} : {:0.3f}".format(k, score))
7. Plot the change in accuracy over each hyper-parameter.
# Accuracy as a function of the neighbor count.
plt.plot(knn_params, knn_avg_scores)
plt.xlabel("k (neighbors)")
plt.ylabel("Accuracy")
plt.show()
8. Get the hyper-parameter, average accuracy, and standard error of the top performing model.
# Locate the best-scoring k, then pull its mean accuracy and the standard
# deviation of its fold scores.
knn_top_index = int(np.argmax(knn_avg_scores))
knn_top_param, knn_top_score = (knn_params[knn_top_index],
                                knn_avg_scores[knn_top_index])
knn_top_error = knn_models.cv_results_["std_test_score"][knn_top_index]
9. Inspect the top performing model.
# One-line summary of the winning KNN configuration.
print(f"Top knn model is k = {knn_top_param:d} at "
      f"{knn_top_score:0.2f} +/- {knn_top_error:0.3f} accuracy")
1. Create a decision tree model.
# Decision tree classifier; max_depth is tuned by the grid search below.
tree_model = DecisionTreeClassifier()
2. Define the hyper-parameters to test (i.e. max_depth = {3, 4, 5, 6, 7}).
# Candidate tree depths for the grid search.
tree_param_grid = {"max_depth": list(range(3, 8))}
tree_params = tree_param_grid["max_depth"]
3. Create 10 tree models for each of the 5 hyper-parameters using 10-fold cross validation.
# Grid search: 10-fold cross-validation over each candidate depth.
tree_models = GridSearchCV(
    tree_model,
    tree_param_grid,
    scoring = "accuracy",
    cv = 10,
    verbose = 1)
4. Train all 50 models using the training set.
# Fit every (depth, fold) combination on the training partition.
tree_models.fit(X_train, y_train)
5. Get the average accuracy for each hyper-parameter.
# Mean cross-validated accuracy for each candidate depth, in grid order.
tree_avg_scores = tree_models.cv_results_["mean_test_score"]
6. Display the average accuracy for each hyper-parameter.
# Report mean accuracy per candidate depth. Zip the pairs instead of a
# hard-coded range(0, 5) so the loop adapts to the grid size.
for depth, score in zip(tree_params, tree_avg_scores):
    print("{:>3} : {:0.3f}".format(depth, score))
7. Plot the change in accuracy over each hyper-parameter.
# Accuracy as a function of the maximum tree depth.
plt.plot(tree_params, tree_avg_scores)
plt.xlabel("Max Depth (nodes)")
plt.ylabel("Accuracy")
plt.show()
8. Get the hyper-parameter, average accuracy, and standard error for the top-performing model.
# Locate the best-scoring depth, then pull its mean accuracy and the
# standard deviation of its fold scores.
tree_top_index = int(np.argmax(tree_avg_scores))
tree_top_param, tree_top_score = (tree_params[tree_top_index],
                                  tree_avg_scores[tree_top_index])
tree_top_error = tree_models.cv_results_["std_test_score"][tree_top_index]
9. Inspect the top-performing model.
# Report the winning tree. Two fixes: the hyper-parameter is max_depth (not
# "k", which is KNN's), and {:0.3} (general format) is replaced with the
# fixed-point {:0.3f} used by every other report in this script.
print("Top tree model is max_depth = {:d} at {:0.2f} +/- {:0.3f} accuracy"
      .format(tree_top_param, tree_top_score, tree_top_error))
1. Create a neural network model with tanh activation functions and 5000 max iterations.
# MLP classifier with tanh activations, trained by plain SGD; a generous
# iteration cap so training can converge inside cross-validation.
# hidden_layer_sizes is tuned by the grid search below.
neural_model = MLPClassifier(
    activation = "tanh",
    solver = "sgd",
    max_iter = 5000)
2. Define hyper-parameters to test (i.e. hidden_layer_sizes = {3, 4, 5, 6, 7}).
# Candidate hidden-layer node counts for the grid search.
neural_param_grid = {"hidden_layer_sizes": list(range(3, 8))}
neural_params = neural_param_grid["hidden_layer_sizes"]
3. Create 10 models for each of the 5 hyper-parameters using 10-fold cross validation.
# Grid search: 10-fold cross-validation over each candidate layer size.
neural_models = GridSearchCV(
    neural_model,
    neural_param_grid,
    scoring = "accuracy",
    cv = 10,
    verbose = 1)
4. Train all 50 models using the training set.
Note: This could take a few minutes.
# Fit every (layer size, fold) combination — the slowest step in the script.
neural_models.fit(X_train, y_train)
5. Get the average accuracy for each hyper-parameter.
# Mean cross-validated accuracy for each candidate layer size, in grid order.
neural_avg_scores = neural_models.cv_results_["mean_test_score"]
6. Display the average accuracy for each hyper-parameter.
# Report mean accuracy per candidate layer size. Zip the pairs instead of
# a hard-coded range(0, 5) so the loop adapts to the grid size.
for nodes, score in zip(neural_params, neural_avg_scores):
    print("{:>3} : {:0.3f}".format(nodes, score))
7. Plot the change in accuracy over each hyper-parameter.
# Accuracy as a function of the hidden-layer node count.
plt.plot(neural_params, neural_avg_scores)
plt.xlabel("Hidden Layer Nodes")
plt.ylabel("Accuracy")
plt.show()
8. Get the hyper-parameter, average accuracy, and standard error for the top-performing model.
# Locate the best-scoring layer size, then pull its mean accuracy and the
# standard deviation of its fold scores.
neural_top_index = int(np.argmax(neural_avg_scores))
neural_top_param, neural_top_score = (neural_params[neural_top_index],
                                      neural_avg_scores[neural_top_index])
neural_top_error = neural_models.cv_results_["std_test_score"][neural_top_index]
9. Inspect the top-performing model.
# Report the winning network. Fix: the hyper-parameter being reported is
# hidden_layer_sizes, not "k" (which is KNN's hyper-parameter).
print("Top nnet model is hidden_layer_sizes = {:d} at {:0.2f} +/- {:0.3f} accuracy"
      .format(neural_top_param, neural_top_score, neural_top_error))
1. Compare the top three performers numerically.
# Side-by-side numeric comparison of the three winning models.
for label, score, err in (("KNN", knn_top_score, knn_top_error),
                          ("Tree", tree_top_score, tree_top_error),
                          ("NNet", neural_top_score, neural_top_error)):
    print(f"{label}: {score:0.2f} +/- {err:0.3f} accuracy")
2. Compare the top-three performing models visually.
# Visual comparison: mean accuracy with +/- one standard deviation bars,
# x-axis pinned to the full [0, 1] accuracy range.
plt.errorbar(
    x = [knn_top_score, tree_top_score, neural_top_score],
    y = ["KNN", "Tree", "NNet"],
    xerr = [knn_top_error, tree_top_error, neural_top_error],
    linestyle = "none",
    marker = "o")
plt.xlim(0, 1)
plt.show()  # was missing; every other figure in this script calls plt.show()
3. Question: Which model would you choose based on this information?
1. Create a final model based on the top-performing algorithm and hyper-parameter.
# Final model: decision tree with max_depth = 3, the winner chosen from
# the grid searches above (re-check this choice if the data changes).
final_model = DecisionTreeClassifier(
    max_depth = 3)
2. Train the final model using the entire training set.
# Retrain the chosen configuration on the full training partition.
final_model.fit(X_train, y_train)
3. Predict the labels of the hold-out test set.
# Hold-out predictions from the final model.
final_predictions = final_model.predict(X_test)
4. Get the final prediction accuracy.
# Accuracy of the final model on the untouched hold-out set.
final_score = accuracy_score(y_test, final_predictions)
5. Inspect the final prediction accuracy.
# Display the hold-out accuracy.
print(final_score)
Question to be answered: Is Jack (from the Titanic) a high risk or low risk policy?
1. Create an input feature for Jack.
# Single-row feature frame for Jack (Gender 1 = Male; same column order as X).
X_jack = pd.DataFrame({
    "Gender": [1],
    "State_Rate": [0.09080315],
    "Height": [183],
    "Weight": [75],
    "BMI": [22.4],
    "Age": [20]})
2. Predict the risk class of Jack.
# Predicted risk class for Jack (first and only row of X_jack).
final_model.predict(X_jack)[0]
3. Predict the probability that Jack belongs to the above risk class.
# Probability of the class actually predicted for Jack. The original
# hard-coded [0][1], but predict_proba columns follow final_model.classes_
# order, so column 1 is only "the above risk class" by coincidence.
jack_class_index = list(final_model.classes_).index(final_model.predict(X_jack)[0])
final_model.predict_proba(X_jack)[0][jack_class_index]
4. Question: Would you offer life insurance to Jack?