1. Load all required libraries.
import os
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
2. Set the working directory to "C:\Workshop\Data".
# Change into the workshop data directory.  A raw string keeps "\W" and "\D"
# from being parsed as escape sequences — invalid escapes raise a
# SyntaxWarning on Python 3.12+ (and will eventually be errors).
os.chdir(r"C:\Workshop\Data")
3. Read the Titanic CSV file into a data frame called titanic.
# Load the Titanic passenger manifest into a DataFrame.
titanic = pd.read_csv(filepath_or_buffer="Titanic.csv")
1. Inspect the data using the head
function.
# Peek at the first rows (n=5 is the default).
titanic.head(n=5)
2. Summarize the columns in the data frame using the info
function.
# Concise summary: column names, non-null counts, and dtypes.
titanic.info()
3. Summarize the data in the data frame using the describe
function.
# Descriptive statistics; include="all" covers non-numeric columns too.
titanic.describe(include="all")
4. Create a correlation matrix using the corr
function.
# Pairwise correlations between the numeric columns.  numeric_only=True is
# required on pandas >= 2.0, where DataFrame.corr() no longer silently drops
# non-numeric columns (e.g. name, sex) and raises a ValueError instead.
correlations = titanic.corr(numeric_only=True)
5. Create a correlogram using the seaborn heatmap
function.
# Correlogram: diverging palette, red (hue 10) for negative through
# blue (hue 220) for positive correlations.
palette = sns.diverging_palette(h_neg=10, h_pos=220, as_cmap=True)
sns.heatmap(data=correlations, cmap=palette);
6. Inspect missing values with the isnull
and sum
functions.
# Count missing values per column (isna is the modern alias of isnull).
titanic.isna().sum()
Note: It may be helpful to inspect the result of each transformation using the head
function.
1. Assign the raw data to a temporary data frame.
# Work on an independent copy.  Plain assignment (temp = titanic) merely
# aliases the same DataFrame, so the in-place imputation and label
# replacement below would silently mutate the raw data as well.
temp = titanic.copy()
2. Encode the categorical variable sex as one-hot dummy variables for female and male.
# One-hot encode sex into female/male indicator columns and append them.
sex_dummies = pd.get_dummies(temp.sex)
temp = pd.concat([temp, sex_dummies], axis=1)
3. Impute missing values for Age using the mean age.
# Impute missing ages with the column mean.
temp.age = temp.age.fillna(temp.age.mean())
4. Engineer a new feature named family as the total siblings, spouses, parents, and children.
# Family size = siblings/spouses aboard + parents/children aboard.
temp["family"] = temp["sibsp"] + temp["parch"]
5. Encode the integer survived variable as a categorical variable with levels "Yes" and "No".
# Recode survived 1/0 as "Yes"/"No".  Column assignment replaces the original
# attribute-chained inplace=True call, which raises a FutureWarning on
# pandas 2.x (chained assignment with inplace) and may fail to write back.
temp["survived"] = temp["survived"].replace((1, 0), ('Yes', 'No'))
6. Select only the following features: pclass, male, female, age, family, and survived.
# Keep only the modelling features plus the label column.
keep = ["pclass", "male", "female", "age", "family", "survived"]
temp = temp.loc[:, keep]
7. Rename the selected columns: Class, Male, Female, Age, Family, and Survived.
# Title-case the selected column names.
temp = temp.rename(columns={
    "pclass": "Class", "male": "Male", "female": "Female",
    "age": "Age", "family": "Family", "survived": "Survived"})
8. Inspect the transformed data with the head
function.
# Sanity-check the transformed frame (n=5 is the default).
temp.head(n=5)
9. Create a data frame of the features named X.
# Feature matrix: every column except the Survived label.
X = temp.iloc[:, :5]
10. Create a new series for the labels named y.
# Label series ("Yes"/"No").
y = temp["Survived"]
11. Scale the feature data using the standard scaler.
# Standardize features to zero mean / unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
1. Set the random number seed to 42
# Fix NumPy's global RNG so the split below is reproducible.
np.random.seed(seed=42)
2. Create stratified training and test sets (80/20).
# 80/20 split, stratified on the label so both sets keep the class ratio.
# NOTE(review): X_scaled is computed above but never used — the split (and
# every model below) trains on the unscaled features; confirm intent.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, train_size=0.80, test_size=0.20)
1. Create a KNN model.
# Base KNN estimator; n_neighbors=5 is the library default and is
# overridden by the grid search below anyway.
knn_model = KNeighborsClassifier(n_neighbors=5)
2. Define the KNN hyperparameters to test (i.e. k = {5, 7, 9, 11, 13})
# Candidate neighbourhood sizes for the grid search.
knn_params = [5, 7, 9, 11, 13]
knn_param_grid = dict(n_neighbors=knn_params)
3. Create 10 KNN models for each of the five hyper-parameters using 10-fold cross validation.
# Exhaustive search: 5 candidate k values x 10 CV folds = 50 fits,
# ranked by accuracy.
knn_models = GridSearchCV(
    estimator=knn_model,
    param_grid=knn_param_grid,
    cv=10,
    scoring="accuracy",
    verbose=1)
4. Train all 50 models using the training set.
# Run the grid search on the training partition.
knn_models.fit(X_train, y_train)
5. Get the average accuracy for each of the five hyperparameters.
# Mean cross-validated accuracy, one entry per candidate k.
knn_avg_scores = knn_models.cv_results_["mean_test_score"]
6. Display the average accuracy for each hyper-parameter.
# Report mean CV accuracy for each candidate k.
for k, score in zip(knn_params, knn_avg_scores):
    print("{:>3} : {:0.3f}".format(k, score))
7. Plot the change in accuracy over each hyper-parameter.
# Accuracy as a function of k.
plt.plot(knn_params, knn_avg_scores)
plt.xlabel("k (neighbors)")
plt.ylabel("Accuracy")
plt.show()
8. Get the hyper-parameter, average accuracy, and standard error of the top performing model.
# Index of the best mean CV score, then the winning k, its mean accuracy,
# and the fold-to-fold standard deviation (used here as a spread estimate).
knn_top_index = np.argmax(knn_avg_scores)
knn_top_param = knn_params[knn_top_index]
knn_top_score = knn_avg_scores[knn_top_index]
knn_top_error = knn_models.cv_results_["std_test_score"][knn_top_index]
9. Inspect the top performing model.
# One-line summary of the winning KNN configuration.
print(f"Top knn model is k = {knn_top_param:d} at "
      f"{knn_top_score:0.2f} +/- {knn_top_error:0.3f} accuracy")
1. Create a decision tree model.
# CART decision tree with default settings; max_depth is tuned below.
tree_model = DecisionTreeClassifier()
2. Define the hyper-parameters to test (i.e. max_depth = {3, 4, 5, 6, 7}).
# Candidate tree depths for the grid search.
tree_params = [3, 4, 5, 6, 7]
tree_param_grid = dict(max_depth=tree_params)
3. Create 10 tree models for each of the 5 hyper-parameters using 10-fold cross validation.
# Exhaustive search: 5 candidate depths x 10 CV folds = 50 fits,
# ranked by accuracy.
tree_models = GridSearchCV(
    estimator=tree_model,
    param_grid=tree_param_grid,
    cv=10,
    scoring="accuracy",
    verbose=1)
4. Train all 50 models using the training set.
# Run the grid search on the training partition.
tree_models.fit(X_train, y_train)
5. Get the average accuracy for each hyper-parameter.
# Mean cross-validated accuracy, one entry per candidate depth.
tree_avg_scores = tree_models.cv_results_["mean_test_score"]
6. Display the average accuracy for each hyper-parameter.
# Report mean CV accuracy for each candidate depth.
for depth, score in zip(tree_params, tree_avg_scores):
    print("{:>3} : {:0.3f}".format(depth, score))
7. Plot the change in accuracy over each hyper-parameter.
# Accuracy as a function of maximum tree depth.
plt.plot(tree_params, tree_avg_scores)
plt.xlabel("Max Depth (nodes)")
plt.ylabel("Accuracy")
plt.show()
8. Get the hyper-parameter, average accuracy, and standard error for the top-performing model.
# Index of the best mean CV score, then the winning depth, its mean
# accuracy, and the fold-to-fold standard deviation.
tree_top_index = np.argmax(tree_avg_scores)
tree_top_param = tree_params[tree_top_index]
tree_top_score = tree_avg_scores[tree_top_index]
tree_top_error = tree_models.cv_results_["std_test_score"][tree_top_index]
9. Inspect the top-performing model.
# Summary of the winning tree.  Two fixes: the spread placeholder was
# "{:0.3}" (3 significant digits) instead of the "{:0.3f}" (3 decimal
# places) used by every other model summary, and the label now names the
# hyper-parameter actually tuned (max_depth, not k).
print("Top tree model is max_depth = {:d} at {:0.2f} +/- {:0.3f} accuracy"
      .format(tree_top_param, tree_top_score, tree_top_error))
1. Create a neural network model with tanh activation functions and 5000 max iterations.
# Multi-layer perceptron with tanh activations; a generous iteration cap
# gives the solver room to converge.
neural_model = MLPClassifier(activation="tanh", max_iter=5000)
2. Define hyper-parameters to test (i.e. hidden_layer_sizes = {3, 4, 5, 6, 7}).
# Candidate hidden-layer widths (a single hidden layer of n nodes).
neural_params = [3, 4, 5, 6, 7]
neural_param_grid = dict(hidden_layer_sizes=neural_params)
3. Create 10 models for each of the 5 hyper-parameters using 10-fold cross validation.
# Exhaustive search: 5 candidate widths x 10 CV folds = 50 fits,
# ranked by accuracy.
neural_models = GridSearchCV(
    estimator=neural_model,
    param_grid=neural_param_grid,
    cv=10,
    scoring="accuracy",
    verbose=1)
4. Train all 50 models using the training set.
Note: This could take a few minutes.
# Run the grid search on the training partition (slow: 50 MLP fits).
neural_models.fit(X_train, y_train)
5. Get the average accuracy for each hyper-parameter.
# Mean cross-validated accuracy, one entry per candidate width.
neural_avg_scores = neural_models.cv_results_["mean_test_score"]
6. Display the average accuracy for each hyper-parameter.
# Report mean CV accuracy for each candidate hidden-layer width.
for width, score in zip(neural_params, neural_avg_scores):
    print("{:>3} : {:0.3f}".format(width, score))
7. Plot the change in accuracy over each hyper-parameter.
# Accuracy as a function of hidden-layer width.
plt.plot(neural_params, neural_avg_scores)
plt.xlabel("Hidden Layer Nodes")
plt.ylabel("Accuracy")
plt.show()
8. Get the hyper-parameter, average accuracy, and standard error for the top-performing model.
# Index of the best mean CV score, then the winning width, its mean
# accuracy, and the fold-to-fold standard deviation.
neural_top_index = np.argmax(neural_avg_scores)
neural_top_param = neural_params[neural_top_index]
neural_top_score = neural_avg_scores[neural_top_index]
neural_top_error = neural_models.cv_results_["std_test_score"][neural_top_index]
9. Inspect the top-performing model.
# Summary of the winning network.  Fixed: the label said "k =" but the
# tuned hyper-parameter is the hidden-layer width (hidden_layer_sizes).
print("Top nnet model is hidden_layer_sizes = {:d} at {:0.2f} +/- {:0.3f} accuracy"
      .format(neural_top_param, neural_top_score, neural_top_error))
1. Compare the top three performers numerically.
# Side-by-side numeric comparison of the three tuned models.
print(f"KNN: {knn_top_score:0.2f} +/- {knn_top_error:0.3f} accuracy")
print(f"Tree: {tree_top_score:0.2f} +/- {tree_top_error:0.3f} accuracy")
print(f"NNet: {neural_top_score:0.2f} +/- {neural_top_error:0.3f} accuracy")
2. Compare the top-three performing models visually.
# Horizontal error-bar chart comparing the three tuned models on a common
# 0-1 accuracy axis.
plt.errorbar(
    x = [knn_top_score, tree_top_score, neural_top_score],
    y = ["KNN", "Tree", "NNet"],
    xerr = [knn_top_error, tree_top_error, neural_top_error],
    linestyle = "none",
    marker = "o")
plt.xlim(0, 1)
plt.show()  # fixed: every other figure in this script is shown explicitly
3. Question: Which model would you choose based on this information?
1. Create a final model based on the top-performing algorithm and hyper-parameter.
# Final model: the winning algorithm/hyper-parameter combination.
final_model = DecisionTreeClassifier(max_depth=3)
2. Train the final model using the entire training set.
# Refit on the full training partition (no cross-validation this time).
final_model.fit(X_train, y_train)
3. Predict the labels of the hold-out test set.
# Predict labels for the hold-out test set.
final_predictions = final_model.predict(X=X_test)
4. Get the final prediction accuracy.
# Hold-out accuracy: fraction of test labels predicted correctly.
final_score = accuracy_score(y_test, final_predictions)
5. Inspect the final prediction accuracy.
# Display the final hold-out accuracy.
print(final_score)
Question to be answered: How likely is it that Jack will survive the Titanic?
1. Create an input feature data frame for Jack.
# Jack: 3rd class, male, age 20, travelling alone — one-row feature frame
# with the same columns the final model was trained on.
X_jack = pd.DataFrame(
    data=[[3, 1, 0, 20, 0]],
    columns=["Class", "Male", "Female", "Age", "Family"])
2. Predict if Jack survives.
# Predicted class label for Jack ("Yes" or "No").
final_model.predict(X_jack)[0]
3. What is the likelihood that Jack survives?
# Predicted probability for the second class column.
# NOTE(review): column order follows final_model.classes_, which sklearn
# sorts ("No", "Yes"), so index 1 should be P("Yes") — confirm.
final_model.predict_proba(X_jack)[0][1]
4. Question: Would you take that ticket?