1. Import the OS library.
import os
2. Set the working directory.
# Workshop data directory; Windows-specific path — adjust when running elsewhere.
os.chdir("C:\\Workshop\\Data")
3. Import the pandas library as "pd".
import pandas as pd
4. Read the Rates.csv file into a data frame called policies.
# Load the policy rates data set; expects Rates.csv in the working directory.
policies = pd.read_csv("Rates.csv")
1. Inspect the policy rates data set using the head function.
Note: Notice this data set has a numeric Rate variable instead of a categorical Risk variable.
policies.head()  # preview the first rows; Rate is a numeric column here
2. Import the matplotlib pyplot library as "plt".
import matplotlib.pyplot as plt
3. Create a scatterplot matrix of the policies data set.
Note: The semicolon at the end prevents text output from being displayed with the plot.
# Pairwise scatterplots of every pair of numeric columns. The trailing
# semicolon suppresses the textual return value in a notebook.
pd.plotting.scatter_matrix(policies, alpha=1, s=100, diagonal='none');
4. Create a correlation matrix of the policies data set.
# Pairwise Pearson correlations of the numeric columns.
# numeric_only=True (pandas >= 1.5) keeps this working on pandas >= 2.0,
# where non-numeric columns (Gender is a string column) are no longer
# dropped silently and .corr() would raise instead.
correlations = policies.corr(numeric_only=True)
print(correlations)
5. Import the seaborn library as "sns".
import seaborn as sns
6. Create a correlogram using the correlation matrix.
# Correlogram of the correlation matrix: hue 10 (red) marks negative,
# hue 220 (blue) marks positive correlations.
diverging_cmap = sns.diverging_palette(10, 220, as_cmap=True)
sns.heatmap(correlations, cmap=diverging_cmap);
7. Question: Which variable is most strongly correlated with Rate?
8. Get the correlation between Age and Rate.
# Pearson correlation between the Age and Rate columns.
policies["Age"].corr(policies["Rate"])
9. Create a scatterplot of Rate (on the y-axis) vs Age (on the x-axis).
# Scatterplot of Rate (y) against Age (x), with axis labels.
plt.scatter(policies.Age, policies.Rate)
plt.xlabel("Age")
plt.ylabel("Rate")
plt.show()
1. Inspect the policies data set.
policies.head()  # re-inspect the raw data before building features
2. Create a data frame named X containing the feature variables Gender, Age, State_Rate, and BMI.
# Feature matrix: the four predictor columns (the Rate label is excluded).
feature_columns = ["Gender", "Age", "State_Rate", "BMI"]
X = policies[feature_columns]
3. Inspect the features X.
X.head()  # preview the feature matrix
4. Convert the categorical variable Gender into a set of one-hot-encoding variables.
# One-hot encode Gender: one indicator column per category value.
dummies = pd.get_dummies(X["Gender"])
5. Inspect the one-hot encoded variables.
dummies.head()  # preview the one-hot indicator columns
6. Append the one-hot-encoded gender variables to the features data set X.
# Append the gender indicator columns to the feature matrix (column-wise).
X = pd.concat([X, dummies], axis="columns")
7. Drop the Gender column from the features data frame X.
# Drop the raw Gender column now that it is one-hot encoded.
# Bug fix: the positional axis argument (drop("Gender", 1)) was deprecated
# in pandas 1.0 and removed in pandas 2.0; the columns= keyword is the
# supported, equivalent spelling.
X = X.drop(columns="Gender")
8. Inspect the features data frame X.
X.head()  # confirm Gender is gone and the indicator columns remain
9. Create a series named y containing just the labels (i.e. Rate).
# Label series: the numeric Rate values we want to predict.
y = policies["Rate"]
10. Inspect the series of labels y.
y.head()  # preview the label series
### 4. Create the Training and Test Set
1. Import the numpy library as "np".
import numpy as np
2. Set the random number seed to 42.
# Fix the random seed so the train/test split below is reproducible.
np.random.seed(42)
3. Import the train_test_split function from sklearn.
from sklearn.model_selection import train_test_split
4. Randomly sample 80% of the rows for the training set and 20% of the rows for the test set.
# Randomly partition rows: 80% training, 20% test (seeded above).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, train_size=0.80)
5. Inspect the shape of the training and test sets using the shape property.
# Confirm the split: train/test row counts should be 80% / 20% of the data.
for part_name, part in (("X_train", X_train), ("y_train", y_train),
                        ("X_test", X_test), ("y_test", y_test)):
    print(f"{part_name}: ", part.shape)
1. Import the linear regression class from sklearn.
from sklearn.linear_model import LinearRegression
2. Create a simple linear regression model.
# Ordinary least squares model for the single-feature (Age) regression.
simple_model = LinearRegression()
3. Create a data frame named x1_train containing only the Age feature from the training set.
# Single-column DataFrame (not a Series) so sklearn gets 2-D input.
x1_train = X_train[["Age"]]
4. Create a data frame named x1_test containing only the Age feature from the test set.
# Matching single-column test DataFrame for the Age-only model.
x1_test = X_test[["Age"]]
5. Train the model using the training data.
Note: You should be using x1_train as your training data.
# Fit the one-feature model: Rate ~ Age.
simple_model.fit(x1_train, y_train)
6. Draw the regression line on top of a scatterplot of Rate (y-axis) vs Age (x-axis).
# Grey scatter of all the data with the fitted regression line drawn in
# blue over the test-set Age values.
plt.scatter(policies.Age, policies.Rate, color="grey")
plt.plot(x1_test, simple_model.predict(x1_test),
         color="blue", linewidth=3)
plt.xlabel("Age")
plt.ylabel("Rate")
plt.show()
7. Inspect the slope (m) and y-intercept (b) parameter estimates.
# Fitted parameters of Rate ≈ m * Age + b.
b, m = simple_model.intercept_, simple_model.coef_[0]
print("y-intercept (b): ", b)
print("Slope (m): ", m)
8. Question: How do you interpret these two values?
9. Predict the labels of the test set using the model.
# Predicted Rate for each test-set row, from the Age-only model.
simple_predictions = simple_model.predict(x1_test)
10. Visualize the prediction error.
# Visualize the prediction error on the test set:
#   grey    = training points
#   blue x  = model predictions
#   green   = actual test values
#   red     = vertical error segments (prediction -> truth)
# Plot the training set (grey dots)
plt.scatter(
    x = x1_train.Age,
    y = y_train,
    color = "grey",
    facecolor = "none")
# Plot the predictions (blue x marks)
plt.scatter(
    x = x1_test.Age,
    y = simple_predictions,
    color = "blue",
    marker = 'x')
# Plot the correct answer (green dots)
plt.scatter(
    x = x1_test.Age,
    y = y_test,
    color = "green")
# Plot the error (red lines); zorder=0 draws them beneath the points
plt.plot(
    [x1_test.Age, x1_test.Age],
    [simple_predictions, y_test],
    color = "red",
    zorder = 0)
# Finish the plot
plt.xlabel("Age")
# Bug fix: label was "Risk"; this data set's variable is the numeric Rate,
# matching every other plot in the file.
plt.ylabel("Rate")
plt.show()
11. How do you interpret this graph?
12. Compute the root mean squared error (RMSE) of these predictions.
# Root mean squared error of the Age-only model on the test set.
simple_residuals = y_test - simple_predictions
simple_rmse = np.sqrt(np.mean(simple_residuals ** 2))
print(simple_rmse)
13. Question: Was simple linear regression a good choice for modeling this relationship? Why or why not?
1. Create a linear regression model.
# Ordinary least squares model using ALL features this time.
multiple_model = LinearRegression()
2. Train the model using all features of the training data.
# Fit the multiple regression on the full feature matrix.
multiple_model.fit(X_train, y_train)
3. Inspect the parameter estimates.
# Print the intercept and each coefficient: name left-aligned in 12
# characters, value with 3 decimal places.
param_names = ["y-intercept"] + list(X_train.columns)
param_values = [multiple_model.intercept_] + list(multiple_model.coef_)
for param_name, param_value in zip(param_names, param_values):
    print("{:<12}: {: .3f}".format(param_name, param_value))
4. Question: How do you interpret these values?
5. Predict output values for the input values in the test set.
# Predicted Rate for each test-set row, from the all-features model.
multiple_predictions = multiple_model.predict(X_test)
6. Visualize the prediction error.
# Prediction-error plot for the multiple regression:
# black = training points, blue x = predictions, green = actual test
# values, red = error segments (drawn beneath the points via zorder=0).
plt.scatter(X_train.Age, y_train, color="black", facecolor="none")
plt.scatter(X_test.Age, multiple_predictions, color="blue", marker='x')
plt.scatter(X_test.Age, y_test, color="green")
plt.plot([X_test.Age, X_test.Age],
         [multiple_predictions, y_test],
         color="red", zorder=0)
plt.xlabel("Age")
plt.ylabel("Rate")
plt.show()
7. Question: How do you interpret this graph?
8. Compute the root mean squared error (RMSE) of these predictions.
# Root mean squared error of the multiple regression on the test set.
multiple_residuals = y_test - multiple_predictions
multiple_rmse = np.sqrt(np.mean(multiple_residuals ** 2))
print(multiple_rmse)
9. Question: Is this a better predictive model of the data?
1. Import the standard scaler from sklearn.
from sklearn.preprocessing import StandardScaler
2. Create standard scalers for training and test data.
# Separate z-score standardizers for the features and for the labels
# (the label scaler is needed later to un-scale the predictions).
X_scaler = StandardScaler()
y_scaler = StandardScaler()
3. Fit the scaler to all training data.
# Fit the scalers on the TRAINING data only, as the instruction says.
# Bug fix: the original fitted on the full X and y, which leaks test-set
# statistics (mean/std) into the preprocessing step.
# StandardScaler expects 2-D input, hence the reshape of the label series.
X_scaler.fit(X_train)
y_scaler.fit(y_train.values.reshape(-1, 1))
4. Scale the training and test data.
# Apply the fitted scalers to both splits. StandardScaler works on 2-D
# arrays, so the 1-D label series are reshaped to single-column matrices.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train.values.reshape(-1, 1))
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1))
5. Import the neural network regressor class from sklearn.
from sklearn.neural_network import MLPRegressor
6. Create a neural network regressor with 4 hidden nodes, a tanh activation function, an LBFGS solver, and 1000 maximum iterations.
# One hidden layer of 4 tanh units; L-BFGS converges well on small data.
neural_model = MLPRegressor(hidden_layer_sizes=(4,),
                            activation="tanh",
                            solver="lbfgs",
                            max_iter=1000)
7. Train the model with the training set.
# Train on the scaled features; ravel() flattens the (n, 1) scaled label
# column back to the 1-D vector sklearn expects for y.
neural_model.fit(X_train_scaled, y_train_scaled.ravel())
8. Predict output values for the test set.
# Predictions are on the standardized Rate scale; un-scaled below.
scaled_predictions = neural_model.predict(X_test_scaled)
9. Unscale the predictions.
# Map the predictions back to the original Rate scale.
# Bug fix: inverse_transform requires a 2-D array on scikit-learn >= 1.0,
# so reshape the 1-D prediction vector to a column and flatten the result
# back to 1-D afterwards (same shape the rest of the code expects).
neural_predictions = y_scaler.inverse_transform(
    scaled_predictions.reshape(-1, 1)).ravel()
10. Visualize the prediction error.
# Prediction-error plot for the neural network:
# black = training points, blue x = predictions, green = actual test
# values, red = error segments (drawn beneath the points via zorder=0).
plt.scatter(X_train.Age, y_train, color="black", facecolor="none")
plt.scatter(X_test.Age, neural_predictions, color="blue", marker='x')
plt.scatter(X_test.Age, y_test, color="green")
plt.plot([X_test.Age, X_test.Age],
         [neural_predictions, y_test],
         color="red", zorder=0)
plt.xlabel("Age")
plt.ylabel("Rate")
plt.show()
11. Compute the root mean squared error (RMSE) of these predictions.
# Root mean squared error of the neural network on the test set.
neural_rmse = np.sqrt(np.mean((y_test - neural_predictions) ** 2))
12. Inspect the RMSE of these predictions.
print(neural_rmse)  # lower is better; compare against the linear models
1. Compare all three results.
# Side-by-side RMSE comparison of the three models (lower is better).
print("Simple RMSE: ", simple_rmse)
print("Multiple RMSE: ", multiple_rmse)
print("Neural RMSE: ", neural_rmse)
2. Question: Which of these models would you choose? Why?