6.0 Full ML Example
Example:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Aug 3 23:07:32 2021
@author: jonahbrown-joel
"""
### Baseline MULTI-Classification Models ###
# 6.1 Create a 'pre_processing' pipeline that combines multiple mini-pipelines for
# transforming groups of columns, such as categorical and numeric columns, separately
import pandas as pd
from sklearn.pipeline import Pipeline
# Use this to combine multiple pipelines
from sklearn.compose import ColumnTransformer
# data preprocessing functions
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
###############################################################################
# Import the data
# Import the Training data
# train_path = "/Users/jonahbrown-joel/Library/Mobile Documents/com~apple~CloudDocs/Data Science Cookbook/Data Science Project Template/project_data/training_data.csv"
# DF_train = pd.read_csv(train_path)
# Import the validation data and use it to train your model since you're using Cross Validation
train_path = "/Users/jonahbrown-joel/Library/Mobile Documents/com~apple~CloudDocs/Data Science Cookbook/Data Science Project Template/project_data/validation_data.csv"
DF_train = pd.read_csv(train_path)
# Specify the target variable
target_variable = 'pclass'
# create X and y datasets
DF_target = DF_train[target_variable]
DF_train = DF_train.loc[:, DF_train.columns != target_variable] # drop the target variable from your training data
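# Optional sanity check (an illustrative addition): inspect the class balance of
# the target; a heavily imbalanced target would make the 'accuracy' scoring used
# below misleading.
# print(DF_target.value_counts(normalize=True))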
###############################################################################
# Create a pipeline to process the numeric data
# Specify what variables to use in this mini-pipe
list_numeric_vars = [ 'age', 'sibsp', 'fare']
numeric_pipeline = Pipeline([
("my_imputer", SimpleImputer(strategy="mean")),
("my_std_scaler", StandardScaler()),
])
###############################################################################
# Create a pipeline to process the categorical data
# Specify what variables to use in this mini-pipe
list_categorical_vars = [ #'name' ,
'sex'
, 'embarked', 'ticket', 'cabin', 'home.dest', 'survived'
]
categorical_pipeline = Pipeline([
("impute_missing", SimpleImputer(strategy = 'constant')), # missing values will be filled with "missing_value"
("add_dummies", OneHotEncoder(handle_unknown='ignore')), # this ignores new values not seen in training
])
###############################################################################
# Create a preprocessing pipeline that combines the two mini-pipes
# Combine the numeric_pipeline and categorical_pipeline with
preprocessing_pipeline = ColumnTransformer([
# The Numeric Pipeline
("numeric_pipeline" # the name of this mini-pipe
, numeric_pipeline # the pipeline from above
, list_numeric_vars), # the list of columns this pipeline applies to
# The Categorical Pipeline
("categorical_pipeline"
, categorical_pipeline
, list_categorical_vars),
])
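# Optional sanity check (illustrative): fitting the preprocessor on its own shows
# the transformed output shape before it is wired into the model pipelines below.
# GridSearchCV clones the pipeline before fitting, so this has no side effects.
# X_transformed = preprocessing_pipeline.fit_transform(DF_train)
# print(X_transformed.shape)
# print(preprocessing_pipeline.get_feature_names_out()) # requires scikit-learn >= 1.0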
###############################################################################
# 6.2 Use the preprocessing_pipeline as the first step in a full pipeline that generates predictions using a model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
# Create Pipelines for 4 models:
# 1. Linear Model
# 2. Random Forest
# 3. Support Vector Machine
# 4. Neural Network
###############################################################################
ML_Pipe_Base_Regression = Pipeline(steps=[
('preprocessor', preprocessing_pipeline), # no "()" for the preprocessor
('logistic_regression', LogisticRegression() ) # the model does have a "()"
]
)
ML_Pipe_Base_RandomForest = Pipeline(steps=[
('preprocessor', preprocessing_pipeline), # no "()" for the preprocessor
('random_forest_classifier', RandomForestClassifier() ) # the model does have a "()"
]
)
ML_Pipe_Base_SVM = Pipeline(steps=[
('preprocessor', preprocessing_pipeline), # no "()" for the preprocessor
('svc', SVC() ) # the model does have a "()"
]
)
ML_Pipe_Base_NeuralNet = Pipeline(steps=[
('preprocessor', preprocessing_pipeline), # no "()" for the preprocessor
('neural_net_mlp_classifier', MLPClassifier() ) # the model does have a "()"
]
)
###############################################################################
# Fit the 4 models using grid search cross-validation
from sklearn.model_selection import GridSearchCV
param_grid = {
# 'preprocessor__numeric_pipeline__my_imputer__strategy' : ['mean', 'median'], # see what imputation strategy works best
# 'preprocessor__categorical_pipeline__impute_missing__add_indicator' : [True, False] # see if it helps to add a missing indicator
}
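# Note (added guidance): param_grid keys follow scikit-learn's
# <step_name>__<nested_step>__<parameter> naming, matching the names given to the
# pipeline steps above. Because this single grid is reused for all four searches
# below, only keys under the shared 'preprocessor' step are safe here; a
# model-specific grid would look like this illustrative sketch:
# param_grid_rf = {
#     'random_forest_classifier__n_estimators': [100, 300],
#     'random_forest_classifier__max_depth': [None, 10],
# }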
# TODO - this supports one scoring metric currently, update to use multiple metrics
scoring_method = 'accuracy'
GridSearchCV_ML_Pipe_Base_Regression = GridSearchCV(ML_Pipe_Base_Regression
, param_grid=param_grid
, cv=5
, scoring = scoring_method )
GridSearchCV_ML_Pipe_Base_RandomForest = GridSearchCV(ML_Pipe_Base_RandomForest
, param_grid=param_grid
, cv=5
, scoring = scoring_method )
GridSearchCV_ML_Pipe_Base_SVM = GridSearchCV(ML_Pipe_Base_SVM
, param_grid=param_grid
, cv=5
, scoring = scoring_method )
GridSearchCV_ML_Pipe_Base_NeuralNet = GridSearchCV(ML_Pipe_Base_NeuralNet
, param_grid=param_grid
, cv=5
, scoring = scoring_method )
###############################################################################
# Fit your models to the data
GridSearchCV_ML_Pipe_Base_Regression.fit(X = DF_train, y = DF_target)
GridSearchCV_ML_Pipe_Base_RandomForest.fit(X = DF_train, y = DF_target)
GridSearchCV_ML_Pipe_Base_SVM.fit(X = DF_train, y = DF_target)
GridSearchCV_ML_Pipe_Base_NeuralNet.fit(X = DF_train, y = DF_target)
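# Optional (illustrative): each fitted search exposes its best CV score and the
# winning parameters, handy to eyeball before building the report below.
# print(GridSearchCV_ML_Pipe_Base_Regression.best_score_)
# print(GridSearchCV_ML_Pipe_Base_Regression.best_params_)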
###############################################################################
# Collect the cross-validation results from each grid search into one DF
model_name = 'GridSearchCV_ML_Pipe_Base_Regression'
df_grid = pd.DataFrame(GridSearchCV_ML_Pipe_Base_Regression.cv_results_).sort_values('rank_test_score')
df_grid['model_name'] = model_name
df_grid['mean_test_score_abs'] = df_grid['mean_test_score'].abs() # abs() so this template also works for negative error metrics
df_grid['score_plus_2_std'] = df_grid['mean_test_score_abs'] + (2* df_grid['std_test_score'] )
df_grid['score_minus_2_std'] = df_grid['mean_test_score_abs'] - (2* df_grid['std_test_score'] )
DF_Model_Scores = df_grid
model_name = 'GridSearchCV_ML_Pipe_Base_RandomForest'
df_grid = pd.DataFrame(GridSearchCV_ML_Pipe_Base_RandomForest.cv_results_).sort_values('rank_test_score')
df_grid['model_name'] = model_name
df_grid['mean_test_score_abs'] = df_grid['mean_test_score'].abs()
df_grid['score_plus_2_std'] = df_grid['mean_test_score_abs'] + (2* df_grid['std_test_score'] )
df_grid['score_minus_2_std'] = df_grid['mean_test_score_abs'] - (2* df_grid['std_test_score'] )
DF_Model_Scores = pd.concat([DF_Model_Scores, df_grid]) # DataFrame.append was removed in pandas 2.0; use pd.concat
model_name = 'GridSearchCV_ML_Pipe_Base_SVM'
df_grid = pd.DataFrame(GridSearchCV_ML_Pipe_Base_SVM.cv_results_).sort_values('rank_test_score')
df_grid['model_name'] = model_name
df_grid['mean_test_score_abs'] = df_grid['mean_test_score'].abs()
df_grid['score_plus_2_std'] = df_grid['mean_test_score_abs'] + (2* df_grid['std_test_score'] )
df_grid['score_minus_2_std'] = df_grid['mean_test_score_abs'] - (2* df_grid['std_test_score'] )
DF_Model_Scores = pd.concat([DF_Model_Scores, df_grid])
model_name = 'GridSearchCV_ML_Pipe_Base_NeuralNet'
df_grid = pd.DataFrame(GridSearchCV_ML_Pipe_Base_NeuralNet.cv_results_).sort_values('rank_test_score')
df_grid['model_name'] = model_name
df_grid['mean_test_score_abs'] = df_grid['mean_test_score'].abs()
df_grid['score_plus_2_std'] = df_grid['mean_test_score_abs'] + (2* df_grid['std_test_score'] )
df_grid['score_minus_2_std'] = df_grid['mean_test_score_abs'] - (2* df_grid['std_test_score'] )
DF_Model_Scores = pd.concat([DF_Model_Scores, df_grid])
# Sort the final df so the best (highest) scores come first
DF_Model_Scores = DF_Model_Scores.sort_values('mean_test_score_abs', ascending=False)
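# Optional (illustrative): a one-line per-model summary of the best mean CV score
# print(DF_Model_Scores.groupby('model_name')['mean_test_score'].max())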
# Save a report of the accuracy of your models
# TODO consider
path_reports = "/Users/jonahbrown-joel/Library/Mobile Documents/com~apple~CloudDocs/Data Science Cookbook/Data Science Project Template/reports/"
file_name = "train_validation_MULTI_classification_scores.csv"
DF_Model_Scores.to_csv(path_reports + file_name)
###############################################################################
# Save the best models from your grid search as your final 'Best' model
Best_ML_Pipe_Base_Regression = GridSearchCV_ML_Pipe_Base_Regression.best_estimator_
Best_ML_Pipe_Base_RandomForest = GridSearchCV_ML_Pipe_Base_RandomForest.best_estimator_
Best_ML_Pipe_Base_SVM = GridSearchCV_ML_Pipe_Base_SVM.best_estimator_
Best_ML_Pipe_Base_NeuralNet = GridSearchCV_ML_Pipe_Base_NeuralNet.best_estimator_
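# Note (added): GridSearchCV refits the winning parameter combination on the full
# training data by default (refit=True), so each best_estimator_ is a fully
# fitted pipeline, ready to predict or pickle as-is.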
###############################################################################
# Pickle your best models
import pickle
# path to where you'll pickle your 'Candidate_Models'
path_candidate_models = '/Users/jonahbrown-joel/Library/Mobile Documents/com~apple~CloudDocs/Data Science Cookbook/Data Science Project Template/candidate_models/candidate_multi_classification_models/'
filename = 'Best_ML_Pipe_Base_Regression.sav'
with open(path_candidate_models + filename, 'wb') as f:
    pickle.dump(Best_ML_Pipe_Base_Regression, f)
filename = 'Best_ML_Pipe_Base_RandomForest.sav'
with open(path_candidate_models + filename, 'wb') as f:
    pickle.dump(Best_ML_Pipe_Base_RandomForest, f)
filename = 'Best_ML_Pipe_Base_SVM.sav'
with open(path_candidate_models + filename, 'wb') as f:
    pickle.dump(Best_ML_Pipe_Base_SVM, f)
filename = 'Best_ML_Pipe_Base_NeuralNet.sav'
with open(path_candidate_models + filename, 'wb') as f:
    pickle.dump(Best_ML_Pipe_Base_NeuralNet, f)
###############################################################################
# Reference:
# load your pickled models
#loaded_model = pickle.load(open(filename, 'rb'))
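# A usage sketch (illustrative, reusing the path variable defined above):
# with open(path_candidate_models + 'Best_ML_Pipe_Base_RandomForest.sav', 'rb') as f:
#     loaded_model = pickle.load(f)
# predictions = loaded_model.predict(DF_train) # the pipeline reapplies all preprocessing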