#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Created on Tue Aug  3 23:07:32 2021

@author: jonahbrown-joel
"""
### Baseline MULTI-Classification Models ###

# 6.1 Create a 'pre_processing' pipeline that combines multiple mini-pipelines for
# transforming groups of columns, such as categorical and numeric columns, separately
import pandas as pd

from sklearn.pipeline      import Pipeline

# Use this to combine multiple pipelines
from sklearn.compose       import ColumnTransformer

# data processing functions
from sklearn.impute        import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Load the modelling data.
# The models below are scored with cross validation, so the validation split is
# what gets loaded and trained on here. To use the training split instead, swap
# in the two commented lines:
# train_path = "/Users/jonahbrown-joel/Library/Mobile Documents/com~apple~CloudDocs/Data Science Cookbook/Data Science Project Template/project_data/training_data.csv"
# DF_train = pd.read_csv(train_path)
train_path = "/Users/jonahbrown-joel/Library/Mobile Documents/com~apple~CloudDocs/Data Science Cookbook/Data Science Project Template/project_data/validation_data.csv"
DF_train = pd.read_csv(train_path)

# Name of the column the models will predict
target_variable = 'pclass'

# Split into y (the target column) and X (every other column)
DF_target = DF_train[target_variable]
DF_train = DF_train.drop(columns=target_variable)  # X must not contain the target

# Numeric mini-pipeline: fill missing values with the column mean, then
# standardise each column.

# Columns handled by the numeric mini-pipeline
list_numeric_vars = ['age', 'sibsp', 'fare']

numeric_pipeline = Pipeline([
    ("my_imputer", SimpleImputer(strategy="mean")),
    ("my_std_scaler", StandardScaler()),
])

# Categorical mini-pipeline: fill missing values with a constant, then one-hot
# encode.

# Columns handled by the categorical mini-pipeline.
# 'embarked' was listed twice in the original; the duplicate is removed so the
# column is not one-hot encoded twice.
# NOTE(review): the original list began with a stray comma right after the
# commented-out 'name' entry, which suggests a first column name went missing
# (possibly 'sex') -- confirm against the dataset before relying on this list.
list_categorical_vars = [
    # 'name',
    'embarked',
    'ticket',
    'cabin',
    'home.dest',
    'survived',
]

categorical_pipeline = Pipeline([
    # fill_value is left at its default, so string/object columns are filled
    # with the literal "missing_value"
    ("impute_missing", SimpleImputer(strategy='constant')),
    # categories not seen while fitting are ignored instead of raising
    ("add_dummies", OneHotEncoder(handle_unknown='ignore')),
])

# Preprocessing pipeline that applies each mini-pipeline to its own group of
# columns and concatenates the results.
preprocessing_pipeline = ColumnTransformer([

    # The Numeric Pipeline
    (
        "numeric_pipeline",    # the name of this mini-pipe
        numeric_pipeline,      # the pipeline from above
        list_numeric_vars,     # the columns this pipeline is applied to
    ),

    # The Categorical Pipeline (the name matches the
    # 'preprocessor__categorical_pipeline__...' grid-search key used later)
    (
        "categorical_pipeline",
        categorical_pipeline,
        list_categorical_vars,
    ),
])

# 6.2 Use the preprocessing_pipeline as the first step in a full pipeline that generates predictions using a model

from sklearn.linear_model   import LogisticRegression
from sklearn.ensemble       import RandomForestClassifier
from sklearn.svm            import SVC
from sklearn.neural_network import MLPClassifier

# Create Pipelines for 4 models:
# 1. Linear Model
# 2. Random Forest
# 3. Support Vector Machine
# 4. Neural Network

# Each full pipeline runs the shared preprocessing step and then one
# classifier. The preprocessor is passed as the already-built object (no "()"),
# while each model is instantiated here with its default settings (hence "()").

ML_Pipe_Base_Regression = Pipeline(steps=[
    ('preprocessor', preprocessing_pipeline),
    ('logistic_regression', LogisticRegression()),
])

ML_Pipe_Base_RandomForest = Pipeline(steps=[
    ('preprocessor', preprocessing_pipeline),
    ('random_forest_classifier', RandomForestClassifier()),
])

ML_Pipe_Base_SVM = Pipeline(steps=[
    ('preprocessor', preprocessing_pipeline),
    ('svc', SVC()),
])

ML_Pipe_Base_NeuralNet = Pipeline(steps=[
    ('preprocessor', preprocessing_pipeline),
    ('neural_net_mlp_classifier', MLPClassifier()),
])

# Wrap the 4 model pipelines in grid searches with cross validation
# (the searches are only configured here; fitting happens further down)
from sklearn.model_selection import GridSearchCV

# Hyper-parameters to search over. The grid is currently empty, so each search
# evaluates only the pipeline's default settings. Uncomment entries to widen it.
param_grid = {
    # 'preprocessor__numeric_pipeline__my_imputer__strategy': ['mean', 'median'],  # see what imputation strategy works best
    # 'preprocessor__categorical_pipeline__impute_missing__add_indicator': [True, False],  # see if it helps to add a missing indicator
}

# TODO - this supports one scoring metric currently, update to use multiple metrics
scoring_method = 'accuracy'


def _make_grid_search(pipeline):
    """Return a 5-fold GridSearchCV over *pipeline* using the shared grid and metric."""
    return GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=5,
        scoring=scoring_method,
    )


GridSearchCV_ML_Pipe_Base_Regression = _make_grid_search(ML_Pipe_Base_Regression)

GridSearchCV_ML_Pipe_Base_RandomForest = _make_grid_search(ML_Pipe_Base_RandomForest)

GridSearchCV_ML_Pipe_Base_SVM = _make_grid_search(ML_Pipe_Base_SVM)

GridSearchCV_ML_Pipe_Base_NeuralNet = _make_grid_search(ML_Pipe_Base_NeuralNet)

# Fit your models to the data, then collect their cross-validation scores.

# Report label -> grid search, in the order the score tables are stacked
grid_searches = {
    'GridSearchCV_ML_Pipe_Base_Regression': GridSearchCV_ML_Pipe_Base_Regression,
    'GridSearchCV_ML_Pipe_Base_RandomForest': GridSearchCV_ML_Pipe_Base_RandomForest,
    'GridSearchCV_ML_Pipe_Base_SVM': GridSearchCV_ML_Pipe_Base_SVM,
    'GridSearchCV_ML_Pipe_Base_NeuralNet': GridSearchCV_ML_Pipe_Base_NeuralNet,
}

# cv_results_ only exists after fit() has been called, so fit every search first
for grid_search in grid_searches.values():
    grid_search.fit(X=DF_train, y=DF_target)


def _cv_score_table(model_name, grid_search):
    """Return the cv_results_ of a fitted grid search as a labelled DataFrame.

    Rows are ordered by 'rank_test_score'. Added columns: 'model_name', the
    absolute mean test score, and that score plus/minus two standard
    deviations of the test score.
    """
    df_grid = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
    df_grid['model_name'] = model_name
    df_grid['mean_test_score_abs'] = df_grid['mean_test_score'].abs()
    two_std = 2 * df_grid['std_test_score']
    df_grid['score_plus_2_std'] = df_grid['mean_test_score_abs'] + two_std
    df_grid['score_minus_2_std'] = df_grid['mean_test_score_abs'] - two_std
    return df_grid


score_tables = [
    _cv_score_table(model_name, grid_search)
    for model_name, grid_search in grid_searches.items()
]

# Zero-row frame with the score-table columns; no longer needed to build the
# result, but kept because the original script defined this name.
DF_Empty = score_tables[0].head(0)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the tables and,
# like append, keeps each table's original row index.
DF_Model_Scores = pd.concat(score_tables)

# Sort the final df by score.
# NOTE(review): this sort is ascending, so with scoring_method = 'accuracy' the
# lowest-scoring rows come first. Pass ascending=False if best-first is wanted.
DF_Model_Scores = DF_Model_Scores.sort_values('mean_test_score_abs')

# Write the model score table out as a CSV report
# TODO consider (the original note was left unfinished)
path_reports = "/Users/jonahbrown-joel/Library/Mobile Documents/com~apple~CloudDocs/Data Science Cookbook/Data Science Project Template/reports/"
file_name = "train_validation_MULTI_classification_scores.csv"
# path_reports already ends with "/", so the two parts are joined directly
DF_Model_Scores.to_csv(f"{path_reports}{file_name}")

# Keep the best estimator found by each grid search as that model's final
# 'Best' pipeline
(
    Best_ML_Pipe_Base_Regression,
    Best_ML_Pipe_Base_RandomForest,
    Best_ML_Pipe_Base_SVM,
    Best_ML_Pipe_Base_NeuralNet,
) = (
    GridSearchCV_ML_Pipe_Base_Regression.best_estimator_,
    GridSearchCV_ML_Pipe_Base_RandomForest.best_estimator_,
    GridSearchCV_ML_Pipe_Base_SVM.best_estimator_,
    GridSearchCV_ML_Pipe_Base_NeuralNet.best_estimator_,
)

# Pickle your best models
import pickle

# path to where you'll pickle your 'Candidate_Models'
path_candidate_models = '/Users/jonahbrown-joel/Library/Mobile Documents/com~apple~CloudDocs/Data Science Cookbook/Data Science Project Template/candidate_models/candidate_multi_classification_models/'

# Output file name -> fitted pipeline to store in it
best_models_by_filename = {
    'Best_ML_Pipe_Base_Regression.sav': Best_ML_Pipe_Base_Regression,
    'Best_ML_Pipe_Base_RandomForest.sav': Best_ML_Pipe_Base_RandomForest,
    'Best_ML_Pipe_Base_SVM.sav': Best_ML_Pipe_Base_SVM,
    'Best_ML_Pipe_Base_NeuralNet.sav': Best_ML_Pipe_Base_NeuralNet,
}

for filename, best_model in best_models_by_filename.items():
    # 'with' closes the file once the dump finishes, so the pickle is fully
    # flushed to disk and no file handle is left open
    with open(path_candidate_models + filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)

# Reference:

# load your pickled models 
#loaded_model = pickle.load(open(filename, 'rb'))