Pipelines
General Info
Useful Snippets

1.0.0 Full Pipeline example

Example:

# Build a small example training DataFrame to feed through the pipeline:
# three numeric columns and one categorical column.
import pandas as pd

DF = pd.DataFrame({
    'one':   [1,2,3,6,45,4,4,4,5,3,2,2,3,2,4,4,2,3,4,3,3,2,3,4,3,2,6,6,7,6,5,4,4,3,2,4,8,8,6,5,5],
    'two':   [2,3,6,45,4,4,4,5,3,2,2,3,2,4,4,2,3,4,3,3,2,3,4,3,2,6,6,7,6,5,4,4,3,2,4,8,8,6,5,5,4],
    'three': [3,6,45,4,4,4,5,3,2,2,3,2,4,4,2,3,4,3,3,2,3,4,3,2,6,6,7,6,5,4,4,3,2,4,8,8,6,5,5,2,3],
    'four':  ['A','A','B','A','B','C','A','C','A','C','A','B','C','B','A','B','C','A','B','A','B','C','B','C','B','C','B','A','B','B','B','A','B','C','B','A','C','C','B','C','A'],
})

##############################
# Define and instantiate a full pipeline

from sklearn.pipeline import Pipeline

# import the transformers that you'll use in your example:
from sklearn.impute        import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# used to process columns separately in mini-pipelines that get combined into a single full pipeline
from sklearn.compose       import ColumnTransformer


###############
# Numeric mini-pipeline.
num_columns = ['one', 'two', 'three']  # numeric columns routed through this mini-pipeline

# Process the numeric columns in two steps: fill missing values with the
# column median, then standardize each column.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ]
)

#################
# Categorical mini-pipeline.
cat_columns = ['four']  # categorical columns routed through this mini-pipeline

# One-hot encode the categorical columns.
# handle_unknown="ignore" makes transform() robust to categories that were
# not present during fit (an unseen category encodes as all zeros) instead
# of raising an error — important because this fitted pipeline is later
# applied to new data.
cat_pipeline = Pipeline(steps=[
    ('onehotencoder', OneHotEncoder(handle_unknown="ignore")),
])

###########
# Full pipeline: combine the mini-pipelines into one ColumnTransformer.
# Each entry is ("mini-pipe name", mini-pipe, list_of_columns); the listed
# columns are routed through the corresponding mini-pipeline.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_columns),
        ("cat", cat_pipeline, cat_columns),
    ]
)

################
# Fit the full pipeline on the training DataFrame.
full_pipeline.fit(DF)

################
# Column names generated by the fitted transformers, used later to label
# the transformed output.
transformed_feature_names = full_pipeline.get_feature_names_out()
print(transformed_feature_names)

################
# Build a fresh 5-row DataFrame with the same schema as the training data,
# used to exercise the fitted pipeline's transform().
DF_new = pd.DataFrame({
    'one':   [5, 4, 3, 2, 1],
    'two':   [1, 2, 3, 4, 5],
    'three': [1, 1, 3, 3, 5],
    'four':  ['A', 'B', 'C', 'A', 'B'],
})
print(DF_new)

################
# Apply the pipeline fitted on the training data to the new data.
DF_transformed_new_data = full_pipeline.transform(DF_new)

################
# Wrap the transformed array in a DataFrame, labelling its columns with
# the generated feature names.
DF_new_processed = pd.DataFrame(
    DF_transformed_new_data,
    columns=transformed_feature_names,
)