Preprocessing
General Info
Useful Snippets

1.0.1 Replace nulls with the most frequent value

Example:

# Imports for the example below, which builds DataFrames with nulls (np.nan) and imputes them
import pandas as pd
import numpy  as np

from sklearn.impute import SimpleImputer

# Imputer that fills missing entries using the "most_frequent" strategy,
# i.e. each column's most common value as seen during fit.
impute_most_frequent = SimpleImputer(strategy="most_frequent")


# ---------------------------------------------------------------- Example
# Training frame containing nulls: in column 'a' the value 'A' occurs most
# often, in column 'b' the value 'B' does.
DF_train = pd.DataFrame(
    {
        'a': [np.nan, 'A', np.nan, 'A', np.nan, 'B'],
        'b': [np.nan, 'B', np.nan, 'B', np.nan, 'A'],
    }
)

# Learn the per-column most frequent values from the training frame only.
impute_most_frequent.fit(DF_train)

# A new frame, also containing nulls: each column has one real value ('Z')
# followed by four nulls.
DF_New = pd.DataFrame(
    {column: ['Z'] + [np.nan] * 4 for column in ('a', 'b')}
)

# Fill the nulls in the new frame with the values learned from the training
# frame: 'A' for column 'a' and 'B' for column 'b'. The existing 'Z' entries
# are not null, so they are left as they are.
impute_most_frequent.transform(DF_New)

# Expected result:
# array([['Z', 'Z'],
#        ['A', 'B'],
#        ['A', 'B'],
#        ['A', 'B'],
#        ['A', 'B']], dtype=object)