# Gist by @Anjum48, created February 5, 2020.
# A first pass of a Nested CV wrapper for category encoders.
from category_encoders import utils
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
import category_encoders as encoders
import pandas as pd
import numpy as np
import copy
class NestedCVWrapper(BaseEstimator, TransformerMixin):
    """
    Extend supervised encoders to perform nested cross validation to prevent target leakage.

    For a validation or a test set, supervised encoders can be used as follows::

        encoder.fit(X_train, y_train)
        X_valid_encoded = encoder.transform(X_valid)

    However, encoding the train data in the method above will introduce bias into the data.
    Using out-of-fold encodings is an effective way to prevent target leakage. This is
    equivalent to::

        X_train_encoded = np.zeros(X.shape)
        for trn, val in kfold.split(X, y):
            encoder.fit(X[trn], y[trn])
            X_train_encoded[val] = encoder.transform(X[val])

    This can be used in place of the "inner folds" as discussed here:
    https://sebastianraschka.com/faq/docs/evaluate-a-model.html

    See README.md for a list of supervised encoders.

    Parameters
    ----------
    feature_encoder: Object
        an instance of a supervised encoder.
    cv: int or sklearn cv Object
        If an int is given, StratifiedKFold is used by default, where the int is the
        number of folds.
    shuffle: boolean, optional
        Whether to shuffle each class's samples before splitting into batches. Ignored
        if a CV method is provided.
    random_state: int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator. Ignored
        if a CV method is provided or if shuffle is False.

    Example
    -------
    >>> from category_encoders import *
    >>> from sklearn.datasets import load_boston
    >>> from sklearn.model_selection import GroupKFold
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> y = (y/10).round().astype(int)  # we create 6 artificial classes
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = TargetEncoder(cols=['CHAS', 'RAD'])
    >>> enc_nested = NestedCVWrapper(enc, random_state=42)
    >>> encoded = enc_nested.fit_transform(X, y)
    >>> print(encoded.info())
    """

    def __init__(self, feature_encoder, cv=5, shuffle=False, random_state=None):
        self.feature_encoder = feature_encoder
        self.feature_encoders = {}
        self.label_encoder = None
        if isinstance(cv, int):
            # random_state is only meaningful when shuffle=True; recent sklearn
            # versions raise if it is set while shuffle=False, so suppress it there.
            self.cv = StratifiedKFold(
                n_splits=cv,
                shuffle=shuffle,
                random_state=random_state if shuffle else None,
            )
        else:
            self.cv = cv

    def fit_transform(self, X, y=None, groups=None, **fit_params):
        """
        Creates unbiased (out-of-fold) encodings from a supervised encoder.

        :param X: Features to be encoded
        :param y: Target variable
        :param groups: Groups to be passed to the cv method, e.g. for GroupKFold
        :param fit_params: Unused; accepted for sklearn API compatibility
        :return: pd.DataFrame of out-of-fold encodings, row-aligned with X
        """
        X = utils.convert_input(X)
        y = utils.convert_input(y)
        encoded_parts = []
        for trn_idx, val_idx in self.cv.split(X, y, groups):
            # Deep-copy the encoder each fold so no fold's fit state leaks into
            # another fold's transform.
            fold_encoder = copy.deepcopy(self.feature_encoder)
            fold_encoder.fit(X.iloc[trn_idx], y.iloc[trn_idx])
            encoded_parts.append(fold_encoder.transform(X.iloc[val_idx]))
        # Concatenate the per-fold frames and restore the original row order.
        # This preserves column dtypes, unlike the previous np.zeros buffer,
        # which forced everything to float and would fail on non-numeric
        # passthrough columns.
        return pd.concat(encoded_parts).reindex(X.index)
if __name__ == '__main__':
    from sklearn.datasets import load_iris

    # Quick smoke test: out-of-fold target-encode two iris feature columns.
    bunch = load_iris()
    target = bunch.target
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    print(features.head())

    base_encoder = encoders.target_encoder.TargetEncoder(
        cols=['sepal length (cm)', 'sepal width (cm)'])
    nested_encoder = NestedCVWrapper(base_encoder, random_state=48)
    print(nested_encoder.fit_transform(features, target).head())