A first pass of a Nested CV wrapper for category encoders
from category_encoders import utils
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
import category_encoders as encoders
import pandas as pd
import numpy as np
import copy

class NestedCVWrapper(BaseEstimator, TransformerMixin):
    """
    Extends supervised encoders with nested cross-validation to prevent target leakage.

    For a validation or a test set, supervised encoders can be used as follows::

        encoder.fit(X_train, y_train)
        X_valid_encoded = encoder.transform(X_valid)

    However, encoding the training data with the method above introduces bias into the data.
    Using out-of-fold encodings is an effective way to prevent target leakage. This is equivalent to::

        X_train_encoded = np.zeros(X.shape)
        for trn, val in kfold.split(X, y):
            encoder.fit(X[trn], y[trn])
            X_train_encoded[val] = encoder.transform(X[val])

    This can be used in place of the "inner folds" as discussed here:
    https://sebastianraschka.com/faq/docs/evaluate-a-model.html

    See README.md for a list of supervised encoders.

    Parameters
    ----------
    feature_encoder: Object
        an instance of a supervised encoder.
    cv: int or sklearn cv Object
        If an int is given, StratifiedKFold is used with that number of folds.
    shuffle: boolean, optional
        Whether to shuffle each class's samples before splitting into folds. Ignored if a CV method is provided.
    random_state: int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator. Only used when shuffle is True;
        ignored if a CV method is provided.

    Example
    -------
    >>> from category_encoders import *
    >>> from sklearn.datasets import load_boston
    >>> from sklearn.model_selection import GroupKFold
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> y = (y / 10).round().astype(int)  # create 6 artificial classes
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = TargetEncoder(cols=['CHAS', 'RAD'])
    >>> enc_nested = NestedCVWrapper(enc, random_state=42)
    >>> encoded = enc_nested.fit_transform(X, y)
    >>> print(encoded.info())
    """

    def __init__(self, feature_encoder, cv=5, shuffle=False, random_state=None):
        self.feature_encoder = feature_encoder
        self.feature_encoders = {}
        self.label_encoder = None
        # Keep the constructor arguments as attributes so BaseEstimator.get_params() / clone() work
        self.shuffle = shuffle
        self.random_state = random_state
        if isinstance(cv, int):
            # Only pass random_state when shuffling; recent scikit-learn versions raise a
            # ValueError if random_state is set while shuffle=False
            self.cv = StratifiedKFold(n_splits=cv, shuffle=shuffle,
                                      random_state=random_state if shuffle else None)
        else:
            self.cv = cv

    def fit_transform(self, X, y=None, groups=None, **fit_params):
        """
        Creates unbiased (out-of-fold) encodings from a supervised encoder.

        :param X: features to be encoded
        :param y: target variable
        :param groups: groups to be passed to the cv method, e.g. for GroupKFold
        :param fit_params: additional keyword arguments passed to the encoder's fit method
        :return: a DataFrame of out-of-fold encoded features, aligned with X
        """
        X = utils.convert_input(X)
        y = utils.convert_input(y)

        result = np.zeros(X.shape)
        for trn_idx, val_idx in self.cv.split(X, y, groups):
            # Fit a fresh copy of the encoder on the training fold only, then encode the
            # held-out fold so that no row is encoded using its own target value
            feature_encoder = copy.deepcopy(self.feature_encoder)
            feature_encoder.fit(X.iloc[trn_idx], y.iloc[trn_idx], **fit_params)
            result[val_idx] = feature_encoder.transform(X.iloc[val_idx])

        return pd.DataFrame(result, columns=X.columns, index=X.index)


if __name__ == '__main__':
    from sklearn.datasets import load_iris

    bunch = load_iris()
    y = bunch.target
    X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    print(X.head())

    te = encoders.target_encoder.TargetEncoder(cols=['sepal length (cm)', 'sepal width (cm)'])
    te_nested = NestedCVWrapper(te, random_state=48)
    print(te_nested.fit_transform(X, y).head())
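
    # --- Additional illustrative example (not part of the original gist) ---
    # A minimal sketch showing how to pass a custom CV splitter instead of an int:
    # GroupKFold with synthetic, purely hypothetical group labels standing in for
    # e.g. customer or session ids.
    from sklearn.model_selection import GroupKFold

    groups = np.arange(len(X)) % 10  # 10 artificial groups over the iris rows
    te_grouped = NestedCVWrapper(
        encoders.TargetEncoder(cols=['sepal length (cm)', 'sepal width (cm)']),
        cv=GroupKFold(n_splits=5),
    )
    # groups is forwarded to GroupKFold.split, so rows that share a group id never
    # appear in both the fold used to fit the encoder and the fold being transformed
    print(te_grouped.fit_transform(X, y, groups=groups).head())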