# Gist by @Anjum48, created February 5, 2020.
# A first pass of a Nested CV wrapper for category encoders.
from category_encoders import utils
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedKFold
import category_encoders as encoders
import pandas as pd
import numpy as np
import copy
class NestedCVWrapper(BaseEstimator, TransformerMixin):
    """
    Extend supervised encoders to perform nested cross validation to prevent target leakage.

    For a validation or a test set, supervised encoders can be used as follows::

        encoder.fit(X_train, y_train)
        X_valid_encoded = encoder.transform(X_valid)

    However, encoding the train data in the method above will introduce bias into the data.
    Using out-of-fold encodings is an effective way to prevent target leakage. This is
    equivalent to::

        X_train_encoded = np.zeros(X.shape)
        for trn, val in kfold.split(X, y):
            encoder.fit(X[trn], y[trn])
            X_train_encoded[val] = encoder.transform(X[val])

    This can be used in place of the "inner folds" as discussed here:
    https://sebastianraschka.com/faq/docs/evaluate-a-model.html

    See README.md for a list of supervised encoders.

    Parameters
    ----------
    feature_encoder: Object
        an instance of a supervised encoder.
    cv: int or sklearn cv Object
        If an int is given, StratifiedKFold is used by default, where the int is the
        number of folds.
    shuffle: boolean, optional
        Whether to shuffle each class's samples before splitting into batches. Ignored
        if a CV method is provided.
    random_state: int, RandomState instance or None, optional, default=None
        If int, random_state is the seed used by the random number generator. Ignored
        if a CV method is provided or if shuffle is False.

    Example
    -------
    >>> from category_encoders import *
    >>> from sklearn.datasets import load_boston
    >>> from sklearn.model_selection import GroupKFold
    >>> bunch = load_boston()
    >>> y = bunch.target
    >>> y = (y/10).round().astype(int)  # we create 6 artificial classes
    >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    >>> enc = TargetEncoder(cols=['CHAS', 'RAD'])
    >>> enc_nested = NestedCVWrapper(enc, random_state=42)
    >>> encoded = enc_nested.fit_transform(X, y)
    >>> print(encoded.info())
    """

    def __init__(self, feature_encoder, cv=5, shuffle=False, random_state=None):
        self.feature_encoder = feature_encoder
        self.feature_encoders = {}
        self.label_encoder = None
        if isinstance(cv, int):
            # random_state is only meaningful when shuffle=True; recent sklearn
            # versions raise if it is set while shuffle=False, so suppress it there.
            self.cv = StratifiedKFold(
                n_splits=cv,
                shuffle=shuffle,
                random_state=random_state if shuffle else None,
            )
        else:
            self.cv = cv

    def fit_transform(self, X, y=None, groups=None, **fit_params):
        """
        Creates unbiased (out-of-fold) encodings from a supervised encoder.

        :param X: Features to be encoded
        :param y: Target variable
        :param groups: Groups to be passed to the cv method, e.g. for GroupKFold
        :param fit_params: Unused; accepted for sklearn API compatibility
        :return: pd.DataFrame of out-of-fold encodings, row-aligned with X
        """
        X = utils.convert_input(X)
        y = utils.convert_input(y)
        encoded_parts = []
        for trn_idx, val_idx in self.cv.split(X, y, groups):
            # Deep-copy the encoder each fold so no fold's fit state leaks into
            # another fold's transform.
            fold_encoder = copy.deepcopy(self.feature_encoder)
            fold_encoder.fit(X.iloc[trn_idx], y.iloc[trn_idx])
            encoded_parts.append(fold_encoder.transform(X.iloc[val_idx]))
        # Concatenate the per-fold frames and restore the original row order.
        # This preserves column dtypes, unlike the previous np.zeros buffer,
        # which forced everything to float and would fail on non-numeric
        # passthrough columns.
        return pd.concat(encoded_parts).reindex(X.index)
if __name__ == '__main__':
    from sklearn.datasets import load_iris

    # Quick smoke test: out-of-fold target-encode two iris feature columns.
    bunch = load_iris()
    target = bunch.target
    features = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    print(features.head())

    base_encoder = encoders.target_encoder.TargetEncoder(
        cols=['sepal length (cm)', 'sepal width (cm)'])
    nested_encoder = NestedCVWrapper(base_encoder, random_state=48)
    print(nested_encoder.fit_transform(features, target).head())