ceroper/ranking.py

## ranking.py
"""
Implementation of pairwise ranking using scikit-learn LinearSVC

Reference: "Large Margin Rank Boundaries for Ordinal Regression", R. Herbrich,
    T. Graepel, K. Obermayer.

Authors: Fabian Pedregosa <fabian@fseoane.net>
         Alexandre Gramfort <alexandre.gramfort@inria.fr>
         Caroline Roper <https://www.linkedin.com/in/caroline-roper-2b623628/>
"""

import itertools
import numpy as np

from sklearn import svm, linear_model
from sklearn.model_selection import KFold


def transform_pairwise(X, y):
    """Transforms data into pairs with balanced labels for ranking

    Transforms a n-class ranking problem into a two-class classification
    problem. Subclasses implementing particular strategies for choosing
    pairs should override this method.

    In this method, all pairs are choosen, except for those that have the
    same target value. The output is an array of balanced classes, i.e.
    there are the same number of -1 as +1

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The data
    y : array, shape (n_samples,) or (n_samples, 2)
        Target labels. If it's a 2D array, the second column represents
        the grouping of samples, i.e., samples with different groups will
        not be considered.

    Returns
    -------
    X_trans : array, shape (k, n_feaures)
        Data as pairs
    y_trans : array, shape (k,)
        Output class labels, where classes have values {-1, +1}
    """
    X_new = []
    y_new = []
    y = np.asarray(y)
    if y.ndim == 1:
        y = np.c_[y, np.ones(y.shape[0])]
    comb = itertools.combinations(range(X.shape[0]), 2)
    for k, (i, j) in enumerate(comb):
        if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
            # skip if same target or different group
            continue
        X_new.append(X[i] - X[j])
        y_new.append(np.sign(y[i, 0] - y[j, 0]))
        # output balanced classes
        if y_new[-1] != (-1) ** k:
            y_new[-1] = - y_new[-1]
            X_new[-1] = - X_new[-1]
    return np.asarray(X_new), np.asarray(y_new).ravel()


class RankSVM(svm.LinearSVC):
    """Performs pairwise ranking with an underlying LinearSVC model

    Input should be a n-class ranking problem, this object will convert it
    into a two-class classification problem, a setting known as
    `pairwise ranking`.

    See object :ref:`svm.LinearSVC` for a full description of parameters.
    """

    def fit(self, X, y):
        """
        Fit a pairwise ranking model.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)
        y : array, shape (n_samples,) or (n_samples, 2)

        Returns
        -------
        self
        """
        X_trans, y_trans = transform_pairwise(X, y)
        super(RankSVM, self).fit(X_trans, y_trans)
        return self

    def predict_group(self, X):
        """
        Predict an ordering on X. For a list of n samples, this method
        returns a list from 0 to n-1 with the relative order of the rows of X.

        Parameters
        ----------
        X : array, shape (n_samples, n_features)

        Returns
        -------
        ord : array, shape (n_samples,)
            Returns a list of integers representing the relative order of
            the rows in X.
        """
        if hasattr(self, 'coef_'):
            return np.argsort(np.dot(X, self.coef_.T).ravel())
        else:
            raise ValueError("Must call fit() prior to predict()")

    def predict(self, X):
        """
        Parameters
        ----------
        X: 2D numpy array, shape (n_samples, n_features + 1), last column is group id, rest are features

        Returns
        -------
        y_pred: numpy array, length n_samples, the model predictions
        """
        if not hasattr(self, 'coef_'):
            raise ValueError("Must call fit() prior to predict()")

        # Split features and group ids
        group_ids = X[:, -1]
        X_data = X[:, :-1]

        # Map each group to the indices belonging to it
        from collections import defaultdict
        group_to_indices = defaultdict(list)
        for idx, gid in enumerate(group_ids):
            group_to_indices[gid].append(idx)

        y_pred = np.empty(len(X_data), dtype=object)  # Use object dtype to allow storing ints or other types

        for gid, indices in group_to_indices.items():
            # Extract features for this group
            x_group = X_data[indices, :]
            # Fill missing values if needed (e.g., with 0)
            x_group = np.nan_to_num(x_group)  # replaces NaN with 0
            x_group = x_group.astype(np.int32)

            # Predict for the group
            preds_group = self.predict_group(x_group)

            # Assign predictions to correct positions
            y_pred[indices] = preds_group


        return y_pred

    def score_group(self, X, y):
        """
        Because we transformed into a pairwise problem, chance level is at 0.5
        """
        X_trans, y_trans = transform_pairwise(X, y)
        return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)


if __name__ == '__main__':
    # as showcase, we will create some non-linear data
    # and print the performance of ranking vs linear regression

    np.random.seed(1)
    n_samples, n_features = 300, 5
    true_coef = np.random.randn(n_features)
    X = np.random.randn(n_samples, n_features)
    noise = np.random.randn(n_samples) / np.linalg.norm(true_coef)
    y = np.dot(X, true_coef)
    y = np.arctan(y) # add non-linearities
    y += .1 * noise  # add noise
    Y = np.c_[y, np.mod(np.arange(n_samples), 5)]  # add query fake id
    cv = KFold(5)
    i, (train, test) = next(enumerate(cv.split(X)))

    # make a simple plot out of it
    import matplotlib.pyplot as pl
    pl.scatter(np.dot(X, true_coef), y)
    pl.title('Data to be learned')
    pl.xlabel('<X, coef>')
    pl.ylabel('y')
    pl.show()

    # print the performance of ranking
    rank_svm = RankSVM().fit(X[train], Y[train])
    print ('Performance of ranking ', rank_svm.score_group(X[test], Y[test]))

    # and that of linear regression
    ridge = linear_model.RidgeCV(fit_intercept=True)
    ridge.fit(X[train], y[train])
    X_test_trans, y_test_trans = transform_pairwise(X[test], y[test])
    score = np.mean(np.sign(np.dot(X_test_trans, ridge.coef_)) == y_test_trans)
    print ('Performance of linear regression ', score)
	"""
	Implementation of pairwise ranking using scikit-learn LinearSVC

	Reference: "Large Margin Rank Boundaries for Ordinal Regression", R. Herbrich,
	T. Graepel, K. Obermayer.

	Authors: Fabian Pedregosa <fabian@fseoane.net>
	Alexandre Gramfort <alexandre.gramfort@inria.fr>
	Caroline Roper <https://www.linkedin.com/in/caroline-roper-2b623628/>
	"""

	import itertools
	import numpy as np

	from sklearn import svm, linear_model
	from sklearn.model_selection import KFold


	def transform_pairwise(X, y):
	"""Transforms data into pairs with balanced labels for ranking

	Transforms a n-class ranking problem into a two-class classification
	problem. Subclasses implementing particular strategies for choosing
	pairs should override this method.

	In this method, all pairs are choosen, except for those that have the
	same target value. The output is an array of balanced classes, i.e.
	there are the same number of -1 as +1

	Parameters
	----------
	X : array, shape (n_samples, n_features)
	The data
	y : array, shape (n_samples,) or (n_samples, 2)
	Target labels. If it's a 2D array, the second column represents
	the grouping of samples, i.e., samples with different groups will
	not be considered.

	Returns
	-------
	X_trans : array, shape (k, n_feaures)
	Data as pairs
	y_trans : array, shape (k,)
	Output class labels, where classes have values {-1, +1}
	"""
	X_new = []
	y_new = []
	y = np.asarray(y)
	if y.ndim == 1:
	y = np.c_[y, np.ones(y.shape[0])]
	comb = itertools.combinations(range(X.shape[0]), 2)
	for k, (i, j) in enumerate(comb):
	if y[i, 0] == y[j, 0] or y[i, 1] != y[j, 1]:
	# skip if same target or different group
	continue
	X_new.append(X[i] - X[j])
	y_new.append(np.sign(y[i, 0] - y[j, 0]))
	# output balanced classes
	if y_new[-1] != (-1) ** k:
	y_new[-1] = - y_new[-1]
	X_new[-1] = - X_new[-1]
	return np.asarray(X_new), np.asarray(y_new).ravel()


	class RankSVM(svm.LinearSVC):
	"""Performs pairwise ranking with an underlying LinearSVC model

	Input should be a n-class ranking problem, this object will convert it
	into a two-class classification problem, a setting known as
	`pairwise ranking`.

	See object :ref:`svm.LinearSVC` for a full description of parameters.
	"""

	def fit(self, X, y):
	"""
	Fit a pairwise ranking model.

	Parameters
	----------
	X : array, shape (n_samples, n_features)
	y : array, shape (n_samples,) or (n_samples, 2)

	Returns
	-------
	self
	"""
	X_trans, y_trans = transform_pairwise(X, y)
	super(RankSVM, self).fit(X_trans, y_trans)
	return self

	def predict_group(self, X):
	"""
	Predict an ordering on X. For a list of n samples, this method
	returns a list from 0 to n-1 with the relative order of the rows of X.

	Parameters
	----------
	X : array, shape (n_samples, n_features)

	Returns
	-------
	ord : array, shape (n_samples,)
	Returns a list of integers representing the relative order of
	the rows in X.
	"""
	if hasattr(self, 'coef_'):
	return np.argsort(np.dot(X, self.coef_.T).ravel())
	else:
	raise ValueError("Must call fit() prior to predict()")

	def predict(self, X):
	"""
	Parameters
	----------
	X: 2D numpy array, shape (n_samples, n_features + 1), last column is group id, rest are features

	Returns
	-------
	y_pred: numpy array, length n_samples, the model predictions
	"""
	if not hasattr(self, 'coef_'):
	raise ValueError("Must call fit() prior to predict()")

	# Split features and group ids
	group_ids = X[:, -1]
	X_data = X[:, :-1]

	# Map each group to the indices belonging to it
	from collections import defaultdict
	group_to_indices = defaultdict(list)
	for idx, gid in enumerate(group_ids):
	group_to_indices[gid].append(idx)

	y_pred = np.empty(len(X_data), dtype=object) # Use object dtype to allow storing ints or other types

	for gid, indices in group_to_indices.items():
	# Extract features for this group
	x_group = X_data[indices, :]
	# Fill missing values if needed (e.g., with 0)
	x_group = np.nan_to_num(x_group) # replaces NaN with 0
	x_group = x_group.astype(np.int32)

	# Predict for the group
	preds_group = self.predict_group(x_group)

	# Assign predictions to correct positions
	y_pred[indices] = preds_group


	return y_pred

	def score_group(self, X, y):
	"""
	Because we transformed into a pairwise problem, chance level is at 0.5
	"""
	X_trans, y_trans = transform_pairwise(X, y)
	return np.mean(super(RankSVM, self).predict(X_trans) == y_trans)


	if __name__ == '__main__':
	# as showcase, we will create some non-linear data
	# and print the performance of ranking vs linear regression

	np.random.seed(1)
	n_samples, n_features = 300, 5
	true_coef = np.random.randn(n_features)
	X = np.random.randn(n_samples, n_features)
	noise = np.random.randn(n_samples) / np.linalg.norm(true_coef)
	y = np.dot(X, true_coef)
	y = np.arctan(y) # add non-linearities
	y += .1 * noise # add noise
	Y = np.c_[y, np.mod(np.arange(n_samples), 5)] # add query fake id
	cv = KFold(5)
	i, (train, test) = next(enumerate(cv.split(X)))

	# make a simple plot out of it
	import matplotlib.pyplot as pl
	pl.scatter(np.dot(X, true_coef), y)
	pl.title('Data to be learned')
	pl.xlabel('<X, coef>')
	pl.ylabel('y')
	pl.show()

	# print the performance of ranking
	rank_svm = RankSVM().fit(X[train], Y[train])
	print ('Performance of ranking ', rank_svm.score_group(X[test], Y[test]))

	# and that of linear regression
	ridge = linear_model.RidgeCV(fit_intercept=True)
	ridge.fit(X[train], y[train])
	X_test_trans, y_test_trans = transform_pairwise(X[test], y[test])
	score = np.mean(np.sign(np.dot(X_test_trans, ridge.coef_)) == y_test_trans)
	print ('Performance of linear regression ', score)
No results found