# Author: Markus Kliegl
# License: MIT
"""Regularized linear regression with custom training and regularization costs.
:class:`FlexibleLinearRegression` is a scikit-learn-compatible linear
regression estimator that allows specification of arbitrary
training and regularization cost functions.
For a linear model:
.. math::
\\textrm{predictions} = X \\cdot W
this model attempts to find :math:`W` by minimizing:
.. math::
\\min_{W} \\left\\{
\\textrm{cost}(X \cdot W - y) + C \\cdot \\textrm{reg_cost}(W)
\\right\\}
for given training data :math:`X, y`. Here :math:`C` is the
regularization strength and :math:`\\textrm{cost}` and
:math:`\\textrm{reg_cost}` are customizable cost functions
(e.g., the squared :math:`\\ell^2` norm or the :math:`\\ell^1` norm).
*Note:* In reality, we fit an intercept (bias coefficient) as well.
Think of :math:`X` in the above as having an extra column of 1's.
Ideally, the cost functions should be convex and continuously
differentiable.
We provide some cost functions: see :func:`l1_cost_func`,
:func:`l2_cost_func`, :func:`japanese_cost_func` (or the
:data:`cost_func_dict` dictionary). If you want to use a
custom cost function, it should be of the form::
def custom_cost_func(z, **opts):
# <code to compute cost and gradient>
return cost, gradient
where `cost` is a float, `gradient` is an array of the same
dimensions as `z`, and you may specify any number of keyword
arguments.
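
For instance, a smooth log-cosh training cost (a hypothetical
illustration, not provided by this module) could look like::

    def log_cosh_cost_func(z):
        n = len(z)
        # log(cosh(z)) behaves like z**2 / 2 for small z and like
        # |z| - log(2) for large z; its derivative is tanh(z)
        return np.sum(np.log(np.cosh(z))) / n, np.tanh(z) / n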
"""
from __future__ import division
import numpy as np
import scipy.optimize
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted


def l2_cost_func(z):
    """Normalized squared :math:`\\ell^2` cost and gradient

    .. math::

        \\mathrm{cost}(z) = \\frac{1}{2n} ||z||_{\\ell^2}^2
        = \\frac{1}{2n} \\sum_{i=1}^n |z_i|^2 \\,.

    Args:
        z (ndarray): Input vector.

    Returns:
        Tuple[float, ndarray]: The cost and gradient (same shape as `z`).
    """
    N = len(z)
    return 0.5 * np.dot(z, z) / N, z / N


def l1_cost_func(z):
    """Normalized :math:`\\ell^1` cost and gradient

    .. math::

        \\mathrm{cost}(z) = \\frac{1}{n} ||z||_{\\ell^1}
        = \\frac{1}{n} \\sum_{i=1}^n |z_i| \\,.

    .. note::

        This cost is not differentiable. For a smooth
        alternative, see :func:`japanese_cost_func`.

    Args:
        z (ndarray): Input vector.

    Returns:
        Tuple[float, ndarray]: The cost and gradient (same shape as `z`).
    """
    N = len(z)
    return np.sum(np.abs(z)) / N, np.sign(z) / N


def japanese_cost_func(z, eta=0.1):
    """'Japanese bracket' cost and gradient

    Computes cost and gradient for the cost function:

    .. math::

        \\mathrm{cost}(z) = \\frac{\\eta^2}{n} \\sum_{i=1}^n \\left(
            \\sqrt{ 1 + \\left( \\frac{z_i}{\\eta} \\right)^2 } - 1
        \\right) \\,.

    This cost function interpolates componentwise between the
    squared :math:`\\ell^2` norm (for :math:`|z_i| \\ll \\eta`) and
    the :math:`\\ell^1` norm, up to a factor of :math:`\\eta`
    (for :math:`|z_i| \\gg \\eta`). It is thus useful for reducing
    the impact of outliers (or when dealing with heavy-tailed
    rather than Gaussian noise).
    Unlike the :math:`\\ell^1` norm, this cost function is smooth.

    The key to understanding this is that the *Japanese bracket*

    .. math::

        \\langle z \\rangle := \\sqrt{ 1 + |z|^2 }

    satisfies these asymptotics:

    .. math::

        \\sqrt{ 1 + |z|^2 } - 1 \\approx \\begin{cases}
            \\frac12 |z|^2 & \\text{for $|z| \\ll 1$}
            \\\\ |z| & \\text{for $|z| \\gg 1$}
        \\end{cases} \\,.
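
    In particular (a derivation included for reference; it matches the
    implementation below), the gradient is

    .. math::

        \\nabla \\mathrm{cost}(z)_i = \\frac{1}{n} \\frac{z_i}{
            \\sqrt{ 1 + ( z_i / \\eta )^2 }} \\,.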

    Args:
        z (ndarray): Input vector.
        eta (Optional[float]): Positive scale parameter.

    Returns:
        Tuple[float, ndarray]: The cost and gradient (same shape as `z`).
    """
    N = len(z)
    z_norm = z / eta
    z_jap = np.sqrt(1.0 + z_norm * z_norm)  # componentwise Japanese bracket
    cost = eta**2 * np.sum(z_jap - 1.0)
    gradient = z / z_jap
    return cost / N, gradient / N


cost_func_dict = {
    'l2': l2_cost_func,
    'l1': l1_cost_func,
    'japanese': japanese_cost_func,
}
"""Dictionary of implemented cost functions."""


class FitError(Exception):
    """Exception raised when fitting fails.

    Attributes:
        message (str): Error message.
        res (scipy.optimize.OptimizeResult): Results returned by
            `scipy.optimize.minimize`. See SciPy documentation on
            `OptimizeResult`_ for details.

    .. _OptimizeResult: http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html  # NOQA
    """

    def __init__(self, message, res):
        # call Exception.__init__ so that str(exc) carries the message
        super(FitError, self).__init__(message)
        self.message = message
        self.res = res


class FlexibleLinearRegression(BaseEstimator):
    """Regularized linear regression with custom training/regularization costs.

    Args:
        C (Optional[float]): Nonnegative regularization
            coefficient. (Zero means no regularization.)
        cost_func (Optional[callable or str]): Training cost
            function. If not callable, should be one of 'l1', 'l2',
            or 'japanese'.
        cost_opts (Optional[dict]): Parameters to pass to
            `cost_func`.
        reg_cost_func (Optional[callable or str]): Regularization
            cost function. If not callable, should be one of 'l1',
            'l2', or 'japanese'.
        reg_cost_opts (Optional[dict]): Parameters to pass to
            `reg_cost_func`.

    Attributes:
        coef_ (ndarray): Weight matrix of shape (n_features+1,).
            (coef_[0] is the intercept coefficient.)
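
    Example (an illustrative sketch, assuming the default L-BFGS-B
    solver converges on this toy data)::

        >>> import numpy as np
        >>> rng = np.random.RandomState(0)
        >>> X = rng.randn(100, 3)
        >>> y = np.dot(X, [1.0, 2.0, 3.0]) + 0.5
        >>> model = FlexibleLinearRegression(
        ...     C=1e-4, cost_func='japanese', cost_opts={'eta': 0.5})
        >>> model.fit(X, y).predict(X[:2]).shape
        (2,)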
"""

    def __init__(
            self, C=1.0, cost_func='l2', cost_opts=None,
            reg_cost_func='l2', reg_cost_opts=None):
        self.C = C
        self.cost_func = cost_func
        self.cost_opts = cost_opts
        self.reg_cost_func = reg_cost_func
        self.reg_cost_opts = reg_cost_opts

    def _check_cost_func(self, cost_func, cost_opts):
        # resolve string names via cost_func_dict and default the
        # options to an empty dict
        if not callable(cost_func):
            try:
                cost_func = cost_func_dict[cost_func]
            except KeyError:
                raise ValueError(
                    "Unknown cost function: '{}'".format(cost_func))
        if cost_opts is None:
            cost_opts = {}
        return cost_func, cost_opts

    def fit(self, X, y):
        """Fit the model.

        Args:
            X (ndarray): Training data of shape ``(n_samples, n_features)``.
            y (ndarray): Target values of shape ``(n_samples,)``.

        Returns:
            self

        Raises:
            FitError: If the fitting failed.
        """
        X, y = check_X_y(X, y, y_numeric=True)
        C = self.C
        cost_func, cost_opts = self._check_cost_func(
            self.cost_func, self.cost_opts)
        reg_cost_func, reg_cost_opts = self._check_cost_func(
            self.reg_cost_func, self.reg_cost_opts)

        # add a column of ones to X (for intercept coefficient)
        X = np.hstack((np.ones((X.shape[0], 1), dtype=float), X))

        def objective(W):
            # compute training cost/grad
            cost, outer_grad = cost_func(np.dot(X, W) - y, **cost_opts)
            # chain rule: gradient w.r.t. W is X^T times the outer gradient
            grad = np.dot(outer_grad, X)
            # add regularization cost/grad (but don't regularize intercept)
            reg_cost, reg_grad = reg_cost_func(W[1:], **reg_cost_opts)
            cost += C * reg_cost
            grad[1:] += C * reg_grad
            return cost, grad

        initial_coef_ = np.zeros(X.shape[1])
        res = scipy.optimize.minimize(
            objective, initial_coef_, jac=True, method='L-BFGS-B')
        if res.success:
            self.coef_ = res.x
        else:
            raise FitError("Fit failed: {}".format(res.message), res=res)
        return self

    def predict(self, X):
        """Predict using the model.

        Args:
            X (ndarray): Data of shape ``(n_samples, n_features)``.

        Returns:
            y (ndarray): Predicted values of shape ``(n_samples,)``.
        """
        check_is_fitted(self, 'coef_')
        X = check_array(X)
        n_features = self.coef_.shape[0] - 1
        if X.shape[1] != n_features:
            raise ValueError(
                "X should have %d features, not %d" % (n_features, X.shape[1]))
        y = np.dot(X, self.coef_[1:]) + self.coef_[0]
        return y


def test_estimator():
    from sklearn.utils.estimator_checks import check_estimator
    check_estimator(FlexibleLinearRegression)


def test_gradients():
    z0 = np.random.randn(5)
    for cost_func in cost_func_dict.values():
        func = lambda z: cost_func(z)[0]
        grad = lambda z: cost_func(z)[1]
        assert scipy.optimize.check_grad(func, grad, z0) < 1e-5
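

# An extra illustrative check (not part of the original test suite):
# exercise the interpolation behavior documented in japanese_cost_func.
def test_japanese_interpolation():
    eta = 0.1
    z_small = np.array([1e-4, -2e-4])
    z_large = np.array([5.0, -7.0])
    # for |z_i| << eta, the cost matches the squared l2 cost
    assert np.isclose(japanese_cost_func(z_small, eta=eta)[0],
                      l2_cost_func(z_small)[0], rtol=1e-3)
    # for |z_i| >> eta, the cost approaches eta times the l1 cost,
    # minus the constant offset eta**2
    assert np.isclose(japanese_cost_func(z_large, eta=eta)[0],
                      eta * l1_cost_func(z_large)[0] - eta**2, rtol=1e-3)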


if __name__ == '__main__':
    import nose2
    nose2.main()