Source code for cyclic_boosting.regression

from __future__ import absolute_import, division, print_function

import abc
import logging

import numexpr
import numpy as np
import six
import sklearn.base
import scipy.special

from cyclic_boosting.base import CyclicBoostingBase, CBLinkPredictionsFactors
from cyclic_boosting.features import Feature
from cyclic_boosting.link import LogLinkMixin

from typing import Callable, Tuple

_logger = logging.getLogger(__name__)


def _calc_factors_from_posterior(alpha_posterior: np.ndarray, beta_posterior: np.ndarray) -> np.ndarray:
    # The posterior distribution of f_j (the factor in bin j) follows a Gamma
    # distribution. We want to use its median as the estimate, because the
    # median is more stable under the log transformation. However, computing
    # the median is much more expensive for large values of alpha_posterior
    # and beta_posterior. In that regime we therefore skip the median and
    # take the mean instead (the mean converges to the median for large
    # values of alpha_posterior and beta_posterior).

    noncritical_posterior = (alpha_posterior <= 1e12) & (beta_posterior <= 1e12)
    # Median of the gamma distribution
    posterior_gamma = (
        scipy.special.gammaincinv(alpha_posterior[noncritical_posterior], 0.5) / beta_posterior[noncritical_posterior]
    )

    # Start with the mean as the estimate for all bins, then overwrite it
    # with the median wherever the median was actually computed.
    factors = alpha_posterior / beta_posterior
    factors[noncritical_posterior] = posterior_gamma
    return np.log(factors)
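

# Illustrative sketch, not part of the library API: the 1e12 cutoff above
# relies on the Gamma median converging to the mean alpha / beta for large
# shape parameters. This can be checked numerically (with the rate chosen so
# that the mean is always 1):
def _demo_gamma_median_vs_mean() -> None:
    for alpha in [2.0, 10.0, 1e3, 1e6]:
        beta = alpha  # keeps the mean alpha / beta fixed at 1
        median = scipy.special.gammaincinv(alpha, 0.5) / beta
        # the printed medians approach the mean 1.0 from below as alpha grows
        print("alpha={0}: median={1:.6f}".format(alpha, median))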


def _calc_factors_and_uncertainties(
    alpha: np.ndarray, beta: np.ndarray, link_func: Callable[[np.ndarray], np.ndarray]
) -> Tuple[np.ndarray, np.ndarray]:
    alpha_prior, beta_prior = get_gamma_priors()
    alpha_posterior = alpha + alpha_prior
    beta_posterior = beta + beta_prior

    factors = _calc_factors_from_posterior(alpha_posterior, beta_posterior)
    # factor_uncertainties:
    # The variance used here was calculated by matching the first two moments
    # of the Gamma posterior with a log-normal distribution.
    uncertainties = np.sqrt(link_func(1 + alpha_posterior) - link_func(alpha_posterior))

    return factors, uncertainties
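

# Note on the uncertainty formula above (a sketch of the derivation, not part
# of the library API): a Gamma posterior with shape alpha and rate beta has
# mean alpha / beta and variance alpha / beta**2. Matching both moments with
# a log-normal distribution yields
#     sigma**2 = log(1 + variance / mean**2) = log(1 + 1 / alpha)
#              = log(1 + alpha) - log(alpha),
# which is link_func(1 + alpha) - link_func(alpha) for the log link.
def _demo_uncertainty_moment_matching(alpha_posterior: float = 25.0) -> None:
    sigma_direct = np.sqrt(np.log1p(1.0 / alpha_posterior))
    sigma_link = np.sqrt(np.log(1 + alpha_posterior) - np.log(alpha_posterior))
    assert np.isclose(sigma_direct, sigma_link)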


def get_gamma_priors() -> Tuple[int, float]:
    """prior values for Gamma distribution with median 1"""
    alpha_prior = 2
    beta_prior = 1.67834
    return alpha_prior, beta_prior
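

# Illustrative sketch, not part of the library API: the prior values can be
# checked against the median-1 requirement, using the fact that the median of
# a Gamma(alpha, beta) distribution is gammaincinv(alpha, 0.5) / beta.
def _demo_prior_median() -> None:
    alpha_prior, beta_prior = get_gamma_priors()
    median = scipy.special.gammaincinv(alpha_prior, 0.5) / beta_prior
    print(median)  # ~1.0, the neutral element of the multiplicative model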


@six.add_metaclass(abc.ABCMeta)
class CBBaseRegressor(CyclicBoostingBase, sklearn.base.RegressorMixin, LogLinkMixin):
    r"""This is the base regressor for all Cyclic Boosting regression
    problems. It implements :class:`cyclic_boosting.link.LogLinkMixin` and
    is usable for regression problems with a target range of
    :math:`0 \leq y < \infty`.
    """

    def _check_y(self, y: np.ndarray) -> None:
        """Check that y contains no negative values and no NaNs."""
        if not (y >= 0.0).all():
            raise ValueError(
                "The target y must be non-negative and not NaN. y[~(y>=0)] = {0}".format(y[~(y >= 0)])
            )

    @abc.abstractmethod
    def calc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data):
        raise NotImplementedError("implement in subclass")

    @abc.abstractmethod
    def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors) -> None:
        return None
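

# Illustrative sketch, not part of the library API: with the log link provided
# by LogLinkMixin, adding factor contributions in link space corresponds to
# multiplying factors in target space, which is what makes this family of
# regressors a multiplicative model.
def _demo_log_link_is_multiplicative() -> None:
    factors = np.array([1.2, 0.8, 1.5])
    assert np.isclose(np.exp(np.sum(np.log(factors))), np.prod(factors))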


class CBNBinomRegressor(CBBaseRegressor):
    r"""This regressor minimizes the mean squared error. It is usable for
    regressions of target values :math:`0 \leq y < \infty`.

    This Cyclic Boosting mode assumes an underlying negative binomial (or
    Gamma-Poisson) distribution as conditional distribution of the target.
    The prior values for the Gamma distribution :math:`\alpha = 2.0` and
    :math:`\beta = 1.67834` are chosen such that its median is
    :math:`\Gamma_{\text{median}}(\alpha, \beta) = 1`, which is the neutral
    element of multiplication (Cyclic Boosting in this mode is a
    multiplicative model). The estimate for each factor is the median of the
    Gamma distribution with the measured values of :math:`\alpha` and
    :math:`\beta`. To determine the uncertainties of the factors, the
    variance is estimated from a log-normal distribution that is
    approximated using the first two moments of the Gamma distribution.

    In the default case of parameter settings :math:`a = 1.0` and
    :math:`c = 0.0`, this regressor corresponds to the special case of a
    Poisson regressor, as implemented in :class:`~.CBPoissonRegressor`.
    """

    def __init__(
        self,
        feature_groups=None,
        hierarchical_feature_groups=None,
        feature_properties=None,
        weight_column=None,
        prior_prediction_column=None,
        minimal_loss_change=1e-3,
        minimal_factor_change=1e-3,
        maximal_iterations=10,
        observers=None,
        smoother_choice=None,
        output_column=None,
        learn_rate=None,
        a=1.0,
        c=0.0,
        aggregate=True,
    ):
        CyclicBoostingBase.__init__(
            self,
            feature_groups=feature_groups,
            hierarchical_feature_groups=hierarchical_feature_groups,
            feature_properties=feature_properties,
            weight_column=weight_column,
            prior_prediction_column=prior_prediction_column,
            minimal_loss_change=minimal_loss_change,
            minimal_factor_change=minimal_factor_change,
            maximal_iterations=maximal_iterations,
            observers=observers,
            smoother_choice=smoother_choice,
            output_column=output_column,
            learn_rate=learn_rate,
            aggregate=aggregate,
        )

        self.a = a  # TODO: a and c as variable names are too vague
        self.c = c

    def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors):
        pass

    def calc_parameters(
        self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data
    ) -> Tuple[np.ndarray, np.ndarray]:
        a = self.a  # noqa: F841
        c = self.c  # noqa: F841
        weights = self.weights  # noqa: F841
        prediction_link = pred.predict_link()
        prediction = self.unlink_func(prediction_link)  # noqa: F841

        alpha_w = numexpr.evaluate("weights * y / (a + c * prediction)")
        beta_w = numexpr.evaluate("weights * prediction / (a + c * prediction)")

        lex_binnumbers = feature.lex_binned_data
        minlength = feature.n_bins
        alpha, beta = (np.bincount(lex_binnumbers, weights=w, minlength=minlength) for w in [alpha_w, beta_w])
        link_func = self.link_func
        return _calc_factors_and_uncertainties(alpha, beta, link_func)
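

# Illustrative sketch, not part of the library API: with the defaults a=1.0
# and c=0.0, the per-observation terms above reduce to alpha_w = weights * y
# and beta_w = weights * prediction, i.e. the Poisson case. The per-bin
# aggregation is then just a weighted bincount (bin assignments hypothetical):
def _demo_bin_aggregation() -> None:
    y = np.array([3.0, 1.0, 4.0, 0.0])
    prediction = np.array([2.0, 2.0, 3.0, 1.0])
    bins = np.array([0, 0, 1, 1])  # two bins
    alpha = np.bincount(bins, weights=y, minlength=2)  # [4.0, 4.0]
    beta = np.bincount(bins, weights=prediction, minlength=2)  # [4.0, 4.0]
    print(alpha / beta)  # raw (prior-free) per-bin factor estimates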


class CBPoissonRegressor(CBBaseRegressor):
    r"""This regressor minimizes the mean squared error. It is usable for
    regressions of target values :math:`0 \leq y < \infty`.

    As Poisson regressor, it is a special case of the more general negative
    binomial regressor :class:`~.CBNBinomRegressor`, assuming *purely*
    Poisson-distributed target values.
    """

    def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors):
        return np.bincount(feature.lex_binned_data, weights=y * self.weights, minlength=feature.n_bins)

    def calc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data):
        prediction = self.unlink_func(pred.predict_link())
        prediction_sum_of_bins = np.bincount(
            feature.lex_binned_data,
            weights=self.weights * prediction,
            minlength=feature.n_bins,
        )
        return _calc_factors_and_uncertainties(alpha=prefit_data, beta=prediction_sum_of_bins, link_func=self.link_func)
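

# Illustrative sketch, not part of the library API: in the Poisson case each
# bin's factor is derived from alpha = sum of y and beta = sum of predictions
# in that bin, moderated by the Gamma prior. For well-populated bins the
# factor approaches log(sum(y) / sum(prediction)) in link space, and sparse
# bins get larger uncertainties (the per-bin sums below are made up):
def _demo_poisson_factors() -> None:
    alpha = np.array([200.0, 10.0])  # per-bin sums of y
    beta = np.array([100.0, 20.0])  # per-bin sums of predictions
    factors, uncertainties = _calc_factors_and_uncertainties(alpha, beta, np.log)
    print(np.exp(factors))  # ~[2.0, 0.54], pulled slightly towards 1.0
    print(uncertainties)  # larger uncertainty for the sparsely populated bin

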
__all__ = ["get_gamma_priors", "CBPoissonRegressor", "CBNBinomRegressor"]