# Source code for cyclic_boosting.classification

"""
Cyclic Boosting Classifier
"""

from __future__ import absolute_import, division, print_function

import logging

import numpy as np
import pandas as pd
import scipy.stats
import sklearn.base

from cyclic_boosting import base as cyclic_boosting_base
from cyclic_boosting.base import CyclicBoostingBase
from cyclic_boosting.link import LogitLinkMixin
from typing import Tuple, Optional, Union
from cyclic_boosting.features import Feature

# Module-level logger, named after this module via ``__name__``.
_logger = logging.getLogger(__name__)



def get_beta_priors() -> Tuple[float, float]:
    r"""Return the parameters of the Beta prior distribution.

    The prior is Beta(1.001, 1.001): almost a uniform distribution, but with a
    probability density function that goes to 0 for :math:`x=0` and
    :math:`x=1`.

    Returns
    -------
    Tuple[float, float]
        The pair :math:`(\alpha, \beta) = (1.001, 1.001)`.
    """
    return 1.001, 1.001




def boost_weights(y: np.ndarray, prediction: np.ndarray) -> np.ndarray:
    r"""Compute the boosting weights used in the CBClassifier bincount sums.

    A sample that is already well estimated receives a low weight, a badly
    estimated sample receives a high weight:

    .. math::

       w_i = \begin{cases} (1 - \left\langle y
                      \right\rangle_i) & y_{\text{truth}} = 1 \\
            \left\langle y \right\rangle_i &
            \text{otherwise} \end{cases}

    Parameters
    ----------
    y
        Truth values; entries that evaluate as true select the
        ``1 - prediction`` branch.
    prediction
        Predicted values :math:`\left\langle y \right\rangle_i`.

    Returns
    -------
    np.ndarray
        One weight per sample.
    """
    tiny = 1e-12
    # Only predictions exactly equal to 0 or 1 are moved inward by ``tiny``;
    # every other value is passed through unchanged.
    nudged = np.where(
        prediction == 0.0,
        tiny,
        np.where(prediction == 1.0, 1 - tiny, prediction),
    )
    return np.where(y, 1 - nudged, nudged)




class CBClassifier(sklearn.base.ClassifierMixin, CyclicBoostingBase, LogitLinkMixin):
    """Cyclic Boosting core algorithm for binary classification.

    Its interface, methods and arguments are described in
    :class:`~CyclicBoostingBase`.
    """

    def _check_y(self, y: np.ndarray) -> None:
        """Check that y has only values 0. or 1.

        Raises
        ------
        ValueError
            If any entry of ``y`` is neither 0 nor 1. NaN compares unequal to
            both values and is therefore rejected as well.
        """
        if not ((y == 0.0) | (y == 1.0)).all():
            raise ValueError(
                "The target y must be either 0 or 1 "
                "and not NAN. y[(y != 0) & (y != 1)] = {0}".format(y[(y != 0) & (y != 1)])
            )

    def precalc_parameters(self, feature: Feature, y: np.ndarray, pred):
        """Precalculation hook; this classifier computes nothing here.

        Returns
        -------
        None
            Always.
        """
        return None

    def _get_posterior_dist_from_prior_dist(
        self, alpha: np.ndarray, beta: np.ndarray
    ) -> Tuple[scipy.stats.beta, np.ndarray, np.ndarray]:
        """Combine the observed counts with the Beta prior.

        Beta(1,1) is the uniform distribution, Beta(1.001, 1.001) has pdf
        zero at 0 and 1. It is thus chosen as the prior.

        Parameters
        ----------
        alpha
            Values added to the prior's alpha parameter.
        beta
            Values added to the prior's beta parameter.

        Returns
        -------
        tuple
            The frozen ``scipy.stats.beta`` posterior followed by its alpha
            and beta parameters.
        """
        alpha_prior, beta_prior = get_beta_priors()
        alpha_posterior = alpha + alpha_prior
        beta_posterior = beta + beta_prior
        posterior = scipy.stats.beta(alpha_posterior, beta_posterior)
        return posterior, alpha_posterior, beta_posterior

    def _get_percentiles_from_distribution_parameters(
        self, alpha_posterior: np.ndarray, beta_posterior: np.ndarray
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Choose perc1 and perc2 for gaussian_matching_by_quantiles such that
        for an asymmetric beta distribution, the quantiles are rather far
        from the unsafe boundaries 0 and 1.

        The shift is 0.4 times the distance of the posterior mean
        ``alpha / (alpha + beta)`` from 0.5, so both percentiles stay within
        ``[0.05, 0.95]`` for positive parameters.

        TODO: Why are we choosing the 0.75 and 0.25 percentiles? What is the formula for the shift?
        """
        shift = 0.4 * (alpha_posterior / (alpha_posterior + beta_posterior) - 0.5)
        perc1 = 0.75 - shift
        perc2 = 0.25 - shift
        return perc1, perc2

    def calc_parameters(self, feature: Feature, y: np.ndarray, pred, prefit_data: np.ndarray) -> Tuple[float, float]:
        """Compute the link-space factors and their uncertainties for one feature.

        Parameters
        ----------
        feature
            Feature whose ``lex_binned_data`` gives the bin index of every
            sample and whose ``n_bins`` gives the number of bins.
        y
            Target values (0 or 1).
        pred
            Current prediction state; only ``pred.predict_link()`` is used.
        prefit_data
            Not used by this implementation.

        Returns
        -------
        tuple
            ``(factors_link, uncertainties_l)`` exactly as returned by
            ``gaussian_matching_by_quantiles``.
            NOTE(review): these are presumably per-bin arrays rather than
            scalars as the annotation suggests -- confirm against the helper.
        """
        # presumably ``unlink_func`` maps link-space values back to
        # probabilities -- it is defined outside this class.
        prediction = self.unlink_func(pred.predict_link())
        boosting_weights = boost_weights(y, prediction)
        weights = self.weights * boosting_weights

        # Weighted per-bin sums for the positive (alpha) and the negative
        # (beta) class.
        alpha, beta = (
            np.bincount(feature.lex_binned_data, weights=w, minlength=feature.n_bins)
            for w in (weights * y, weights * (1 - y))
        )

        # Negative sums (possible with negative sample weights) are set to 0.
        alpha = np.where(alpha < 0, 0, alpha)
        beta = np.where(beta < 0, 0, beta)

        posterior, alpha_posterior, beta_posterior = self._get_posterior_dist_from_prior_dist(alpha=alpha, beta=beta)

        perc1, perc2 = self._get_percentiles_from_distribution_parameters(
            alpha_posterior=alpha_posterior, beta_posterior=beta_posterior
        )

        # actual Gaussian matching
        factors_link, uncertainties_l = cyclic_boosting_base.gaussian_matching_by_quantiles(
            dist=posterior, link_func=self.link_func, perc1=perc1, perc2=perc2
        )

        return factors_link, uncertainties_l

    def predict_proba(self, X: Union[pd.DataFrame, np.ndarray], y: Optional[np.ndarray] = None) -> np.ndarray:
        """Return a two-column array built from the base-class prediction.

        With ``p`` denoting the result of the parent ``predict`` (presumably
        the probability of the positive class), column 0 holds ``1 - p`` and
        column 1 holds ``p``.
        """
        probability_signal = super().predict(X, y=y)
        return np.c_[1 - probability_signal, probability_signal]

    def predict(self, X: Union[pd.DataFrame, np.ndarray], y: Optional[np.ndarray] = None) -> np.ndarray:
        """Return hard class decisions as a ``float64`` array.

        An entry is 1.0 where the parent ``predict`` result is strictly
        greater than 0.5 and 0.0 otherwise.
        """
        probability_signal = super().predict(X, y=y)
        return np.asarray(probability_signal > 0.5, dtype=np.float64)




# Public API of this module: the names exported by a star-import.
__all__ = ["CBClassifier", "boost_weights", "get_beta_priors"]