# Source code for cyclic_boosting.GBSregression

"""
Cyclic Boosting Regression for Generalized Background Subtraction regression.
"""

from __future__ import absolute_import, division, print_function

import logging

import numpy as np
from sklearn.base import RegressorMixin
import pandas as pd

from cyclic_boosting.base import CyclicBoostingBase, Feature, CBLinkPredictionsFactors
from cyclic_boosting.link import IdentityLinkMixin
from typing import Tuple, Union

# Module-level logger, named after this module via ``__name__``.
_logger = logging.getLogger(__name__)



# [docs]
class CBGBSRegressor(RegressorMixin, CyclicBoostingBase, IdentityLinkMixin):
    r"""
    Variant form of Cyclic Boosting's location regressor, that corresponds to
    the regression of the outcome of a previous statistical subtraction of two
    classes of observations from each other (e.g. groups A and B: A - B).

    For this, the target y has to be set to positive values for group A and
    negative values for group B.

    Additional Parameter
    --------------------
    regalpha: float
        A hyperparameter to steer the strength of regularization, i.e. a
        shrinkage of the regression result for A - B to 0. A value of 0
        corresponds to no regularization.

    All other constructor arguments are passed through unchanged to
    ``CyclicBoostingBase.__init__``.
    """

    def __init__(
        self,
        feature_groups=None,
        hierarchical_feature_groups=None,
        feature_properties=None,
        weight_column=None,
        minimal_loss_change=1e-10,
        minimal_factor_change=1e-10,
        maximal_iterations=10,
        observers=None,
        smoother_choice=None,
        output_column=None,
        learn_rate=None,
        regalpha=0.0,
        aggregate=True,
    ):
        """
        Forward the generic settings to ``CyclicBoostingBase`` and store
        ``regalpha`` on the instance.
        """
        # The base initializer is called explicitly (not via ``super()``) so
        # that exactly ``CyclicBoostingBase.__init__`` runs, regardless of the
        # position of the mixins in the MRO.
        CyclicBoostingBase.__init__(
            self,
            feature_groups=feature_groups,
            hierarchical_feature_groups=hierarchical_feature_groups,
            feature_properties=feature_properties,
            weight_column=weight_column,
            minimal_loss_change=minimal_loss_change,
            minimal_factor_change=minimal_factor_change,
            maximal_iterations=maximal_iterations,
            observers=observers,
            smoother_choice=smoother_choice,
            output_column=output_column,
            learn_rate=learn_rate,
            aggregate=aggregate,
        )

        self.regalpha = regalpha

    def calc_parameters(
        self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors, prefit_data
    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Compute the per-bin summand and its uncertainty for one feature.

        Parameters
        ----------
        feature
            Feature whose ``lex_binned_data`` (bin index per sample) and
            ``n_bins`` (number of bins) are used for the aggregation.
        y
            Target values.
        pred
            Current predictions; ``pred.predict_link()`` is subtracted from
            ``y`` to obtain the residuals.
        prefit_data
            Not used by this implementation.

        Returns
        -------
        tuple of two arrays with one entry per bin
            The summand ``sum_n / sum_d`` and the square root of its
            estimated variance.
        """
        lex_binnumbers = feature.lex_binned_data
        minlength = feature.n_bins
        prediction = pred.predict_link()

        # Numerator: weighted residuals. Denominator: weights inflated by the
        # regularization strength, which shrinks the summand towards 0.
        n = (y - prediction) * self.weights
        d = self.weights * (1 + self.regalpha)

        sum_n, sum_d, sum_nd, sum_n2, sum_d2 = (
            np.bincount(lex_binnumbers, weights=w, minlength=minlength) for w in [n, d, n * d, n * n, d * d]
        )

        # Equivalent to adding one pseudo-observation with n = 0 and d = 1 to
        # every bin; this also keeps the denominator away from zero for empty
        # bins (as long as the weights are non-negative).
        sum_d += 1
        sum_d2 += 1

        summand = sum_n / sum_d
        variance_summand = (sum_d**2 * sum_n2 - 2.0 * sum_n * sum_d * sum_nd + sum_n**2 * sum_d2) / sum_d**4

        # The numerator above equals sum_i (sum_d * n_i - sum_n * d_i)**2 over
        # all (pseudo-)observations of a bin and is therefore non-negative in
        # exact arithmetic. Floating-point cancellation can still produce tiny
        # negative values, which would turn into NaN under the square root.
        variance_summand = np.maximum(variance_summand, 0.0)

        return summand, np.sqrt(variance_summand)

    def _check_y(self, y: np.ndarray):
        """
        Perform no validation of the target.

        NOTE(review): presumably overrides a base-class check because targets
        of either sign are expected here -- confirm against
        ``CyclicBoostingBase``.
        """
        pass

    def _init_global_scale(self, X: Union[pd.DataFrame, np.ndarray], y: np.ndarray) -> None:
        """
        Set ``global_scale_link_`` to the weighted mean of ``y``.

        ``X`` is not used. Raises ``RuntimeError`` if ``self.weights`` has not
        been set yet.
        """
        if self.weights is None:
            raise RuntimeError("The weights have to be initialized.")
        self.global_scale_link_ = (y * self.weights).sum() / self.weights.sum()

    def loss(self, prediction: np.ndarray, y: np.ndarray, weights: np.ndarray) -> float:
        """
        Return the weighted squared error, normalized by the summed weights of
        all samples with a non-zero target.

        NOTE: if no sample has a non-zero target (or their weights sum to 0),
        the normalization is 0 and the result is ``inf`` or ``nan``.
        """
        # Only samples with y != 0 contribute to the normalization.
        wvisitsum = ((y != 0).astype(int) * weights).sum()
        loss = (weights * (prediction - y) ** 2).sum() / wvisitsum
        return loss

    def precalc_parameters(self, feature: Feature, y: np.ndarray, pred: CBLinkPredictionsFactors) -> None:
        """
        Return ``None``: this estimator does no per-feature precalculation,
        and ``calc_parameters`` ignores its ``prefit_data`` argument.
        """
        return None




# Public API of this module: only the regressor class is exported.
__all__ = ["CBGBSRegressor"]