# Source code for cyclic_boosting.location

"""
Cyclic Boosting Regression for a location parameter target.
"""

from __future__ import absolute_import, division, print_function

import logging

import numexpr
import numpy as np
import sklearn.base

from cyclic_boosting.base import CyclicBoostingBase, calc_factors_generic
from cyclic_boosting.link import IdentityLinkMixin
from cyclic_boosting.utils import weighted_stddev

# Module-level logger, named after this module per standard convention.
_logger = logging.getLogger(__name__)


class CBLocPoissonRegressor(CyclicBoostingBase, sklearn.base.RegressorMixin, IdentityLinkMixin):
    """Cyclic Boosting regressor for a Poisson-like location target.

    Works in identity link space: per-bin additive corrections (summands)
    are estimated from inverse-variance-weighted residuals, using the
    prediction itself as the Poisson variance estimate, and are
    regularized towards a prior mean of 0. Predictions are clipped to be
    non-negative.
    """

    def precalc_parameters(self, feature, y, pred):
        """Compute per-bin statistics that do not depend on intermediate
        predictions.

        Parameters
        ----------
        feature: :class:`~.Feature`
            feature providing ``lex_binned_data`` (bin number of each
            sample) and ``n_bins``
        y: np.ndarray
            target, truth
        pred
            current prediction object (unused here)

        Returns
        -------
        tuple
            ``(y_sum, bincount, global_std)``: weighted target sum per bin,
            weight sum per bin, and weighted standard deviation of ``y``.
        """
        lex_binnumbers = feature.lex_binned_data
        minlength = feature.n_bins
        weights = self.weights
        global_std = weighted_stddev(y, weights)
        y_sum = np.bincount(lex_binnumbers, weights=y * weights, minlength=minlength)
        bincount = np.bincount(lex_binnumbers, weights=weights, minlength=minlength)
        return y_sum, bincount, global_std

    def _check_y(self, y):
        """Check that all target values are finite (no NaN or infinity)."""
        if not np.isfinite(y).all():
            raise ValueError("The target y must be real value and not NAN.")

    def _regularize_summands(self, bincount, summands, uncertainties, global_std):
        r"""Regularize the summands towards the prior mean (0), depending
        on the measured statistics of each bin:

        .. math::
            \mu_n = \sigma_n^2 \left( \frac{\mu_0}{\sigma_0^2}
                    + \frac{n \bar{x}}{\sigma_x^2} \right)

        where the prior mean :math:`\mu_0` is 0 and the prior variance
        :math:`\sigma_0^2` is the global (weighted) variance of the target.
        """
        prior_mean = 0.0
        reg_mean = (
            1.0
            / (bincount / uncertainties**2 + 1.0 / global_std**2)
            * (bincount * summands / uncertainties**2 + prior_mean / global_std**2)
        )
        return reg_mean

    def calc_parameters(self, feature, y, pred, prefit_data=None):
        """Calculate per-bin summands and uncertainties from the current
        residuals.

        Parameters
        ----------
        feature: :class:`~.Feature`
            feature providing ``lex_binned_data`` and ``n_bins``
        y: np.ndarray
            target, truth
        pred
            prediction object; ``pred.predict_link()`` is the prediction
            of all *other* features in link space
        prefit_data: tuple
            ``(y_sum, bincount, global_std)`` as returned by
            :meth:`precalc_parameters`; must not be ``None`` despite the
            default.

        Returns
        -------
        tuple
            regularized ``summands`` and ``uncertainties`` per bin.
        """
        lex_binnumbers = feature.lex_binned_data
        minlength = feature.n_bins
        prediction = self.unlink_func(pred.predict_link())
        # Clip predictions to be non-negative (Poisson mean cannot be < 0).
        prediction = np.where(prediction > 0, prediction, 0)
        y_sum, bincount, global_std = prefit_data
        weights = self.weights
        # Poisson variance estimate: var = prediction, floored at 1 for
        # non-positive predictions to avoid division by zero.
        variance = np.where(prediction <= 0.0, 1.0, prediction)
        factor_numerator = np.bincount(
            lex_binnumbers,
            weights=weights * (y - prediction) / variance,
            minlength=minlength,
        )
        denominator = np.bincount(lex_binnumbers, weights=weights / variance, minlength=minlength)
        uncertainty_numerator = np.bincount(lex_binnumbers, weights=weights**2 / variance, minlength=minlength)
        # Guard empty bins against division by zero.
        denominator = np.where(denominator > 0, denominator, 1)
        summands = factor_numerator / denominator
        uncertainties = uncertainty_numerator / denominator
        # Fall back to the global standard deviation for bins without data.
        uncertainties = np.where(uncertainties > 0, uncertainties, global_std)
        summands = self._regularize_summands(bincount, summands, uncertainties, global_std)
        return summands, uncertainties

    def predict(self, X, y=None):
        """Predict and clip negative values to zero (Poisson-like target)."""
        result = super(CBLocPoissonRegressor, self).predict(X, y=y)
        return np.where(result > 0, result, 0)
def precalc_variance_y(feature, y, weights, n_prior=1):
    """Calculate the expected value of the posterior of the variance
    parameter for a gaussian with a gamma distributed prior
    ``Gamma(a_0, b_0)`` and known mean, for each bin.

    Reference: Bishop: Pattern Recognition and Machine Learning, page 100

    The prior variance ``b_0 / a_0`` is set to the global variance of y.

    Parameters
    ----------
    feature: :class:`~.Feature`
        feature providing ``lex_binned_data`` (the bin number of each
        sample) and ``n_bins`` (number of bins including the `nan` bin)
    y: :class:`numpy.ndarray` (float64, ndims=1)
        target, truth
    weights: :class:`numpy.ndarray` (float64, ndims=1)
        sample weights
    n_prior:
        'effective' prior observations, see Bishop page 101 for a
        discussion

    Returns
    -------
    ndarray
        The estimated variance of y for each bin
    """
    lex_binnumbers = feature.lex_binned_data
    minlength = feature.n_bins
    weighted_mean_y = np.sum(y * weights) / np.sum(weights)
    variance_prior = np.sum((y - weighted_mean_y) * (y - weighted_mean_y) * weights) / np.sum(weights)
    if variance_prior <= 1e-9:  # No variation in y; happens only in tests
        variance_prior = 1.0
    sum_y = np.bincount(lex_binnumbers, weights=weights * y, minlength=minlength)
    sum_weights = np.bincount(lex_binnumbers, weights=weights, minlength=minlength)
    # Per-sample mean of the bin each sample belongs to (occupied bins
    # only, so the weight sums indexed here are strictly positive).
    mean_y = sum_y[lex_binnumbers] / sum_weights[lex_binnumbers]
    weighted_squared_residual_sum = np.bincount(
        lex_binnumbers, weights=weights * (y - mean_y) ** 2, minlength=minlength
    )
    # Gamma posterior update (Bishop eq. 2.150/2.151 style).
    a_0 = 0.5 * n_prior
    b_0 = a_0 * variance_prior
    a = a_0 + 0.5 * sum_weights
    b = b_0 + 0.5 * weighted_squared_residual_sum
    return b / a


def calc_parameters_intercept(lex_binnumbers, prediction, minlength, y, variance_y, weights):
    """Calculates intercepts and uncertainties for each bin of a feature
    group.

    Parameters
    ----------
    lex_binnumbers: :class:`numpy.ndarray` (float64, ndims=1)
        1-dimensional numpy array containing the bin numbers.
    prediction: :class:`numpy.ndarray` (float64, ndims=1)
        prediction of all *other* features.
    minlength: int
        number of bins for this feature including the `nan` bin
    y: :class:`numpy.ndarray` (float64, ndims=1)
        target, truth
    variance_y: :class:`numpy.ndarray` (float64, ndims=1)
        Variance estimate for each target value
    weights: :class:`numpy.ndarray` (float64, ndims=1)
        sample weights

    Returns
    -------
    tuple
        ``intercepts`` and ``uncertainties``.
    """
    # Inverse-variance weighting of the residuals in each bin.
    w = weights / variance_y
    w_x = w * (y - prediction)
    w_x2 = w * (y - prediction) ** 2
    # Prior: intercept 0 with a small effective weight (weak regularization).
    x0 = 0
    w0 = 1e-2
    return calc_factors_generic(lex_binnumbers, w_x, w, w_x2, weights, minlength, x0, w0)
class CBLocationRegressor(sklearn.base.RegressorMixin, CyclicBoostingBase, IdentityLinkMixin):
    """Cyclic Boosting regressor for a location (mean) target in identity
    link space, estimating per-bin additive intercepts with a Gaussian
    per-bin variance model (see :func:`precalc_variance_y`)."""

    def _check_y(self, y):
        """Check that all target values are finite (no NaN or infinity)."""
        if not np.isfinite(y).all():
            raise ValueError("The target y must be real value and not NAN.")

    def precalc_parameters(self, feature, y, pred):
        """Calculations that are not dependent on intermediate predictions. If
        these are not needed, return :obj:`None` in the subclass.

        Results returned by this method will be served to
        :meth:`factors_and_uncertainties` as the ``prefit_data`` argument.

        Parameters
        ----------
        feature: :class:`~.Feature`
            class containing all features
        y: np.ndarray
            target, truth
        pred
            current prediction object (unused here)

        Returns
        -------
        ndarray
            estimated variance of y per bin (see :func:`precalc_variance_y`).
        """
        return precalc_variance_y(feature, y, self.weights)

    def calc_parameters(self, feature, y, pred, prefit_data):
        """Calculates factors and uncertainties of the bins of a feature
        group in the original space (not the link space) and transforms them
        to the link space afterwards.

        The factors and uncertainties cannot be determined in link space, not
        least because target values like 0 diverge in link spaces like `log`
        or `logit`.

        Parameters
        ----------
        feature: :class:`~.Feature`
            class containing all features
        y: np.ndarray
            target, truth
        pred
            prediction object; ``pred.predict_link()`` is the prediction in
            link space of all *other* features
        prefit_data
            data returned by :meth:`~.precalc_parameters` during fit

        Returns
        -------
        tuple
            This method must return a tuple of ``factors`` and
            ``uncertainties`` in the **link space**.
        """
        lex_binnumbers = feature.lex_binned_data
        minlength = feature.n_bins
        # Broadcast the per-bin variance estimate back to per-sample values.
        variance_y = prefit_data[lex_binnumbers]
        return calc_parameters_intercept(lex_binnumbers, pred.predict_link(), minlength, y, variance_y, self.weights)

    def calibrate_to_weighted_mean(self, feature):
        """Shift the factors of ``feature`` so that their weighted mean is
        zero (identity link: the neutral element is additive zero).

        If the missing-value bin was not learned, it is excluded from the
        weighted mean and reset to the neutral factor afterwards.
        """
        if feature.missing_not_learned:
            calibrated_factors_link = (
                feature.factors_link[:-1]
                - (feature.factors_link[:-1] * feature.bin_weightsums[:-1]).sum()
                / feature.bin_weightsums[:-1].sum()
            )
            # Re-append the neutral factor for the unlearned `nan` bin.
            calibrated_factors_link = np.append(calibrated_factors_link, self.neutral_factor_link)
        else:
            calibrated_factors_link = (
                feature.factors_link
                - (feature.factors_link * feature.bin_weightsums).sum() / feature.bin_weightsums.sum()
            )
        return calibrated_factors_link
# Explicit public API of this module.
__all__ = ["CBLocationRegressor", "CBLocPoissonRegressor"]