"""
Cyclic Boosting Regression for a location parameter target.
"""
from __future__ import absolute_import, division, print_function
import logging
import numexpr
import numpy as np
import sklearn.base
from cyclic_boosting.base import CyclicBoostingBase, calc_factors_generic
from cyclic_boosting.link import IdentityLinkMixin
from cyclic_boosting.utils import weighted_stddev
_logger = logging.getLogger(__name__)
class CBLocPoissonRegressor(CyclicBoostingBase, sklearn.base.RegressorMixin, IdentityLinkMixin):
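"""Cyclic Boosting regressor for a location parameter target. Per-bin
corrections are learned as additive summands (identity link); residuals are
weighted with a Poisson-like variance taken equal to the current prediction,
and predictions are clipped to be non-negative.
"""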
def precalc_parameters(self, feature, y, pred):
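"""Compute per-bin prefit data: the weighted sum of the target per bin, the
sum of weights per bin, and the global weighted standard deviation of the
target."""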
lex_binnumbers = feature.lex_binned_data
minlength = feature.n_bins
weights = self.weights
global_std = weighted_stddev(y, weights)
y_sum = np.bincount(lex_binnumbers, weights=y * weights, minlength=minlength)
bincount = np.bincount(lex_binnumbers, weights=weights, minlength=minlength)
return y_sum, bincount, global_std
def _check_y(self, y):
"""Check that y has no negative values."""
if not np.isfinite(y).all():
raise ValueError("The target y must be real value and not NAN.")
def _regularize_summands(self, bincount, summands, uncertainties, global_std):
r"""Regularize the summands to the prior mean. Depenending on the
measured statistics of the bin.
:math:`\mu_n = \sigma_{n}^{2} \left(\frac{\mu_0}{sigma_{0}^{2}} +
\frac{n \bar{x}}{\sigma_{x}^{2}}`
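
A minimal standalone sketch of the same shrinkage with plain NumPy
(illustrative values only, not part of the class API):

>>> import numpy as np
>>> bincount = np.array([2.0, 200.0])  # sparsely vs. well populated bin
>>> summands = np.array([1.0, 1.0])    # measured bin summands
>>> uncertainties = np.array([1.0, 1.0])
>>> global_std, prior_mean = 1.0, 0.0
>>> sigma_n2 = 1.0 / (bincount / uncertainties**2 + 1.0 / global_std**2)
>>> mu_n = sigma_n2 * (bincount * summands / uncertainties**2 + prior_mean / global_std**2)
>>> np.round(mu_n, 3)  # the sparse bin is pulled toward the prior mean 0
array([0.667, 0.995])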
"""
prior_mean = 0.0
reg_mean = (
1.0
/ (bincount / uncertainties**2 + 1.0 / global_std**2)
* (bincount * summands / uncertainties**2 + prior_mean / global_std**2)
)
return reg_mean
def calc_parameters(self, feature, y, pred, prefit_data=None):
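"""Compute additive per-bin corrections (summands) and their uncertainties:
the residuals ``y - prediction`` are aggregated per bin with a Poisson-like
variance taken equal to the current prediction, and the resulting summands
are regularized toward the prior mean via :meth:`_regularize_summands`."""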
lex_binnumbers = feature.lex_binned_data
minlength = feature.n_bins
prediction = self.unlink_func(pred.predict_link())
prediction = np.where(prediction > 0, prediction, 0)
y_sum, bincount, global_std = prefit_data
weights = self.weights
variance = numexpr.evaluate("where(prediction <= 0., 1, prediction)")
factor_numerator = np.bincount(
lex_binnumbers,
weights=weights * (y - prediction) / variance,
minlength=minlength,
)
denominator = np.bincount(lex_binnumbers, weights=weights / variance, minlength=minlength)
uncertainty_numerator = np.bincount(lex_binnumbers, weights=weights**2 / variance, minlength=minlength)
denominator = np.where(denominator > 0, denominator, 1)
summands = factor_numerator / denominator
uncertainties = uncertainty_numerator / denominator
uncertainties = np.where(uncertainties > 0, uncertainties, global_std)
summands = self._regularize_summands(bincount, summands, uncertainties, global_std)
return summands, uncertainties
def predict(self, X, y=None):
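"""Predict with the fitted model and clip negative results to zero."""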
result = super(CBLocPoissonRegressor, self).predict(X, y=y)
return np.where(result > 0, result, 0)
def precalc_variance_y(feature, y, weights, n_prior=1):
"""Calculate expected value of posterior of variance parameter for a gaussian
with a gamma distributed prior Gamma(a_0, b_0) and known mean for each bin.
Reference: Bishop: Pattern Recognition and Machine Learning, page 100
The prior variance b_0 / a_0 is set to the global variance of y.
Parameters
----------
lex_binnumbers: :class:`numpy.ndarray` (float64, ndims=1)
1-dimensional numpy array containing the bin numbers.
y: :class:`numpy.ndarray` (float64, ndims=1)
target, truth
weights: :class:`numpy.ndarray` (float64, ndims=1)
sample weights
minlength: int
number of bins for this feature including the `nan` bin
n_prior: 'effective' prior observations,
see bishop page 101 for a discussion
Returns
-------
ndarray
The estimated variance of y for each bin
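
A minimal standalone sketch of the per-bin update with plain NumPy
(illustrative values, mirroring the body of this function for ``n_prior=1``):

>>> import numpy as np
>>> y = np.array([1.0, 3.0, 2.0, 2.0])
>>> weights = np.ones_like(y)
>>> binnumbers = np.array([0, 0, 1, 1])
>>> variance_prior = np.var(y)  # global variance serves as prior b_0 / a_0
>>> sum_w = np.bincount(binnumbers, weights=weights, minlength=2)
>>> mean_y = np.bincount(binnumbers, weights=weights * y, minlength=2) / sum_w
>>> ssr = np.bincount(binnumbers, weights=weights * (y - mean_y[binnumbers]) ** 2, minlength=2)
>>> a = 0.5 * 1 + 0.5 * sum_w
>>> b = 0.5 * 1 * variance_prior + 0.5 * ssr
>>> np.round(b / a, 3)
array([0.833, 0.167])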
"""
lex_binnumbers = feature.lex_binned_data
minlength = feature.n_bins
weighted_mean_y = np.sum(y * weights) / np.sum(weights)
variance_prior = np.sum((y - weighted_mean_y) * (y - weighted_mean_y) * weights) / np.sum(weights)
if variance_prior <= 1e-9: # No variation in y; happens only in tests
variance_prior = 1.0
sum_y = np.bincount(lex_binnumbers, weights=weights * y, minlength=minlength)
sum_weights = np.bincount(lex_binnumbers, weights=weights, minlength=minlength)
mean_y = sum_y[lex_binnumbers] / sum_weights[lex_binnumbers]
weighted_squared_residual_sum = np.bincount(
lex_binnumbers, weights=weights * (y - mean_y) ** 2, minlength=minlength
)
a_0 = 0.5 * n_prior
b_0 = a_0 * variance_prior
a = a_0 + 0.5 * sum_weights
b = b_0 + 0.5 * weighted_squared_residual_sum
return b / a
def calc_parameters_intercept(lex_binnumbers, prediction, minlength, y, variance_y, weights):
"""Calculates intercepts and uncertainties for each bin of a feature group.
Parameters
----------
lex_binnumbers: :class:`numpy.ndarray` (float64, ndims=1)
1-dimensional numpy array containing the bin numbers.
prediction: :class:`numpy.ndarray` (float64, ndims=1)
prediction of all *other* features.
minlength: int
number of bins for this feature including the `nan` bin
y: :class:`numpy.ndarray` (float64, ndims=1)
target, truth
variance_y: :class:`numpy.ndarray` (float64, ndims=1)
Variance estimate for each target value
weights: :class:`numpy.ndarray` (float64, ndims=1)
sample weights
Returns
-------
tuple
``intercepts`` and ``uncertainties``.
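
Illustrative call pattern (a sketch with made-up values;
``calc_factors_generic`` from :mod:`cyclic_boosting.base` performs the
actual per-bin aggregation):

>>> import numpy as np
>>> lex_binnumbers = np.array([0, 0, 1, 1])
>>> y = np.array([1.0, 2.0, 3.0, 4.0])
>>> prediction = np.full_like(y, 2.5)
>>> variance_y = np.ones_like(y)
>>> weights = np.ones_like(y)
>>> intercepts, uncertainties = calc_parameters_intercept(
...     lex_binnumbers, prediction, 2, y, variance_y, weights)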
"""
w = weights / variance_y
w_x = w * (y - prediction)
w_x2 = w * (y - prediction) ** 2
x0 = 0
w0 = 1e-2
return calc_factors_generic(lex_binnumbers, w_x, w, w_x2, weights, minlength, x0, w0)
class CBLocationRegressor(sklearn.base.RegressorMixin, CyclicBoostingBase, IdentityLinkMixin):
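"""Cyclic Boosting regressor for a location parameter target. Per-bin
corrections are learned as additive intercepts (identity link), weighted
with a per-bin Gaussian variance estimate of the target (see
:func:`precalc_variance_y`).
"""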
def _check_y(self, y):
"""Check that y has no negative values."""
if not np.isfinite(y).all():
raise ValueError("The target y must be real value and not NAN.")
def precalc_parameters(self, feature, y, pred):
"""Calculations that are not dependent on intermediate predictions. If
these are not needed, return :obj:`None` in the subclass.
Results returned by this method will be served to
:meth:`factors_and_uncertainties` as the ``prefit_data`` argument.
Parameters
----------
feature: :class:`~.Feature`
class containing all features
y: np.ndarray
target, truth
pred
prediction object; ``pred.predict_link()`` yields the prediction in link
space.
"""
return precalc_variance_y(feature, y, self.weights)
def calc_parameters(self, feature, y, pred, prefit_data):
"""Calculates factors and uncertainties of the bins of a feature group
in the original space (not the link space) and transforms them to the
link space afterwards.
The factors and uncertainties cannot be determined in link space, not
least because target values like 0 diverge in link spaces like `log`
or `logit`.
Parameters
----------
feature: :class:`~.Feature`
class containing all features
y: np.ndarray
target, truth
pred
prediction object of all *other* features; ``pred.predict_link()`` gives
the prediction in link space.
prefit_data
data returned by :meth:`~.precalc_parameters` during fit
Returns
-------
tuple
This method must return a tuple of ``factors`` and
``uncertainties`` in the **link space**.
"""
lex_binnumbers = feature.lex_binned_data
minlength = feature.n_bins
variance_y = prefit_data[lex_binnumbers]
return calc_parameters_intercept(lex_binnumbers, pred.predict_link(), minlength, y, variance_y, self.weights)
def calibrate_to_weighted_mean(self, feature):
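"""Center the factors in link space to a weighted mean of zero over the
bins, weighting each bin by its weight sum; if the missing-value bin was
not learned, it is excluded from the centering and set to the neutral
factor instead.

A standalone sketch of the centering with plain NumPy (illustrative values
only):

>>> import numpy as np
>>> factors_link = np.array([0.5, -0.5, 1.0])
>>> weightsums = np.array([1.0, 1.0, 2.0])
>>> centered = factors_link - (factors_link * weightsums).sum() / weightsums.sum()
>>> float(np.round((centered * weightsums).sum(), 12))  # weighted sum is now zero
0.0
"""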
if feature.missing_not_learned:
calibrated_factors_link = (
feature.factors_link[:-1]
- (feature.factors_link[:-1] * feature.bin_weightsums[:-1]).sum() / feature.bin_weightsums[:-1].sum()
)
calibrated_factors_link = np.append(calibrated_factors_link, self.neutral_factor_link)
else:
calibrated_factors_link = (
feature.factors_link
- (feature.factors_link * feature.bin_weightsums).sum() / feature.bin_weightsums.sum()
)
return calibrated_factors_link
__all__ = ["CBLocationRegressor", "CBLocPoissonRegressor"]