Source code for cyclic_boosting.smoothing.meta_smoother

"""
Meta-smoothers that wrap other bin smoothers (normalization, regression-type cuts, sectioning).
"""
from __future__ import absolute_import, division, print_function

import logging
from enum import Enum

import numpy as np
from six.moves import range

from cyclic_boosting import utils
from cyclic_boosting.smoothing.base import AbstractBinSmoother, SetNBinsMixin

_logger = logging.getLogger(__name__)


class RegressionType(Enum):
    """Kind of regression a smoother is able to perform.

    The regression type decides which predictions are kept and which are
    replaced by nan:

    * ``discontinuous``: No interpolation between the values seen by the
      smoother is possible (e.g. the values are unordered labels). All
      predictions outside the bin boundaries of the fit, or in bins where
      no fit events have been seen, are set to nan.
    * ``interpolating``: Interpolation between the values seen by the
      smoother is possible, so only values above or below the bin
      boundaries are critical. All predictions outside the bin boundaries
      of the fit are set to nan.
    * ``extrapolating``: Extrapolation allows arbitrary values independent
      of the values seen in the fit, so no restrictions apply.
    """

    discontinuous = "discontinuous"
    interpolating = "interpolating"
    extrapolating = "extrapolating"
def check_reg_type(reg_type):
    """Validate a regression type and hand it back unchanged.

    Parameters
    ----------
    reg_type: :class:`RegressionType`
        Value to validate.

    Returns
    -------
    :class:`RegressionType`
        The very same ``reg_type`` if it equals one of the three members.

    Raises
    ------
    KeyError
        If ``reg_type`` equals none of the :class:`RegressionType` members.
    """
    allowed = (
        RegressionType.discontinuous,
        RegressionType.interpolating,
        RegressionType.extrapolating,
    )
    if any(reg_type == candidate for candidate in allowed):
        return reg_type
    raise KeyError(
        "Only the following regression types are allowed:"
        "{}, {}, {}".format(*allowed)
    )


def not_interpolating(reg_type):
    """Return True if the (validated) ``reg_type`` is ``discontinuous``."""
    validated = check_reg_type(reg_type)
    return validated == RegressionType.discontinuous


def not_extrapolating(reg_type):
    """Return True if the (validated) ``reg_type`` is not ``extrapolating``."""
    validated = check_reg_type(reg_type)
    return validated != RegressionType.extrapolating
class NormalizationSmoother(AbstractBinSmoother):
    """Meta-smoother that subtracts the weighted mean of the target before
    fitting its subestimator and adds it back when predicting.

    Parameters
    ----------
    smoother: :class:`AbstractBinSmoother`
        smoother used to fit and predict on the `normalized` data points.
    """

    def __init__(self, smoother):
        self.smoother = smoother

    def calc_norm(self, X_for_smoother, y):
        """Compute the weighted mean of the target ``y`` and store it in
        ``self.norm_`` so that the subestimator can be given a normalized
        distribution.

        The weights are read from the second-to-last column of
        ``X_for_smoother``. If the weight sum is not positive, or the
        resulting mean is not finite, the norm falls back to zero.

        Parameters
        ----------
        X_for_smoother: :class:`numpy.ndarray`
            Multidimensional array with at least three columns. For a
            k-dimensional feature the first k columns contain the x-values
            for the smoothing. The ``k + 1`` column are the weights of
            these x-values, while the ``k + 2`` column contains the
            uncertainties.

        y: :class:`numpy.ndarray`
            Array that contains the original target values.
        """
        weights = X_for_smoother[:, -2]
        total_weight = np.sum(weights)
        # Only divide when there is positive total weight.
        self.norm_ = np.sum(y * weights) / total_weight if total_weight > 0 else 0

        if np.isfinite(self.norm_):
            return

        _logger.info(
            "The norm in smoother {0} is not finite: "
            "norm= {1}; weights= {2}; target= {3}".format(self.__class__.__name__, self.norm_, X_for_smoother, y)
        )
        self.norm_ = 0.0

    def fit(self, X_for_smoother, y):
        """Fit the subestimator on the target shifted by ``self.norm_``."""
        self.calc_norm(X_for_smoother, y)
        shifted_y = y - self.norm_
        self.smoother.fit(X_for_smoother, shifted_y)

    def predict(self, X_for_smoother):
        """Return the subestimator's prediction shifted back by ``self.norm_``."""
        return self.smoother.predict(X_for_smoother) + self.norm_
def _selected_events_interpolating(X_for_smoother, ndim, nbins): selected_events = np.min(X_for_smoother[:, :ndim], axis=1) < 0.0 for dim in range(ndim): selected_events |= X_for_smoother[:, dim] > (nbins[dim] - 0.5) return selected_events
class RegressionTypeSmoother(AbstractBinSmoother, SetNBinsMixin):
    """Meta-smoother that restricts the output of the subsmoother's
    ``predict`` method according to a
    :class:`~cyclic_boosting.smoothing.RegressionType`.

    Parameters
    ----------
    smoother: :class:`AbstractBinSmoother`
        smoother used to fit and predict on the data points.

    reg_type: :class:`RegressionType`
        defines the regression type that is used to constrain the values.

    Regression Types
    ----------------
    * discontinuous: Set all values in predict to nan that are out of the
      bin boundaries in the fit or where no fit events have been seen.
    * interpolating: Set all values in predict to nan that are out of the
      bin boundaries in the fit.
    * extrapolating: No restrictions for values in predict.
    """

    def __init__(self, smoother, reg_type):
        self.smoother = smoother
        self.reg_type = check_reg_type(reg_type)

    def apply_cut(self, X_for_smoother, smoothed_y):
        """Set the entries of ``smoothed_y`` to nan that are not allowed by
        the configured :class:`RegressionType`.

        For ``extrapolating`` the input is returned untouched. Otherwise the
        values are viewed as a float64 array (``np.asarray`` does not copy
        an array that already is float64, so the nan values are then written
        into the passed array) and the excluded rows are overwritten.

        Parameters
        ----------
        X_for_smoother: :class:`numpy.ndarray`
            Multidimensional array with at least three columns. For a
            k-dimensional feature the first k columns contain the x-values
            for the smoothing. The ``k + 1`` column are the weights of
            these x-values, while the ``k + 2`` column contains the
            uncertainties.

        smoothed_y: :class:`numpy.ndarray`
            Array that contains the result of the subsmoother.

        Raises
        ------
        KeyError
            If ``self.reg_type`` is not a valid regression type.
        """
        reg_type = check_reg_type(self.reg_type)

        if reg_type == RegressionType.extrapolating:
            return smoothed_y

        # ndim_, bin_weights_ and n_bins_ are presumably set by set_n_bins
        # during fit -- verify against SetNBinsMixin.
        if reg_type == RegressionType.discontinuous:
            rows_to_blank = utils.not_seen_events(X_for_smoother[:, : self.ndim_], self.bin_weights_, self.n_bins_)
        else:
            rows_to_blank = _selected_events_interpolating(X_for_smoother, self.ndim_, self.n_bins_)

        constrained_y = np.asarray(smoothed_y, dtype=np.float64)
        constrained_y[rows_to_blank] = np.nan
        return constrained_y

    def fit(self, X_for_smoother, y):
        """Record the bin layout via ``set_n_bins`` and fit the subsmoother."""
        self.set_n_bins(X_for_smoother)
        self.smoother.fit(X_for_smoother, y)

    def predict(self, X_for_smoother):
        """Predict with the subsmoother and apply the regression-type cut."""
        raw_prediction = self.smoother.predict(X_for_smoother)
        return self.apply_cut(X_for_smoother, raw_prediction)
class NormalizationRegressionTypeSmoother(NormalizationSmoother, RegressionTypeSmoother):
    """Meta-smoother that normalizes the target for its subsmoother and
    constrains the predicted values according to their
    :class:`~cyclic_boosting.smoothing.RegressionType`.

    Parameters
    ----------
    smoother: :class:`AbstractBinSmoother`
        smoother used to fit and predict on the `normalized` data points.

    reg_type: :class:`RegressionType`
        defines the regression type that is used to constrain the values.

    Regression Types
    ----------------
    * discontinuous: Set all values in predict to nan that are out of the
      bin boundaries in the fit or where no fit events have been seen.
    * interpolating: Set all values in predict to nan that are out of the
      bin boundaries in the fit.
    * extrapolating: No restrictions for values in predict.
    """

    def __init__(self, smoother, reg_type):
        self.smoother = smoother
        self.reg_type = check_reg_type(reg_type)

    def fit(self, X_for_smoother, y):
        """Record the bin layout, then run the normalizing fit
        (``calc_norm`` followed by fitting the subsmoother on
        ``y - self.norm_``)."""
        self.set_n_bins(X_for_smoother)
        NormalizationSmoother.fit(self, X_for_smoother, y)

    def predict(self, X_for_smoother):
        """Return the de-normalized prediction with the regression-type cut
        applied."""
        denormalized_y = NormalizationSmoother.predict(self, X_for_smoother)
        return self.apply_cut(X_for_smoother, denormalized_y)
class SectionSmoother(AbstractBinSmoother):
    r"""Meta-smoother that splits the fitted data into two parts which are
    fitted by different subsmoothers.

    Parameters
    ----------
    split_point : float
        value of x to split the data
        :math:`x_{below} <= C_{split} < x_{above}`

    smoother_lower: :class:`AbstractBinSmoother`
        smoother used to fit and predict on the data points below and
        including the split_point

    smoother_upper: :class:`AbstractBinSmoother`
        smoother used to fit and predict on the data points above the
        split_point

    nan_representation: float
        optional argument to define ``not a number values``
        (default = ``np.nan``). Predictions for rows whose section has no
        fitted subsmoother are filled with this value.

    epsilon: float
        Floating point accuracy when comparing with the ``split_point``.
        (E.g. :math:`x_{below} <= C_{split} + \epsilon`)
    """

    def __init__(
        self,
        split_point,
        smoother_lower,
        smoother_upper,
        nan_representation=np.nan,
        epsilon=0.001,
    ):
        self.split_point = split_point
        self.smoother_lower = smoother_lower
        self.smoother_upper = smoother_upper
        # Store the value passed by the caller; it used to be silently
        # replaced by np.nan, which made the parameter ineffective.
        self.nan_representation = nan_representation
        self.epsilon = epsilon
        self._reset_smoother_status()

    def _reset_smoother_status(self):
        """Mark both subsmoothers as not fitted."""
        self.smoother_lower_fitted = False
        self.smoother_upper_fitted = False

    def _split_condition(self, X_for_smoother):
        """Return a boolean mask that is True for rows belonging to the
        lower section, i.e. whose first column is at most
        ``split_point + epsilon``."""
        return X_for_smoother[:, 0] <= self.split_point + self.epsilon

    def fit(self, X_for_smoother, y):
        """Fit each subsmoother on its section of the data.

        A subsmoother is only fitted (and flagged as fitted) when at least
        one row falls into its section.

        Raises
        ------
        AssertionError
            If no row falls into either section (e.g. empty input).
        """
        self._reset_smoother_status()
        in_lower = self._split_condition(X_for_smoother)
        in_upper = ~in_lower

        if np.any(in_lower):
            self.smoother_lower.fit(X_for_smoother[in_lower], y[in_lower])
            self.smoother_lower_fitted = True
        if np.any(in_upper):
            self.smoother_upper.fit(X_for_smoother[in_upper], y[in_upper])
            self.smoother_upper_fitted = True

        # Kept as an assert so the exception type seen by callers is unchanged.
        assert self.smoother_upper_fitted or self.smoother_lower_fitted

    def predict(self, X_for_smoother):
        """Predict each row with the subsmoother of its section.

        Rows whose section has no fitted subsmoother keep the value
        ``nan_representation``.

        Raises
        ------
        ValueError
            If neither subsmoother has been fitted.
        """
        if not self.smoother_upper_fitted and not self.smoother_lower_fitted:
            raise ValueError("The {} has not been fitted!".format(self.__class__.__name__))

        in_lower = self._split_condition(X_for_smoother)
        in_upper = ~in_lower

        # Start from the placeholder and overwrite the sections we can predict.
        pred = np.full(len(X_for_smoother), self.nan_representation, dtype=np.float64)
        if self.smoother_lower_fitted and np.any(in_lower):
            pred[in_lower] = self.smoother_lower.predict(X_for_smoother[in_lower])
        if self.smoother_upper_fitted and np.any(in_upper):
            pred[in_upper] = self.smoother_upper.predict(X_for_smoother[in_upper])
        return pred
# Public names exported by a star-import of this module.
# NOTE(review): ``RegressionType`` is documented above but not listed here --
# confirm whether that omission is intended.
__all__ = [ "NormalizationSmoother", "RegressionTypeSmoother", "NormalizationRegressionTypeSmoother", "SectionSmoother", ]