# Source code for cyclic_boosting.smoothing.multidim

"""
Multidimensional smoothers
"""
from __future__ import absolute_import, division, print_function

import numpy as np
import pandas as pd
from scipy import sparse

from cyclic_boosting import utils
from cyclic_boosting.smoothing.base import AbstractBinSmoother, SetNBinsMixin


class PredictingBinValueMixin(SetNBinsMixin):
    """Mixin for smoothers of multidimensional bins with :meth:`predict`
    returning the corresponding entry in the estimated parameter
    ``smoothed_y_`` which must have been calculated in :meth:`fit`.

    Please create the array ``smoothed_y_`` in your :meth:`fit` method.

    **Estimated parameters**

    :param `smoothed_y_`: the bin values for ``y`` (as received from the
        profile function in the fit) after some smoothing; a pseudobin for
        the missing values is not supported; the values are indexed in
        lexicographical ordering using the bin steps computed by
        :func:`cyclic_boosting.utils.bin_steps`
    :type `smoothed_y_`: :class:`numpy.ndarray` (float64, shape `(n_bins,)`)

    :param `n_bins_`: number of bins in each dimension; it is permitted to
        append additional entries to this array. They are ignored in
        :meth:`predict` anyway. Please use :meth:`set_n_bins` to initialize
        this estimated parameter in your implementation of :meth:`fit`.
    :type `n_bins_`: :class:`numpy.ndarray` (int64, shape `(n_dims + x,)`)

    For examples, see the subclass :class:`BinValuesSmoother`.
    """

    def predict(self, X):
        if not hasattr(self, "n_bins_"):
            raise ValueError(
                'Please call the method "fit" before "predict" and '
                '"set_n_bins" in your "fit" method'
            )
        if self._bin_steps is None:
            self.n_bins_ = self.n_bins_[: X.shape[1]]
            self._bin_steps = utils.bin_steps(self.n_bins_)
        binnos = X
        binnos_round = np.asarray(np.floor(X), dtype=int)
        # A row is valid only if all its coordinates are finite, non-negative
        # and within the binning range; invalid rows predict NaN.
        is_valid = np.all(
            np.isfinite(binnos) & (binnos >= 0) & (binnos_round < (self.n_bins_[None, :])),
            axis=1,
        )
        pred = utils.nans(len(binnos))
        # Map the multidimensional bin coordinates to the flat lexicographical
        # index of ``smoothed_y_`` via the precomputed bin steps.
        pred[is_valid] = self.smoothed_y_[np.dot(binnos_round[is_valid], self._bin_steps[1:])]
        return pred
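
# Illustrative sketch (not part of the original module): ``predict`` above
# maps multidimensional bin coordinates to a flat index in lexicographical
# order. The helper below reproduces that mapping with plain numpy and
# hand-rolled strides; it does not call :func:`cyclic_boosting.utils.bin_steps`,
# whose exact return format may differ.
def _example_lexicographic_flat_index():
    n_bins = np.array([2, 3])  # two dimensions with 2 and 3 bins
    # Strides for lexicographical ordering: the last dimension varies fastest.
    strides = np.array([np.prod(n_bins[1:]), 1])  # -> [3, 1]
    coords = np.array([[0, 0], [0, 2], [1, 1]])
    return coords.dot(strides)  # -> [0, 2, 4]
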

class BinValuesSmoother(AbstractBinSmoother, PredictingBinValueMixin):
    """Smoother of multidimensional bins that outputs the saved bin values of
    ``y`` (as received from the profile function in the fit) as the
    prediction.

    This smoother only considers the first ``n_dim`` columns of
    ``X_for_smoother`` passed by the **profile function**. These columns are
    supposed to contain all the coordinates of the ``n_dim``-dimensional bin
    centers.

    **Estimated parameters**

    :param `smoothed_y_`: the bin values for ``y`` (as received from the
        profile function in the fit), in this case without any smoothing; a
        pseudobin for the missing values is not supported
    :type `smoothed_y_`: :class:`numpy.ndarray` (float64, shape `(n_bins,)`)

    :param `n_bins_`: see :class:`PredictingBinValueMixin`

    >>> from cyclic_boosting import smoothing
    >>> X = np.c_[[0., 0, 1, 1],
    ...           [0, 1, 0, 1],
    ...           [1, 1, 1, 1]]  # ignored
    >>> y = np.array([90, 80, 50, 40])
    >>> reg = smoothing.multidim.BinValuesSmoother()
    >>> assert reg.fit(X, y) is reg
    >>> assert np.allclose(reg.smoothed_y_, y)
    >>> X = np.c_[[1.1, 0.4, 0.0, 0.1, 2.],
    ...           [1.2, 1.1, 0.4, 0.4, 0.]]
    >>> reg.predict(X)
    array([ 40.,  80.,  90.,  90.,  nan])
    """

    elems = ["smoothed_y_", "bin_weights_"]

    def fit(self, X_for_smoother, y):
        self.set_n_bins(X_for_smoother)
        self.smoothed_y_ = y
        return self

    def __getstate__(self):
        """Return state values to be pickled."""
        state = self.__dict__.copy()
        for elem in self.elems:
            if elem in state and state[elem] is not None:
                state[elem] = sparse.csr_matrix(state[elem])
        return state

    def __setstate__(self, state):
        """Restore state from the unpickled state values."""
        for elem in self.elems:
            if elem in state and state[elem] is not None:
                state[elem] = state[elem].toarray()[0, :]
        self.__dict__.update(state)
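
# Usage sketch (illustrative, not part of the original module): thanks to
# ``__getstate__``/``__setstate__`` above, a fitted BinValuesSmoother stores
# its bin values as sparse matrices while pickled and restores them as dense
# arrays. The data mirrors the class doctest; exact results depend on the
# installed cyclic_boosting version.
def _example_pickle_roundtrip():
    import pickle

    X = np.c_[[0.0, 0, 1, 1], [0, 1, 0, 1], [1, 1, 1, 1]]
    y = np.array([90.0, 80.0, 50.0, 40.0])
    reg = BinValuesSmoother().fit(X, y)
    restored = pickle.loads(pickle.dumps(reg))
    # The restored smoother predicts from the same stored bin values.
    return restored.predict(np.c_[[1.1, 0.4], [1.2, 1.1]])  # -> [40., 80.]
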

class RegularizeToPriorExpectationSmoother(AbstractBinSmoother, PredictingBinValueMixin):
    r"""Smoother of multidimensional bins regularizing values with
    uncertainties to a prior expectation.

    For details, see
    :func:`cyclic_boosting.utils.regularize_to_prior_expectation`.

    :param prior_expectation: The prior expectation dominates the regularized
        value if the uncertainties are large.
    :type prior_expectation: :class:`numpy.ndarray` (float64, dim=1) or float

    :param threshold: Threshold in terms of sigma. If the significance of a
        value:

        .. math::

            \text{sig}(x_i) = \frac{x_i - \text{prior\_expectation}_i}
            {\text{uncertainty}_i}

        is below the threshold, the prior expectation replaces the value.
    :type threshold: float

    **Required columns** in the ``X_for_smoother`` passed to :meth:`fit`:

    * columns 0 to ``n_dim - 1``: coordinates of the bin centers (ignored
      here)
    * column ``n_dim``: ignored
    * column ``n_dim + 1``, which must be the last: uncertainty of the
      average ``y`` in each bin

    **Doctests**

    >>> from cyclic_boosting import smoothing
    >>> y = np.array([0, 1, 2, 3])
    >>> X = np.c_[
    ...     [0, 0, 1, 1],
    ...     [0, 1, 0, 1],
    ...     [1]*4,
    ...     [0.1]*4]
    >>> est = smoothing.multidim.RegularizeToPriorExpectationSmoother(1.)
    >>> assert est.fit(X, y) is est
    >>> y_smoothed = est.predict(X[:, :2])
    >>> y_smoothed
    array([ 0.03175416,  1.        ,  1.96824584,  2.98431348])
    >>> np.allclose(1 - np.sqrt(((y[0] - 1) / 0.1)**2 - 2.5**2) * 0.1,
    ...             y_smoothed[0])
    True
    >>> np.allclose(1 + np.sqrt(((y[-1] - 1) / 0.1)**2 - 2.5**2) * 0.1,
    ...             y_smoothed[-1])
    True
    """

    def __init__(self, prior_expectation, threshold=2.5):
        self.prior_expectation = prior_expectation
        self.threshold = threshold

    def fit(self, X_for_smoother, y):
        self.set_n_bins(X_for_smoother)
        self.smoothed_y_ = utils.regularize_to_prior_expectation(
            y, X_for_smoother[:, -1], self.prior_expectation, threshold=self.threshold
        )
        return self
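
# Numerical sketch (illustrative, not part of the original module): the
# identities checked in the doctest above suggest that values whose
# significance exceeds the threshold are pulled towards the prior by
# ``prior + sign(sig) * sqrt(sig**2 - threshold**2) * uncertainty``. The
# snippet below only reproduces that arithmetic with plain numpy; the
# authoritative implementation is
# :func:`cyclic_boosting.utils.regularize_to_prior_expectation`.
def _example_regularization_arithmetic():
    prior, threshold, uncertainty = 1.0, 2.5, 0.1
    y = np.array([0.0, 3.0])
    sig = (y - prior) / uncertainty  # significances -10 and +20
    shrunk = prior + np.sign(sig) * np.sqrt(sig ** 2 - threshold ** 2) * uncertainty
    return shrunk  # approx. [0.03175416, 2.98431348], matching the doctest
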

class RegularizeToOneSmoother(RegularizeToPriorExpectationSmoother):
    """Smoother for multidimensional bins regularizing values with
    uncertainties to the prior expectation 1.

    For details, see the superclass
    :class:`RegularizeToPriorExpectationSmoother` and the underlying function
    :func:`cyclic_boosting.utils.regularize_to_prior_expectation`.

    :param threshold: Threshold in terms of sigma. If the significance of a
        factor is below the threshold, the global measurement replaces the
        factor. Internally,
        :func:`cyclic_boosting.utils.regularize_to_prior_expectation` is used.
    :type threshold: float
    """

    def __init__(self, threshold=2.5):
        RegularizeToPriorExpectationSmoother.__init__(self, prior_expectation=1, threshold=threshold)

class WeightedMeanSmoother(AbstractBinSmoother, PredictingBinValueMixin):
    r"""Smoother for multidimensional bins regularizing values with
    uncertainties to the weighted mean.

    For details, see
    :func:`cyclic_boosting.utils.regularize_to_error_weighted_mean`.

    :param prior_prediction: If the `prior_prediction` is specified, all
        values are regularized with it and not with the error-weighted mean.
    :type prior_prediction: float

    >>> from cyclic_boosting import smoothing
    >>> y_for_smoother = np.array([0.9, 0.9, 0.9, 1.8, 1.8, 0.4, 0.4])
    >>> X_for_smoother = np.c_[
    ...     [1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0],
    ...     [0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0],
    ...     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
    ...     [0.05, 0.05, 0.05, 0.05, 0.15, 0.15, 0.05]]
    >>> smoother = smoothing.multidim.WeightedMeanSmoother()
    >>> smoother.fit(X_for_smoother, y_for_smoother)
    >>> smoother.smoothed_y_
    array([ 0.90096366,  0.90096366,  0.90096366,  1.79077293,  1.723854  ,
            0.45467402,  0.40662518])
    """

    def __init__(self, prior_prediction=None):
        self.prior_prediction = prior_prediction

    def fit(self, X_for_smoother, y):
        self.set_n_bins(X_for_smoother)
        self.smoothed_y_ = utils.regularize_to_error_weighted_mean(y, X_for_smoother[:, -1], self.prior_prediction)
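
# Conceptual sketch (illustrative, not part of the original module): the
# error-weighted mean that the bin values are regularized towards can be
# written as an inverse-variance weighted average. This is only the textbook
# formula; the exact shrinkage applied by
# :func:`cyclic_boosting.utils.regularize_to_error_weighted_mean` may differ.
def _example_error_weighted_mean():
    y = np.array([0.9, 0.9, 0.9, 1.8, 1.8, 0.4, 0.4])
    sigma = np.array([0.05, 0.05, 0.05, 0.05, 0.15, 0.15, 0.05])
    weights = 1.0 / sigma ** 2  # inverse-variance weights
    return np.sum(weights * y) / np.sum(weights)
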

class PriorExpectationMetaSmoother(AbstractBinSmoother, PredictingBinValueMixin):
    r"""Meta-smoother that takes another multidimensional smoother whose
    results are additionally smoothed using
    :func:`cyclic_boosting.utils.regularize_to_prior_expectation`.

    :param prior_expectation: The prior expectation dominates the regularized
        value if the uncertainties are large.
    :type prior_expectation: :class:`numpy.ndarray` (float64, dim=1) or float

    :param threshold: Threshold in terms of sigma. If the significance of a
        value:

        .. math::

            \text{sig}(x_i) = \frac{x_i - \text{prior\_expectation}_i}
            {\text{uncertainty}_i}

        is below the threshold, the prior expectation replaces the value.
    :type threshold: float
    """

    def __init__(self, est, prior_expectation, threshold=2.5):
        self.est = est
        self.prior_expectation = prior_expectation
        self.threshold = threshold

    def fit(self, X_for_smoother, y):
        d = X_for_smoother.shape[1] - 2
        if d < 2:
            raise ValueError("You need at least 4 columns for multidim smoothing.")
        self.est.fit(X_for_smoother.copy(), y)
        self.set_n_bins(X_for_smoother)
        self.smoothed_y_ = utils.regularize_to_prior_expectation(
            self.est.predict(X_for_smoother[:, :d]),
            X_for_smoother[:, -1],
            self.prior_expectation,
            threshold=self.threshold,
        )
        return self


def _fit_est_on_group(X, n_group_columns, est):
    """Fit a clone of ``est`` on one group: the leading group columns are
    dropped and the last column holds the target values."""
    est = utils.clone(est)
    est.fit(X.iloc[:, n_group_columns:-1].values, X.iloc[:, -1].values)
    return est


def _predict_groups(x, gb, n_group_columns):
    """Predict with the estimator fitted for the group ``x.name``; groups
    unseen in the fit yield NaN."""
    try:
        est = gb.loc(axis=0)[x.name]
    except KeyError:
        p = np.nan
    else:
        p = est.predict(np.c_[x.values])
    return p

class GroupBySmoother(AbstractBinSmoother):
    """Multidimensional smoother that groups on the *first* k-1 columns of a
    k-dimensional feature and fits a clone of the specified one-dimensional
    smoother on each group.

    Parameters
    ----------
    est: :class:`AbstractBinSmoother`
        One-dimensional smoother whose clones are fitted on the grouped
        columns.
    n_dim: int
        Number of dimensions of the feature.
    index_weight_col: int
        Index of the weight column. If specified, rows with zero weight are
        removed. If `None`, no rows are dropped.
    """

    @property
    def n_group_columns(self):
        return self.n_dim - 1

    def __init__(self, est, n_dim, index_weight_col=None):
        self.est = est
        self.n_dim = n_dim
        self.index_weight_col = index_weight_col

    def fit(self, X_for_smoother, y):
        if self.index_weight_col is not None:
            # Drop rows belonging to groups whose summed weight is zero.
            Xp = pd.DataFrame(X_for_smoother)
            gb_cols = list(range(self.n_dim - 1))
            gb = Xp.groupby(gb_cols)[self.n_dim].sum()
            Xp = Xp[gb_cols].merge(gb.reset_index(), how="left", on=gb_cols)
            mask = Xp[self.n_dim].values > 0
            X_for_smoother = X_for_smoother[mask]
            y = y[mask]
        # Append y as the last column and fit one cloned smoother per group.
        X = pd.DataFrame(np.c_[X_for_smoother, y])
        self.group_cols = list(range(self.n_group_columns))
        self.gb = X.groupby(self.group_cols, sort=False).apply(_fit_est_on_group, self.n_group_columns, self.est)

    def predict(self, X):
        X = pd.DataFrame(X)
        pred = X.groupby(self.group_cols, sort=False)[X.columns[-1]].transform(
            _predict_groups, self.gb, self.n_group_columns
        )
        return pred.values
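
# Pattern sketch (illustrative, not part of the original module): the
# group-and-fit mechanics used by ``fit``/``predict`` above, demonstrated with
# plain pandas and a per-group mean in place of a real one-dimensional
# smoother. The actual class clones ``est`` via ``utils.clone`` and calls its
# ``fit``/``predict`` once per group.
def _example_groupby_pattern():
    # Columns: one group column, one "smoother input" column, then y.
    X = pd.DataFrame(np.c_[[0, 0, 1, 1], [0.0, 1.0, 0.0, 1.0], [90.0, 80.0, 50.0, 40.0]])
    # Fit step: one "model" (here simply the mean of y) per group.
    models = X.groupby(0, sort=False)[2].mean()
    # Predict step: look up the model of each row's group.
    return X[0].map(models).values  # -> [85., 85., 45., 45.]
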

class GroupBySmootherCB(GroupBySmoother):
    """GroupBySmoother for cyclic boosting.

    Samples with zero weights are dropped to save memory.
    """

    def __init__(self, est, n_dim):
        GroupBySmoother.__init__(self, est, n_dim, -2)

class Neutralize2DMetaSmoother(AbstractBinSmoother, PredictingBinValueMixin):
    """Meta-smoother that takes another multidimensional smoother whose
    inputs are smoothed by
    :func:`cyclic_boosting.utils.neutralize_one_dim_influence`.

    This means the influence of one-dimensional features on two-dimensional
    features is removed by iteratively projecting the two-dimensional factor
    matrix onto one dimension and subtracting this projection, weighted by
    the uncertainties, from the matrix. In a cyclic boosting model this
    prevents two-dimensional features from including the effect that should
    be learned by the one-dimensional features.
    """

    def __init__(self, est):
        self.est = est

    def fit(self, X_for_smoother, y):
        """Fit the transformer to training samples."""
        d = X_for_smoother.shape[1] - 2
        if d < 2:
            raise ValueError("You need at least 4 columns for multidim smoothing.")

        # We get the y values as a flat 1d array, hence we have to reshape
        # them into the correct 2d array before neutralizing.
        new_shape = np.max(X_for_smoother[:, :2], axis=0).astype(np.int64) + 1
        values = np.reshape(y, new_shape)
        uncertainties = np.reshape(X_for_smoother[:, -1], new_shape)
        neutralized_values = utils.neutralize_one_dim_influence(values, uncertainties)
        y = np.reshape(neutralized_values, np.prod(new_shape))

        self.est.fit(X_for_smoother.copy(), y)
        self.set_n_bins(X_for_smoother)
        self.smoothed_y_ = self.est.predict(X_for_smoother[:, :d])
        return self
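
# Shape sketch (illustrative, not part of the original module): ``fit`` above
# reshapes the flat, lexicographically ordered ``y`` into the two-dimensional
# factor matrix expected by ``utils.neutralize_one_dim_influence``. The helper
# below only demonstrates that reshape with plain numpy.
def _example_reshape_to_matrix():
    # Bin coordinates of a 2 x 3 grid in lexicographical order ...
    coords = np.array([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]], dtype=float)
    y = np.arange(6, dtype=float)
    new_shape = np.max(coords, axis=0).astype(np.int64) + 1  # -> [2, 3]
    # ... so a plain reshape recovers the matrix of bin values per bin pair.
    return np.reshape(y, new_shape)  # [[0., 1., 2.], [3., 4., 5.]]
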

__all__ = [
    "PredictingBinValueMixin",
    "BinValuesSmoother",
    "RegularizeToPriorExpectationSmoother",
    "RegularizeToOneSmoother",
    "WeightedMeanSmoother",
    "Neutralize2DMetaSmoother",
    "GroupBySmoother",
    "GroupBySmootherCB",
]