from __future__ import absolute_import, division, print_function
import copy
import numpy as np
from cyclic_boosting import utils
class BaseObserver(object):
    """
    Base class for observers.

    Observers are used to extract information from cyclic boosting estimators
    that might be of further interest, but are not needed by the estimator in
    order to make predictions.

    Both hooks defined here do nothing; subclasses override the ones they
    are interested in.
    """

    def observe_iterations(self, iteration, X, y, prediction, weights, estimator_state, delta=None, quantile=None):
        """
        Called after each iteration of the algorithm.

        Parameters
        ----------
        iteration : int
            number of the iteration
        X : :class:`pandas.DataFrame` or :class:`numpy.ndarray`
            feature matrix
        y : numpy.ndarray
            target array
        prediction : numpy.ndarray
            current target prediction
        weights : numpy.ndarray
            target weights
        estimator_state : dict
            state of the estimator. See
            :meth:`cyclic_boosting.base.CyclicBoostingBase.get_state` for
            information on what will be passed here
        delta : optional
            accepted so that the signature matches
            :meth:`PlottingObserver.observe_iterations`; ignored by this
            base implementation
        quantile : optional
            accepted so that the signature matches
            :meth:`PlottingObserver.observe_iterations`; ignored by this
            base implementation
        """

    def observe_feature_iterations(self, iteration, feature_i, X, y, prediction, weights, estimator_state):
        """
        Called after each feature has been processed as part of a full
        iteration of the algorithm.

        Parameters
        ----------
        iteration : int
            number of the iteration
        feature_i : int
            number of the feature that was processed
        X : :class:`pandas.DataFrame` or :class:`numpy.ndarray`
            feature matrix
        y : numpy.ndarray
            target array
        prediction : numpy.ndarray
            current target prediction
        weights : numpy.ndarray
            target weights
        estimator_state : dict
            state of the estimator. See
            :meth:`~cyclic_boosting.base.CyclicBoostingBase.get_state` for
            information on what will be passed here
        """
class PlottingObserver(BaseObserver):
    """
    Observer retrieving all information necessary to obtain analysis plots
    based on a cyclic boosting training.

    Instances of this class are intended to be passed as elements of the
    ``observers`` parameter to a cyclic boosting estimator, where each will
    gather information on a specific iteration. Afterwards, they can be passed
    to :func:`~cyclic_boosting.plots.plot_analysis`.

    Parameters
    ----------
    iteration : int
        The observer will save all necessary information for the analysis plots
        based on the state of the internal variables of the estimator
        after the given iteration has been calculated.
        Default is `-1`, which signifies the last iteration.

    Raises
    ------
    ValueError
        If ``iteration`` is ``0`` or smaller than ``-1``.
    """

    def __init__(self, iteration=-1):
        # -1 is the only meaningful negative value (sentinel for "last
        # iteration"); any other value below 1 could never match an observed
        # iteration, so it is rejected up front instead of failing later in
        # ``check_fitted``.
        if iteration == 0 or iteration < -1:
            raise ValueError("This plotting observer only makes sense with iterations >= 1.")
        self.iteration = iteration
        # Filled in by ``observe_iterations`` once the requested iteration is seen.
        self.features = None
        self.link_function = None
        self.n_feature_bins = None
        self.histograms = None
        # Appended to on every observed iteration that qualifies.
        self.loss = []
        self.factor_change = []
        self._fitted = False

    def observe_iterations(self, iteration, X, y, prediction, weights, estimator_state, delta=None, quantile=None):
        """Observe iterations in cyclic_boosting estimator to collect the
        information necessary for plots. This function is called in each major
        loop and once in the end.

        Parameters
        ----------
        iteration: int
            current major iteration of cyclic boosting loop
        X: pd.DataFrame or numpy.ndarray shape(n, k)
            feature matrix
        y: np.ndarray
            target array
        prediction: np.ndarray
            array of current cyclic boosting prediction
        weights: np.ndarray
            array of event weights
        estimator_state: dict
            state object of cyclic_boosting estimator; the keys
            ``"features"``, ``"insample_loss"`` and ``"link_function"`` are
            read here
        delta: optional
            value recorded in ``factor_change`` for every qualifying
            iteration other than iteration 0
        quantile: optional
            forwarded unchanged to :func:`calc_in_sample_histograms`
        """
        features = estimator_state["features"]

        # Record the loss history up to (and including) the requested
        # iteration, or for every call when the last iteration is requested.
        if (iteration <= self.iteration and iteration != -1) or self.iteration == -1:
            self.loss.append(estimator_state["insample_loss"])
            if iteration != 0:
                # for iteration 0 there are no old factors to compare with
                self.factor_change.append(delta)

        # Snapshot the estimator state exactly at the requested iteration.
        if iteration == self.iteration:
            self._fitted = True
            # Deep copy so later changes to the estimator's features do not
            # alter what was captured here.
            self.features = copy.deepcopy(features)
            self.n_feature_bins = {feature.feature_group: feature.n_multi_bins_finite for feature in self.features}
            self.link_function = estimator_state["link_function"]
            self.histograms = calc_in_sample_histograms(y, prediction, weights, quantile)

    def observe_feature_iterations(self, iteration, feature_i, X, y, prediction, weights, estimator_state):
        """Observe iterations in cyclic_boosting estimator to collect the
        information necessary for plots. This function is called in each
        feature/minor loop. This observer records nothing at that level.

        Parameters
        ----------
        iteration: int
            current major iteration number of cyclic boosting loop
        feature_i: int
            current minor iteration number of cyclic boosting loop
        X: pd.DataFrame or numpy.ndarray shape(n, k)
            feature matrix
        y: np.ndarray
            target array
        prediction: np.ndarray
            array of current cyclic boosting prediction
        weights: np.ndarray
            array of event weights
        estimator_state: dict
            state object of cyclic_boosting estimator
        """
        pass

    def check_fitted(self):
        """Raise if the requested iteration has not been observed yet.

        Raises
        ------
        ValueError
            If :meth:`observe_iterations` has not been called with the
            iteration this observer was configured for.
        """
        if not self._fitted:
            raise ValueError("Observer not filled.")
def calc_in_sample_histograms(y, pred, weights, quantile=None, nbins=100):
    """
    Calculates histograms for use with diagonal plot.

    The predictions are grouped into ``nbins`` linear bins and the truth is
    summarized per bin.

    Parameters
    ----------
    y : numpy.ndarray
        truth
    pred : numpy.ndarray
        prediction
    weights : numpy.ndarray
        array of event weights
    quantile : optional
        If not ``None``, the per-bin means are replaced by the result of
        ``utils.calc_weighted_quantile`` for this quantile and no errors are
        returned.
    nbins : int, optional
        Number of linear bins built over ``pred``. Default is ``100``.

    Returns
    -------
    result : tuple
        Tuple consisting of:

        * means
        * bin_centers
        * errors (``None`` if ``quantile`` is given or ``y`` only contains
          the values 0 and 1)
        * counts
    """
    bin_boundaries, bin_centers = utils.calc_linear_bins(pred, nbins)
    bin_numbers = utils.digitize(pred, bin_boundaries)
    means, _, counts, errors = utils.calc_means_medians(bin_numbers, y, weights)

    if quantile is not None:
        means = utils.calc_weighted_quantile(bin_numbers, y, weights, quantile)
        errors = None

    # Align the means on the bin labels 1..nbins and keep only the centers of
    # bins whose mean is not NaN.
    bin_centers = bin_centers[np.where(~np.isnan(means.reindex(np.arange(1, nbins + 1))))]

    # A target consisting only of 0/1 is treated as classification mode, for
    # which no errors are reported (original note: quantiles do not work for
    # classification mode).
    if np.isin(y, [0, 1]).all():
        errors = None
    return means, bin_centers, errors, counts
# Names exported by a star-import of this module.
__all__ = ["PlottingObserver", "BaseObserver", "calc_in_sample_histograms"]