Source code for cyclic_boosting.features

from collections import namedtuple

import numpy as np
import pandas as pd

from cyclic_boosting import flags
from cyclic_boosting.common_smoothers import SmootherChoice
from cyclic_boosting.smoothing import multidim, onedim
from cyclic_boosting.smoothing.base import AbstractBinSmoother
from cyclic_boosting.utils import (
    arange_multi,
    clone,
    get_X_column,
    multidim_binnos_to_lexicographic_binnos,
)

from typing import Any, List, Optional, Union

FeatureID = namedtuple("FeatureID", ["feature_group", "feature_type"])


class FeatureTypes(object):
    standard = None
    external = "external"


class Feature(object):
    """Wrapper for information regarding a single feature group in the cyclic
    boosting algorithm

    Parameters
    ----------
    feature_id: :class:`FeatureID`
        feature ID is a named tuple consisting of the name of a feature or a
        tuple of such names in the multidimensional case and the feature type.
    feature_property: int
        feature property, see :mod:`flags`
    smoother: subclass of :class:`cyclic_boosting.smoothing.base.AbstractBinSmoother`
        smoother for the bins of the feature group
    minimal_factor_change: float
        minimum change of the results in the feature group's bins that is
        considered progress in a cyclic boosting iteration

    Note
    ----
    The suffix ``_link`` in attributes refers to values valid in link space.
    """

    def __init__(
        self,
        feature_id: FeatureID,
        feature_property: Union[tuple, int],
        smoother: AbstractBinSmoother,
        minimal_factor_change: Optional[float] = 1e-4,
    ):
        self.feature_id = feature_id
        self.feature_group = feature_id.feature_group

        if len(self.feature_group) < 2:
            self.smootherb = onedim.BinValuesSmoother()
        else:
            self.smootherb = multidim.BinValuesSmoother()
        self.is_fitted = False

        self.feature_property = feature_property
        self.smoother = smoother
        self.learn_rate = 0.0
        self.unfitted_factors_link = None
        self.factors_link = None
        self.factors_link_old = None
        self.unfitted_uncertainties_link = None
        self.uncertainties_link = None
        self.fitted_aggregated = None

        # set by bind_data method
        self.lex_binned_data = None
        self.n_multi_bins_finite = None
        self.bin_weightsums = None

        self.minimal_factor_change = minimal_factor_change
        self.stop_iterations = False
        self.feature_type = feature_id.feature_type
        self.factor_sum = None

        # required by prepare_plots
        self.mean_dev = None
        self.y = None
        self.y_finite = None
        self.prediction = None
        self.prediction_finite = None

        # set by set_feature_bin_deviations_from_neutral
        self.bin_weighted_average = None

    @property
    def dim(self) -> int:
        """Dimension of the feature group, i.e. the number of its features"""
        return len(self.feature_group)

    @property
    def is_1dim(self) -> bool:
        """True if the feature is 1-dimensional, False otherwise"""
        return self.dim == 1

    @property
    def missing_not_learned(self) -> bool:
        """True if ``flags.MISSING_NOT_LEARNED`` can be found in the feature property."""
        return any(flags.missing_not_learned_set(feature_prop) for feature_prop in self.feature_property)

    @property
    def n_bins(self) -> int:
        """Number of bins, including the extra bin for missing and infinite values"""
        return self.n_bins_finite + 1

    @property
    def n_bins_finite(self) -> int:
        """Number of bins, excluding the extra bin for missing and infinite values"""
        return int(np.prod(self.n_multi_bins_finite))

    @property
    def bin_centers(self) -> np.ndarray:
        if self.n_bins_finite == 0:
            return np.ndarray(shape=(0, len(self.n_multi_bins_finite)))
        else:
            return arange_multi(self.n_multi_bins_finite)

    @property
    def finite_bin_weightsums(self) -> np.ndarray:
        """Array of finite bin weightsums"""
        return self.bin_weightsums[:-1]

    @property
    def nan_bin_weightsum(self) -> np.ndarray:
        return self.bin_weightsums[-1]
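
    # Illustrative sketch (not part of the library source): the bin bookkeeping
    # above for a hypothetical 2-dimensional feature group with 3 and 2 finite
    # bins per dimension. The exact bin-center coordinates depend on
    # ``arange_multi``; the point is only the "+1 nan bin" convention.
    #
    #     >>> feature.n_multi_bins_finite = np.array([3, 2])
    #     >>> feature.n_bins_finite   # 3 * 2 finite bins
    #     6
    #     >>> feature.n_bins          # one extra bin for missing/infinite values
    #     7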

    def bind_data(self, X: Union[pd.DataFrame, np.ndarray], weights: np.ndarray) -> None:
        """
        Binds data from ``X`` belonging to the feature and calculates the
        following attributes:

        * ``lex_binned_data``: The binned data transformed to a 1-dimensional
          array containing lexicographically binned data.
        * ``n_multi_bins_finite``: Number of bins for each column in the
          feature (needed for plotting).
        * ``finite_bin_weightsums``: Array containing the sum of weights for
          each bin.
        * ``nan_bin_weightsum``: The sum of weights for the nan-bin.
        * ``bin_centers``: Array containing the center of each bin.
        """
        binnumbers = get_X_column(X, self.feature_group, array_for_1_dim=False)
        (
            self.lex_binned_data,
            self.n_multi_bins_finite,
        ) = multidim_binnos_to_lexicographic_binnos(binnumbers, self.n_multi_bins_finite)
        self.bin_weightsums = np.bincount(self.lex_binned_data, weights=weights, minlength=self.n_bins)
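
    # Minimal sketch of the weight bookkeeping above (hypothetical values):
    # ``np.bincount`` with ``minlength=self.n_bins`` guarantees that the nan
    # bin (last index) exists even if no sample falls into it.
    #
    #     >>> lex_binned_data = np.array([0, 0, 2, 1, 3])   # 3 = nan bin
    #     >>> np.bincount(lex_binned_data, weights=np.ones(5), minlength=4)
    #     array([2., 1., 1., 1.])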

    @property
    def unfitted_factor_link_nan_bin(self) -> np.ndarray:
        return self.unfitted_factors_link[-1]

    @property
    def factor_link_nan_bin(self) -> np.ndarray:
        return self.factors_link[-1]

    def unbind_data(self) -> None:
        """Clear some of the references set in :meth:`bind_data`."""
        self.lex_binned_data = None

    def unbind_factor_data(self) -> None:
        self.unfitted_factors_link = None
        self.factors_link_old = None
        self.fitted_aggregated = None
        self.unfitted_uncertainties_link = None
        self.uncertainties_link = None
        self.factors_link = [self.factors_link[-1]]

    def _get_data_for_smoother(self, uncertainties_link) -> np.ndarray:
        """Prepare ``X_for_smoother`` as needed by the smoother's ``fit``
        method.

        ``bin_centers``, ``bin_weightsums`` and the uncertainties in link
        space must already have been calculated.
        """
        n_bins = self.n_bins_finite
        n_bin_dims = self.dim
        X_for_smoother = np.empty((n_bins + 1, n_bin_dims + 2))
        X_for_smoother[:n_bins, :n_bin_dims] = self.bin_centers
        X_for_smoother[n_bins, :n_bin_dims] = np.nan
        X_for_smoother[:, n_bin_dims] = self.bin_weightsums
        X_for_smoother[:, -1] = uncertainties_link
        return X_for_smoother
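
    # Column layout of ``X_for_smoother`` for a 1-dimensional feature with
    # three finite bins (values hypothetical, shown only to illustrate the
    # assembly above): columns 0..dim-1 hold the bin centers, the next column
    # the bin weightsums, the last column the uncertainties in link space, and
    # the final row is the nan/missing bin with NaN coordinates.
    #
    #     row   bin_center   bin_weightsum   uncertainty_link
    #     0        0.0           12.0             0.3
    #     1        1.0            7.0             0.4
    #     2        2.0            5.0             0.5
    #     3        nan            2.0             0.9   <- nan bin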

    def update_factors(
        self,
        unfitted_factors: np.ndarray,
        unfitted_uncertainties: np.ndarray,
        neutral_factor_link: float,
        learn_rate: float,
    ) -> np.ndarray:
        """Call the smoother on the bin results if necessary.

        Parameters
        ----------
        unfitted_factors: ndarray
            bin means in link space as returned by the method
            :meth:`cyclic_boosting.base.CyclicBoostingBase.calc_parameters`
        unfitted_uncertainties: ndarray
            bin uncertainties in link space as returned by the method
            :meth:`cyclic_boosting.base.CyclicBoostingBase.calc_parameters`
        neutral_factor_link: float
            neutral value in link space, currently always 0
        learn_rate: float
            learning rate used
        """
        self.learn_rate = learn_rate
        self.factors_link_old = self.factors_link.copy()
        self.unfitted_factors_link = unfitted_factors
        self.unfitted_uncertainties_link = unfitted_uncertainties

        if self.missing_not_learned:
            # do not learn missing features by regularizing the factor
            # for the nan bin to the neutral factor
            unfitted_factors[-1] = neutral_factor_link
            self.factors_link[-1] = neutral_factor_link
        else:
            self.factors_link[-1] = unfitted_factors[-1] * learn_rate

        X_for_smoother = self._get_data_for_smoother(unfitted_uncertainties)
        if self.n_bins_finite > 0:
            self.smoother.fit(X_for_smoother[:-1], unfitted_factors[:-1].copy())
            if not self.is_fitted:
                self.smootherb.fit(X_for_smoother[:-1], unfitted_factors[:-1].copy())
                self.smootherb.smoothed_y_ = None
        else:
            self.smoother = None
            self.smootherb = None
        return X_for_smoother

    def prepare_feature(self) -> None:
        self.factors_link = self.fitted_aggregated
        self.learn_rate = 1.0
        self.smoother = self.smootherb
        del self.smootherb
        if self.smoother is not None:
            self.smoother.smoothed_y_ = self.factors_link[:-1].copy()

    def clear_feature_reference(self, observers: List) -> None:
        if len(observers) == 0:
            self.bin_weightsums = None
            self.unbind_data()
            self.unbind_factor_data()

    def set_feature_bin_deviations_from_neutral(self, neutral_factor_link: float) -> None:
        weights = np.bincount(self.lex_binned_data, minlength=self.n_bins)
        weighted_average = np.sum(weights * np.abs(self.factors_link - neutral_factor_link) / np.sum(weights))
        self.bin_weighted_average = weighted_average
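
    # Worked sketch of the weighted deviation above (hypothetical numbers):
    # with per-bin counts [3, 1], factors_link [0.2, -0.4] and a neutral value
    # of 0, the weighted average absolute deviation is (3*0.2 + 1*0.4) / 4.
    #
    #     >>> weights = np.array([3, 1])
    #     >>> factors_link = np.array([0.2, -0.4])
    #     >>> np.sum(weights * np.abs(factors_link - 0.0) / np.sum(weights))  # == 0.25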


def create_feature_id(feature_group_or_id: Union[FeatureID, Any], default_type: Optional[str] = None) -> FeatureID:
    """Convenience function to convert feature_groups into :class:`FeatureID`s"""
    if isinstance(feature_group_or_id, FeatureID):
        return feature_group_or_id
    elif isinstance(feature_group_or_id, tuple):
        return FeatureID(feature_group=feature_group_or_id, feature_type=default_type)
    else:
        return FeatureID(feature_group=(feature_group_or_id,), feature_type=default_type)
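
# Illustrative sketch (not part of the library source): how ``create_feature_id``
# normalizes the accepted inputs into a :class:`FeatureID`. The feature names
# below are hypothetical.
#
#     >>> create_feature_id("price")
#     FeatureID(feature_group=('price',), feature_type=None)
#     >>> create_feature_id(("price", "store"), default_type=FeatureTypes.external)
#     FeatureID(feature_group=('price', 'store'), feature_type='external')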


class FeatureList(object):
    """Iterable providing the information about a list of features in the form
    of :class:`Feature` objects

    Parameters
    ----------
    features: list of :class:`Feature`
        List of :class:`Feature` that is normally created by
        :func:`create_features`.
    """

    def __init__(self, features: List[Feature]):
        self.features = features

    @property
    def feature_groups(self) -> List[Union[tuple, str, int]]:
        """Obtain the feature_groups for all features."""
        return [feature.feature_group for feature in self.features]

    @property
    def feature_ids(self) -> List[FeatureID]:
        """Obtain the feature_ids for all features."""
        return [feature.feature_id for feature in self.features]

    def iter_fitting(self):
        """Generator yielding only the features with attribute
        ``stop_iterations == False``
        """
        for feature in self.features:
            if not feature.stop_iterations:
                yield feature

    def __iter__(self):
        for feature in self.features:
            yield feature

    def __len__(self) -> int:
        return len(self.features)

    def __getitem__(self, feature_group_or_id: Union[str, int, tuple, FeatureID]) -> Feature:
        """Selects the feature specified by ``feature_group_or_id``

        Parameters
        ----------
        feature_group_or_id: `string`, `int`, `tuple` of `string`, or :class:`FeatureID`
            feature identifier

        Returns
        -------
        :class:`cyclic_boosting.base.Feature`
            Feature instance
        """
        feature_id = create_feature_id(feature_group_or_id)
        for feature in self.features:
            if feature.feature_id == feature_id:
                return feature
        raise KeyError("Feature {0} is not known in {1}".format(feature_id, self.feature_ids))
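
    # Usage sketch (hypothetical ``features`` list): lookup accepts the same
    # identifiers as ``create_feature_id``, so plain column names and tuples
    # of column names both work.
    #
    #     >>> features["price"].feature_group
    #     ('price',)
    #     >>> features[("price", "store")].feature_id
    #     FeatureID(feature_group=('price', 'store'), feature_type=None)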

    def get_feature(self, feature_group: Union[str, int, tuple], feature_type: Optional[str] = None) -> Feature:
        """Selects the feature specified by ``feature_group`` and ``feature_type``

        Parameters
        ----------
        feature_group: `string`, `int` or `tuple` of `string` or `int`
            name or index of the feature group
        feature_type: `None` or `string`
            type of the feature

        Returns
        -------
        :class:`cyclic_boosting.base.Feature`
            Feature instance
        """
        feature_id = create_feature_id(feature_group, feature_type)
        return self[feature_id]


def create_features(
    feature_groups_or_ids: List[Union[str, int, tuple, FeatureID]],
    feature_properties: dict,
    smoother_choice: SmootherChoice,
) -> FeatureList:
    """
    Parameters
    ----------
    feature_groups_or_ids: list of `string`, `int`, `tuple` of `string`, or :class:`FeatureID`
        feature identifiers
    feature_properties: dict
        dict of feature properties
    smoother_choice: subclass of :class:`cyclic_boosting.SmootherChoice`
        selects smoothers
    """
    if feature_groups_or_ids is None:
        feature_groups_or_ids = []

    def make_feature_for_group_or_id(feature_group_or_id):
        feature_id = create_feature_id(feature_group_or_id)
        feature_property = flags.read_feature_property(
            feature_properties, feature_id.feature_group, default=flags.HAS_MISSING
        )
        smoother = smoother_choice.choice_fct(feature_id.feature_group, feature_property, feature_id.feature_type)
        return Feature(feature_id, feature_property, clone(smoother))

    features = [make_feature_for_group_or_id(feature_group_or_id) for feature_group_or_id in feature_groups_or_ids]
    return FeatureList(features)
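
# Illustrative usage sketch (not part of the library source). The flag names
# and the pre-built ``smoother_choice`` instance are assumptions; any instance
# of a :class:`cyclic_boosting.common_smoothers.SmootherChoice` subclass can
# be passed here.
#
#     >>> feature_properties = {"price": flags.IS_CONTINUOUS, "store": flags.IS_UNORDERED}
#     >>> features = create_features(
#     ...     ["price", ("price", "store")], feature_properties, smoother_choice
#     ... )
#     >>> features.feature_groups
#     [('price',), ('price', 'store')]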