Source code for cyclic_boosting.binning.bin_number_transformer

from __future__ import absolute_import, division, print_function

import logging

import numpy as np
import pandas as pd

from cyclic_boosting import flags

from ._binary_search import eq_multi, ge_multi
from ._utils import _read_feature_property, check_frame_empty
from .ecdf_transformer import ECdfTransformer

from typing import Union, Optional

MISSING_VALUE_AS_BINNO = -1

_logger = logging.getLogger(__name__)


[docs] class BinNumberTransformer(ECdfTransformer): r"""This transformer bins feature-variables in ``X`` into integral bins, depending on each feature's *feature property*. Features with discrete preprocessing (not continuous, but ordered or unordered) are enumerated by their unique values, ascending from the lowest (Thus, a column with ``10, 11, 12`` would be binned as ``0, 1, 2``). If no ``feature_properties`` are passed, all columns in ``X`` are treated as :obj:`cyclic_boosting.flags.IS_CONTINUOUS`. If a ``feature_properties`` dictionary is supplied, it must contain feature properties for each feature in ``X``. Not-a-number values in the input feature matrix are mapped to :obj:`cyclic_boosting.binning.MISSING_VALUE_AS_BINNO` in the transform step. This value can then be treated as a missing value by Cyclic Boosting. The feature property :obj:`cyclic_boosting.flags.HAS_MAGIC_INT_MISSING` enables missing-value treatment for values of -999 and -9 in integer-typed feature columns (for both continuous and non-continuous features). Binning is performed for each feature-column individually. For example, two columns with the same value range can end up with totally different bin numbers. Also, the ``n_bins`` argument which is typically an integer, can be indivualized by passing a dict that provides column-names and the respective number of bins, that should be used for continuous preprocessing. During the fit, all features are treated in the same way as in :class:`ECdfTransformer`. During the transform step, each feature value is transformed to the number of its feature bin. The range of bin numbers is:: [0, trans.bins_and_cdfs_[feature_no][1].shape[0] - 1) For the **estimated parameters** see :class:`ECdfTransformer`. Parameters ---------- n_bins: int Maximum number of bins used to estimate the empirical CDF. ``n_bins`` is ignored for features with discrete preprocessing. If a dict is passed, the feature names/indices should be the keys and the n_bins are the values. Example : ``{'feature a': 150, 'feature b': 20}`` feature_properties: dict Dictionary listing the names of all features as keys and their preprocessing flags as values. When using a numpy feature matrix X with no column names the keys of the feature properties are the column indices. weight_column: str or int Optional column label or column index for the weight column. If not set all samples receive the same weight 1. epsilon: float Used thresholds for the comparison of float values: * ``epsilon * 1.0`` for the comparison of CDF values * ``epsilon * minimal_bin_width`` for the comparison with bin boundaries of a given feature Default value for epsilon: 1e-9 tolerance: double Relative tolerance of the minimum bin weight. (E.g. if you specify 100 bins and a tolerance of 0.05 the bins are required to have only 0.95% of the total bin weights instead of 1.0%) Examples -------- >>> feature_1 = np.asarray([2.1, 2.2, 2.5, 3.1, 3.3, 3.7, 4.1, 4.4]) >>> X = np.c_[feature_1] >>> from cyclic_boosting.binning import BinNumberTransformer >>> trans = BinNumberTransformer(n_bins=4, epsilon=1e-8) >>> trans = trans.fit(X) >>> # only one input column >>> column, epsilon, bins_cdfs = trans.bins_and_cdfs_[0] >>> assert column == 0, np.allclose(epsilon, 1e-8 * 0.1) >>> bins_cdfs array([[ 2.1 , 0. ], [ 2.2 , 0.25], [ 3.1 , 0.5 ], [ 3.7 , 0.75], [ 4.4 , 1. ]]) >>> X_test = np.c_[[1.9, 2.15, 2.4, 2.2, 3.6, 3.5, 4.3, 5.1]] >>> trans.transform(X_test) array([[0], [0], [1], [0], [2], [2], [3], [3]], dtype=int8) """ def __init__( self, n_bins=100, feature_properties=None, weight_column=None, epsilon=1e-9, tolerance=0.1, inplace=False, ): self.n_bins = n_bins self.feature_properties = feature_properties self.weight_column = weight_column self.epsilon = epsilon self.tolerance = tolerance self.nan_representation = MISSING_VALUE_AS_BINNO self.inplace = inplace ECdfTransformer.__init__( self, n_bins=self.n_bins, feature_properties=self.feature_properties, weight_column=self.weight_column, epsilon=self.epsilon, tolerance=self.tolerance, ) def _transform_one_feature(self, X, feature_prop, col, epsilon, bins_and_cdfs): xt = column_selector(X, col).astype(np.float64) def is_finite(x): if flags.has_magic_missing_set(feature_prop): return (x != -9) & (x != -999) & np.isfinite(xt) else: return np.isfinite(xt) if bins_and_cdfs is not None: finite_mask = is_finite(xt) xt_f = xt[finite_mask] if flags.is_continuous_set(feature_prop): ge_multi(bins_and_cdfs[1:, 0], xt_f - epsilon, 1, xt_f) else: eq_multi( bins_and_cdfs[1:, 0], xt_f, np.arange(len(bins_and_cdfs[1:, 0]), dtype=np.float64), epsilon, xt_f, ) xt[finite_mask] = xt_f # re_check for nans, which may have been brought in by the # binary search (values out of bounds) xt[~is_finite(xt)] = MISSING_VALUE_AS_BINNO else: xt = MISSING_VALUE_AS_BINNO return xt
[docs] def transform( self, X_orig: Union[pd.DataFrame, np.ndarray], y: Optional[np.ndarray] = None ) -> Union[pd.DataFrame, np.ndarray]: self._check_input_for_transform(X_orig) if not self.inplace: X = X_orig.copy() else: X = X_orig if check_frame_empty(X): if isinstance(X, pd.DataFrame): X = X.astype({col: np.int8 for col, _, _ in self.bins_and_cdfs_}) else: return _as_int_array_of_minimum_dtype(X) return X n_transformed_features = len(self.bins_and_cdfs_) for col, epsilon, bins_and_cdfs in self.bins_and_cdfs_: feature_prop = _read_feature_property(col, self.feature_properties) if feature_prop is None: pass else: xt = self._transform_one_feature(X, feature_prop, col, epsilon, bins_and_cdfs) column_setter(X, col, xt) if not isinstance(X, pd.DataFrame) and n_transformed_features == X.shape[1]: X = _as_int_array_of_minimum_dtype(X) return X
[docs] def get_feature_bin_boundaries(self): return {feature: probas for feature, epsilon, probas in self.bins_and_cdfs_}
[docs] def column_selector(X, column): """Dispatches to column selection via pandas or numpy, depending on the type of X""" if isinstance(X, pd.DataFrame): return X[column].values else: return X[:, int(column)]
def _as_int_array_of_minimum_dtype(arr): if isinstance(arr, int): maximum = abs(arr) elif len(arr) == 0: maximum = 0 else: maximum = max(arr.max(), abs(arr.min())) if maximum <= np.iinfo(np.int8).max: return np.asarray(arr, dtype=np.int8) elif maximum <= np.iinfo(np.int16).max: return np.asarray(arr, dtype=np.int16) elif maximum <= np.iinfo(np.int32).max: return np.asarray(arr, dtype=np.int32) else: return np.asarray(arr, dtype=np.int64)
[docs] def column_setter(X, column, rhs): """Dispatches to column selection via pandas or numpy, depending on the type of X""" if isinstance(X, pd.DataFrame): X[column] = _as_int_array_of_minimum_dtype(rhs) else: X[:, int(column)] = rhs