Source code for cyclic_boosting.plots._1dplots

import matplotlib.pyplot as plt
import numpy as np
from six.moves import map

from cyclic_boosting import flags
from cyclic_boosting.link import IdentityLinkMixin, LogitLinkMixin, LogLinkMixin


def _format_tick(tick, precision=1e-2):
    """Returns a suitable string representation for a
    factor, when it is given a tick position in linear space.

    A "suitable" string reprentation is (in this order):

    * an integer
    * decimal numbers with precision 2
    """
    if tick - precision < np.round(tick) < tick + precision:
        return "{:.0f}".format(tick)
    return "{:.{prec}f}".format(tick, prec=2)


def _get_x_axis(factors, bin_bounds, is_continuous):
    """
    Get x axis range and tick labels
    """
    if bin_bounds is not None:
        if is_continuous:
            labels = np.round((bin_bounds[:-1] + bin_bounds[1:]) / 2.0, 5)
        else:
            labels = bin_bounds[1:]
    else:
        labels = None
    x_axis_range = np.arange(len(factors)).astype(np.float64)
    return x_axis_range, labels


def _get_optimal_number_of_ticks(distance):
    """
    Return optimal number of ticks given a distance of the upper and lower bound
    First we scale the distance to the interval [1,20], afterwards we search
    the smalest number out of {1,2,4,5,10,20} which leads to a total tick number in (10,20].
    If we encounter something unexpected we just return 21
    """
    if distance < 1 or np.isinf(distance) or np.isnan(distance):
        return 21
    while distance > 20:
        distance /= 10
    for n in [20, 10, 5, 4, 2, 1]:
        n_ticks = np.floor(distance * n)
        if 10 < n_ticks <= 20:
            return n_ticks + 1
    return 21


def _get_y_axis(factors, uncertainties=None):
    """
    Get y axis range and tick labels
    """
    if len(factors[:-1]) > 0:
        if uncertainties is not None:
            y_max_link = max(np.max(factors[:-1] + uncertainties[1][:-1]), factors[-1])
            y_min_link = min(np.min(factors[:-1] - uncertainties[0][:-1]), factors[-1])
        else:
            y_max_link = np.max(factors)
            y_min_link = np.min(factors)
    else:
        y_max_link = factors[-1] + 0.5
        y_min_link = factors[-1] - 0.5

    y_min_link_int = np.floor(y_min_link)
    y_max_link_int = np.ceil(y_max_link)

    distance_int = y_max_link_int - y_min_link_int
    n_ticks = _get_optimal_number_of_ticks(distance_int)

    linspace = np.linspace(y_min_link_int, y_max_link_int, int(n_ticks))

    return linspace, list(map(_format_tick, linspace))


def _ensure_tuple(x):
    """
    Ensures that given object is a tuple, if not wrap it in a tuple
    """
    return x if isinstance(x, tuple) else (x,)


def _plot_factors(factors, x_axis_range, label, uncertainties=None):
    """
    Plot unsmoothed factors in given range with errobars if uncertainties are provided
    """
    if uncertainties is not None:
        unsmoothed_style = dict(capsize=2.5, markersize=2, fmt="o", color="k", alpha=0.6)
        unsmoothed_style["label"] = label
        plt.errorbar(x_axis_range, factors, yerr=uncertainties, **unsmoothed_style)
    else:
        unsmoothed_style = dict(markersize=2, marker="o", color="k", alpha=0.6)
        unsmoothed_style["label"] = label
        plt.plot(x_axis_range, factors, **unsmoothed_style)


def _plot_smoothed_factors(factors, x_axis_range, is_continuous, uncertainties=None):
    """
    Plot smoothed factors, plot style depends on is_continuous
    """
    if is_continuous:
        smoothed_style = dict(linestyle="-", linewidth=1.0, color="r")
        x_axis_range = (x_axis_range + np.append(x_axis_range, x_axis_range[-1] + 1)[1:]) / 2
    else:
        smoothed_style = dict(
            marker="o",
            markeredgecolor="r",
            markersize=5.0,
            linestyle="none",
            fillstyle="none",
            color="r",
        )
    smoothed_style["label"] = "smoothed factors"
    if uncertainties is not None:
        plt.errorbar(x_axis_range, factors, yerr=[uncertainties[0], uncertainties[1]], **smoothed_style)
    else:
        plt.plot(x_axis_range, factors, **smoothed_style)


def _plot_missing_factor(factors, x_axis_range, y_axis_range):
    """
    Plot the factor which was calculated for missing values and mark the region in an orange color
    """
    # Factor which corresponds to missing value is the last one
    missing_factor = factors[-1]
    x_position = x_axis_range[-1]

    # Plot single datapoint and shade the whole area around this point to mark it as "special"
    nan_style = dict(
        marker="p",
        markeredgecolor="r",
        markersize=5.0,
        color="b",
        linestyle="none",
        fillstyle="none",
    )
    nan_style["label"] = "smoothed nan factor"
    plt.plot([x_position], [missing_factor], **nan_style)
    plt.fill_between(
        [x_position - 0.5, x_position + 0.5],
        min(y_axis_range),
        max(y_axis_range),
        color="#f7d208",
        alpha=0.5,
    )


def _plot_axes(x_axis_range, x_axis_labels, y_axis_range, y_axis_labels, is_continuous):
    """
    Plot axes including limits, labels and ticks
    """
    # Set limits
    plt.xlim(min(x_axis_range) - 0.5, max(x_axis_range) + 0.5)
    plt.ylim(min(y_axis_range), max(y_axis_range))

    # Shift x_axis to the left if continuous, because in this case the label correspond to the bin boundaries
    if is_continuous:
        x_axis_range -= 0.5
    if x_axis_labels is not None:
        if len(x_axis_range) - len(x_axis_labels) == 1:
            x_axis_labels = np.append(x_axis_labels, "")
        plt.xticks(x_axis_range, x_axis_labels, size="xx-small", rotation="vertical")
    plt.yticks(y_axis_range, y_axis_labels)


[docs] def plot_factor_1d( feature, bin_bounds=None, with_errorbars=True, ylimits_include_errors=True, link_function=None, plot_yp=True, ): """ Plots a single one dimensional factor plot. Parameters ---------- bin_bounds: list Bin boundaries to label the bins. feature: cyclic_boosting.base.Feature Feature as it can be obtained from the plotting observers ``features`` property. link_function: cyclic_boosting.link.LinkFunction Link function of the plotted feature with_errorbars: bool Option to switch errorbars on/off. ylimits_include_errors: bool Option to show the errorbars in the plot completely. plot_yp: bool Show deviation between truth and prediction in last iteration. """ y = feature.y if y is None: plot_yp = False p = feature.prediction if plot_yp: factors = feature.mean_dev else: factors = feature.unfitted_factors_link smoothed_factors = feature.factors_link if plot_yp: uncertainties = np.abs(feature.unfitted_factors_link) uncertainties = [uncertainties, uncertainties] else: uncertainties = [ feature.unfitted_uncertainties_link, feature.unfitted_uncertainties_link, ] assert len(factors) == len(smoothed_factors) == len(uncertainties[0]) == len(uncertainties[1]) > 0 number_of_factors = len(factors) if isinstance(link_function, IdentityLinkMixin): plt.axhline(0, color="gray") plt.ylabel("Summand") elif isinstance(link_function, LogLinkMixin): factors = link_function.unlink_func(factors) smoothed_factors = link_function.unlink_func(smoothed_factors) if plot_yp: y = link_function.unlink_func(y) p = link_function.unlink_func(p) plt.axhline(1, color="gray") plt.ylabel("Factor") elif isinstance(link_function, LogitLinkMixin): lower = np.abs(link_function.unlink_func(factors - uncertainties[0]) - link_function.unlink_func(factors)) upper = np.abs(link_function.unlink_func(factors + uncertainties[1]) - link_function.unlink_func(factors)) factors = link_function.unlink_func(factors) smoothed_factors = link_function.unlink_func(smoothed_factors) uncertainties = [ np.where(lower < 0.0, 0.0, lower), np.where(upper > 1.0, 1.0, upper), ] if plot_yp: # do not unlink for nbinom width mode if ((link_function.unlink_func(y) >= 0).all()) and ((link_function.unlink_func(y) <= 1).all()): y = link_function.unlink_func(y) p = link_function.unlink_func(p) plt.axhline(0.5, color="gray") plt.ylabel("Probability") else: plt.ylabel("Unkown") # Too many factors make the plot unreadable. Thus we resort to plotting a # histogram of factors in these cases. if number_of_factors > 400: from cyclic_boosting.plots import plot_factor_histogram plot_factor_histogram(feature) return feature_property = _ensure_tuple(feature.feature_property) is_continuous = flags.is_continuous_set(feature_property[0]) | flags.is_linear_set(feature_property[0]) if plot_yp: minmax = np.r_[ np.min(np.r_[factors, smoothed_factors, y, p]), np.max(np.r_[factors, smoothed_factors, y, p]), ] else: minmax = np.r_[ np.min(np.r_[factors, smoothed_factors]), np.max(np.r_[factors, smoothed_factors]), ] f = factors.copy() if len(f) > 1: f[:2] = minmax u = uncertainties else: f = minmax u = np.c_[uncertainties, uncertainties] y_axis_range, y_axis_labels = _get_y_axis(f, u if ylimits_include_errors else None) x_axis_range, x_axis_labels = _get_x_axis(factors, bin_bounds, is_continuous) if "MISSING" in flags._convert_flags_to_string(feature.feature_property[0]): _plot_missing_factor(smoothed_factors, x_axis_range, y_axis_range) elif len(factors) > 1: factors = factors[:-1] uncertainties = [uncertainties[0][:-1], uncertainties[1][:-1]] smoothed_factors = smoothed_factors[:-1] x_axis_range = x_axis_range[:-1] _plot_axes(x_axis_range, x_axis_labels, y_axis_range, y_axis_labels, is_continuous) _plot_smoothed_factors(smoothed_factors, x_axis_range, is_continuous, None) if not plot_yp: label = "factors" if is_continuous: x_axis_range = (x_axis_range + np.append(x_axis_range, x_axis_range[-1] + 1)[1:]) / 2 _plot_factors(factors, x_axis_range, label, uncertainties if with_errorbars else None) try: if len(factors) > 1: plt.plot(p[:-1], ".-", label="prediction", alpha=0.5) plt.plot(y[:-1], ".-", label="truth", alpha=0.5) else: plt.plot(p, ".-", label="prediction", alpha=0.5) plt.plot(y, ".-", label="truth", alpha=0.5) except: pass from cyclic_boosting.plots import _format_groupname_with_type feature_group = _format_groupname_with_type(feature.feature_group, feature.feature_type) plt.xlabel(feature_group) plt.legend()
__all__ = []