# Source code for cyclic_boosting.plots._1dplots

import matplotlib.pyplot as plt
import numpy as np
from six.moves import map

from cyclic_boosting import flags
from cyclic_boosting.link import IdentityLinkMixin, LogitLinkMixin, LogLinkMixin


def _format_tick(tick, precision=1e-2):
    """Returns a suitable string representation for a
    factor, when it is given a tick position in linear space.

    A "suitable" string reprentation is (in this order):

    * an integer
    * decimal numbers with precision 2
    """
    if tick - precision < np.round(tick) < tick + precision:
        return "{:.0f}".format(tick)
    return "{:.{prec}f}".format(tick, prec=2)


def _get_x_axis(factors, bin_bounds, is_continuous):
    """
    Get x axis range and tick labels
    """
    if bin_bounds is not None:
        if is_continuous:
            labels = np.round((bin_bounds[:-1] + bin_bounds[1:]) / 2.0, 5)
        else:
            labels = bin_bounds[1:]
    else:
        labels = None
    x_axis_range = np.arange(len(factors)).astype(np.float64)
    return x_axis_range, labels


def _get_optimal_number_of_ticks(distance):
    """
    Return optimal number of ticks given a distance of the upper and lower bound
    First we scale the distance to the interval [1,20], afterwards we search
    the smalest number out of {1,2,4,5,10,20} which leads to a total tick number in (10,20].
    If we encounter something unexpected we just return 21
    """
    if distance < 1 or np.isinf(distance) or np.isnan(distance):
        return 21
    while distance > 20:
        distance /= 10
    for n in [20, 10, 5, 4, 2, 1]:
        n_ticks = np.floor(distance * n)
        if 10 < n_ticks <= 20:
            return n_ticks + 1
    return 21


def _get_y_axis(factors, uncertainties=None):
    """
    Compute the y tick positions and their labels.

    The plotted range is the span of the factors, widened to the enclosing
    integers. If ``uncertainties`` are given, the error bars of all factors
    except the last one are included in the span; the last factor itself is
    included without its uncertainty. With a single factor the span is that
    factor plus/minus 0.5.

    Parameters
    ----------
    factors: array-like
        Factors to be shown.
    uncertainties: sequence or None
        Pair ``(lower_errors, upper_errors)`` aligned with ``factors``.

    Returns
    -------
    tuple
        ``(tick_positions, tick_labels)``: evenly spaced positions between
        the integer bounds and their string representations.
    """
    if len(factors[:-1]) == 0:
        # Only one factor present: open a window of width 1 around it.
        upper = factors[-1] + 0.5
        lower = factors[-1] - 0.5
    elif uncertainties is None:
        upper = np.max(factors)
        lower = np.min(factors)
    else:
        upper = max(np.max(factors[:-1] + uncertainties[1][:-1]), factors[-1])
        lower = min(np.min(factors[:-1] - uncertainties[0][:-1]), factors[-1])

    bottom = np.floor(lower)
    top = np.ceil(upper)

    n_ticks = _get_optimal_number_of_ticks(top - bottom)
    tick_positions = np.linspace(bottom, top, int(n_ticks))

    return tick_positions, [_format_tick(position) for position in tick_positions]


def _ensure_tuple(x):
    """
    Ensures that given object is a tuple, if not wrap it in a tuple
    """
    return x if isinstance(x, tuple) else (x,)


def _plot_factors(factors, x_axis_range, label, uncertainties=None):
    """
    Draw the unsmoothed factors at the given x positions.

    Parameters
    ----------
    factors: array-like
        y values to draw.
    x_axis_range: array-like
        x positions, aligned with ``factors``.
    label: str
        Legend label of the drawn series.
    uncertainties: array-like or None
        If given, passed as ``yerr`` and the factors are drawn with error
        bars; otherwise a plain marker plot is drawn.
    """
    if uncertainties is None:
        plt.plot(
            x_axis_range,
            factors,
            markersize=2,
            marker="o",
            color="k",
            alpha=0.6,
            label=label,
        )
        return

    plt.errorbar(
        x_axis_range,
        factors,
        yerr=uncertainties,
        capsize=2.5,
        markersize=2,
        fmt="o",
        color="k",
        alpha=0.6,
        label=label,
    )


def _plot_smoothed_factors(factors, x_axis_range, is_continuous, uncertainties=None):
    """
    Draw the smoothed factors.

    Continuous features are drawn as a red line whose x positions are the
    midpoints between each position and its successor (the last position
    uses ``last + 1`` as successor). All other features are drawn as hollow
    red circles at the given positions.

    Parameters
    ----------
    factors: array-like
        Smoothed factors to draw.
    x_axis_range: numpy.ndarray
        x positions, aligned with ``factors``. Not modified.
    is_continuous: bool
        Selects the line style (true) or the marker style (false).
    uncertainties: sequence or None
        Pair ``(lower_errors, upper_errors)``; if given, error bars are drawn.
    """
    if not is_continuous:
        style = {
            "marker": "o",
            "markeredgecolor": "r",
            "markersize": 5.0,
            "linestyle": "none",
            "fillstyle": "none",
            "color": "r",
        }
    else:
        style = {"linestyle": "-", "linewidth": 1.0, "color": "r"}
        successors = np.append(x_axis_range, x_axis_range[-1] + 1)[1:]
        x_axis_range = (x_axis_range + successors) / 2
    style["label"] = "smoothed factors"

    if uncertainties is None:
        plt.plot(x_axis_range, factors, **style)
    else:
        error_bars = [uncertainties[0], uncertainties[1]]
        plt.errorbar(x_axis_range, factors, yerr=error_bars, **style)


def _plot_missing_factor(factors, x_axis_range, y_axis_range):
    """
    Draw the factor of the missing-value bin and highlight its column.

    The last entry of ``factors`` is taken as the missing-value factor and
    is drawn at the last x position. A shaded band one unit wide, spanning
    the full y range, marks that column as special.

    Parameters
    ----------
    factors: array-like
        Factors; only the last entry is used.
    x_axis_range: array-like
        x positions; only the last entry is used.
    y_axis_range: array-like
        y tick positions; their minimum and maximum bound the shaded band.
    """
    # By convention the missing-value bin is the last one.
    nan_factor = factors[-1]
    nan_x = x_axis_range[-1]

    plt.plot(
        [nan_x],
        [nan_factor],
        marker="p",
        markeredgecolor="r",
        markersize=5.0,
        color="b",
        linestyle="none",
        fillstyle="none",
        label="smoothed nan factor",
    )

    band_edges = [nan_x - 0.5, nan_x + 0.5]
    plt.fill_between(
        band_edges,
        min(y_axis_range),
        max(y_axis_range),
        color="#f7d208",
        alpha=0.5,
    )


def _plot_axes(x_axis_range, x_axis_labels, y_axis_range, y_axis_labels, is_continuous):
    """
    Set axis limits, tick positions and tick labels of the current plot.

    Parameters
    ----------
    x_axis_range: numpy.ndarray
        x positions. For continuous features this array is shifted by
        ``-0.5`` **in place**, so the caller sees the shifted values too.
    x_axis_labels: array-like or None
        Labels for the x ticks; no x ticks are set when ``None``.
    y_axis_range: array-like
        y tick positions; also define the y limits.
    y_axis_labels: sequence
        Labels for the y ticks.
    is_continuous: bool
        Whether the x ticks mark bin boundaries instead of bin centers.
    """
    # The limits are derived from the unshifted positions.
    x_low = min(x_axis_range) - 0.5
    x_high = max(x_axis_range) + 0.5
    plt.xlim(x_low, x_high)
    plt.ylim(min(y_axis_range), max(y_axis_range))

    if is_continuous:
        # Labels refer to bin boundaries here, hence move the ticks half a
        # bin to the left. Deliberately in place (see parameter docs).
        x_axis_range -= 0.5

    if x_axis_labels is not None:
        if len(x_axis_range) - len(x_axis_labels) == 1:
            # One label short: pad with an empty label for the last tick.
            x_axis_labels = np.append(x_axis_labels, "")
        plt.xticks(x_axis_range, x_axis_labels, size="xx-small", rotation="vertical")

    plt.yticks(y_axis_range, y_axis_labels)



# [docs]
def plot_factor_1d(
    feature,
    bin_bounds=None,
    with_errorbars=True,
    ylimits_include_errors=True,
    link_function=None,
    plot_yp=True,
):
    """
    Plots a single one dimensional factor plot.

    The plot is drawn with ``matplotlib.pyplot`` on the current figure/axes;
    nothing is returned. If the feature has more than 400 factors, a
    histogram of the factors is drawn instead.

    Parameters
    ----------
    bin_bounds: list
        Bin boundaries to label the bins.
    feature: cyclic_boosting.base.Feature
        Feature as it can be obtained from the plotting observers
        ``features`` property.
    link_function: cyclic_boosting.link.LinkFunction
        Link function of the plotted feature. Identity, log and logit links
        select the y label and reference line; any other value yields the
        y label ``"Unkown"``.
    with_errorbars: bool
        Option to switch errorbars on/off.
    ylimits_include_errors: bool
        Option to show the errorbars in the plot completely.
    plot_yp: bool
        Show deviation between truth and prediction in last iteration.
        Forced to ``False`` when ``feature.y`` is ``None``.
    """
    y = feature.y
    if y is None:
        plot_yp = False
    p = feature.prediction

    if plot_yp:
        factors = feature.mean_dev
    else:
        factors = feature.unfitted_factors_link

    smoothed_factors = feature.factors_link

    if plot_yp:
        uncertainties = np.abs(feature.unfitted_factors_link)
        uncertainties = [uncertainties, uncertainties]
    else:
        uncertainties = [
            feature.unfitted_uncertainties_link,
            feature.unfitted_uncertainties_link,
        ]

    assert len(factors) == len(smoothed_factors) == len(uncertainties[0]) == len(uncertainties[1]) > 0
    number_of_factors = len(factors)

    if isinstance(link_function, IdentityLinkMixin):
        plt.axhline(0, color="gray")
        plt.ylabel("Summand")

    elif isinstance(link_function, LogLinkMixin):
        factors = link_function.unlink_func(factors)
        smoothed_factors = link_function.unlink_func(smoothed_factors)
        if plot_yp:
            y = link_function.unlink_func(y)
            p = link_function.unlink_func(p)
        plt.axhline(1, color="gray")
        plt.ylabel("Factor")

    elif isinstance(link_function, LogitLinkMixin):
        # Transform the error bars through the link: distance between the
        # unlinked factor and the unlinked factor -/+ its uncertainty.
        lower = np.abs(link_function.unlink_func(factors - uncertainties[0]) - link_function.unlink_func(factors))
        upper = np.abs(link_function.unlink_func(factors + uncertainties[1]) - link_function.unlink_func(factors))
        factors = link_function.unlink_func(factors)
        smoothed_factors = link_function.unlink_func(smoothed_factors)
        uncertainties = [
            np.where(lower < 0.0, 0.0, lower),
            np.where(upper > 1.0, 1.0, upper),
        ]
        if plot_yp:
            # do not unlink for nbinom width mode
            if ((link_function.unlink_func(y) >= 0).all()) and ((link_function.unlink_func(y) <= 1).all()):
                y = link_function.unlink_func(y)
            p = link_function.unlink_func(p)
        plt.axhline(0.5, color="gray")
        plt.ylabel("Probability")

    else:
        plt.ylabel("Unkown")

    # Too many factors make the plot unreadable. Thus we resort to plotting a
    # histogram of factors in these cases.
    if number_of_factors > 400:
        from cyclic_boosting.plots import plot_factor_histogram

        plot_factor_histogram(feature)
        return

    feature_property = _ensure_tuple(feature.feature_property)
    is_continuous = flags.is_continuous_set(feature_property[0]) | flags.is_linear_set(feature_property[0])

    # Overall value range of everything that will be drawn.
    if plot_yp:
        plotted_values = np.r_[factors, smoothed_factors, y, p]
    else:
        plotted_values = np.r_[factors, smoothed_factors]
    minmax = np.r_[np.min(plotted_values), np.max(plotted_values)]

    # Feed the overall min/max into a copy of the factors so that the y axis
    # computed from it covers all plotted series, not only the raw factors.
    f = factors.copy()
    if len(f) > 1:
        f[:2] = minmax
        u = uncertainties
    else:
        f = minmax
        u = np.c_[uncertainties, uncertainties]

    y_axis_range, y_axis_labels = _get_y_axis(f, u if ylimits_include_errors else None)
    x_axis_range, x_axis_labels = _get_x_axis(factors, bin_bounds, is_continuous)

    # Use the normalized tuple here as well: indexing the raw attribute
    # fails when ``feature.feature_property`` is not a tuple.
    if "MISSING" in flags._convert_flags_to_string(feature_property[0]):
        _plot_missing_factor(smoothed_factors, x_axis_range, y_axis_range)
    elif len(factors) > 1:
        # No missing-value bin to show: drop the last entry everywhere.
        factors = factors[:-1]
        uncertainties = [uncertainties[0][:-1], uncertainties[1][:-1]]
        smoothed_factors = smoothed_factors[:-1]
        x_axis_range = x_axis_range[:-1]

    # NOTE: for continuous features _plot_axes shifts x_axis_range in place
    # by -0.5; the midpoint computations applied afterwards (inside
    # _plot_smoothed_factors and below) are based on the shifted positions.
    _plot_axes(x_axis_range, x_axis_labels, y_axis_range, y_axis_labels, is_continuous)

    _plot_smoothed_factors(smoothed_factors, x_axis_range, is_continuous, None)

    if not plot_yp:
        label = "factors"
        if is_continuous:
            x_axis_range = (x_axis_range + np.append(x_axis_range, x_axis_range[-1] + 1)[1:]) / 2
        _plot_factors(factors, x_axis_range, label, uncertainties if with_errorbars else None)

    try:
        if len(factors) > 1:
            plt.plot(p[:-1], ".-", label="prediction", alpha=0.5)
            plt.plot(y[:-1], ".-", label="truth", alpha=0.5)
        else:
            plt.plot(p, ".-", label="prediction", alpha=0.5)
            plt.plot(y, ".-", label="truth", alpha=0.5)
    except Exception:
        # Best effort: prediction/truth may be missing (e.g. ``None``) or not
        # plottable; in that case the overlay is skipped instead of failing
        # the whole plot. KeyboardInterrupt/SystemExit are not swallowed.
        pass

    from cyclic_boosting.plots import _format_groupname_with_type

    feature_group = _format_groupname_with_type(feature.feature_group, feature.feature_type)
    plt.xlabel(feature_group)
    plt.legend()



# Empty export list: ``from <this module> import *`` imports no names.
__all__ = []