Source code for bayesflow.diagnostics.plots.calibration_ecdf

from collections.abc import Callable, Mapping, Sequence

import numpy as np
import keras
import matplotlib.pyplot as plt

from ...utils.plot_utils import prepare_plot_data, add_titles_and_labels, prettify_subplots
from ...utils.ecdf import simultaneous_ecdf_bands
from ...utils.ecdf.ranks import fractional_ranks, distance_ranks



[docs]
def calibration_ecdf(
    estimates: Mapping[str, np.ndarray] | np.ndarray,
    targets: Mapping[str, np.ndarray] | np.ndarray,
    variable_keys: Sequence[str] = None,
    variable_names: Sequence[str] = None,
    test_quantities: dict[str, Callable] = None,
    difference: bool = False,
    stacked: bool = False,
    rank_type: str | np.ndarray = "fractional",
    figsize: Sequence[float] = None,
    label_fontsize: int = 16,
    legend_fontsize: int = 14,
    legend_location: str = "upper right",
    title_fontsize: int = 18,
    tick_fontsize: int = 12,
    rank_ecdf_color: str = "#132a70",
    fill_color: str = "grey",
    num_row: int = None,
    num_col: int = None,
    **kwargs,
) -> plt.Figure:
    """
    Creates the empirical CDFs for each marginal rank distribution
    and plots it against a uniform ECDF.
    ECDF simultaneous bands are drawn using simulations from the uniform,
    as proposed by [1].

    For models with many parameters, use `stacked=True` to obtain an idea
    of the overall calibration of a posterior approximator.

    To compute ranks based on the Euclidean distance to the origin or a reference, use `rank_type='distance'` (and
    pass a reference array, respectively). This can be used to check the joint calibration of the posterior approximator
    and might show potential biases in the posterior approximation which are not detected by the fractional ranks (e.g.,
    when the prior equals the posterior). This is motivated by [2].

    [1] Säilynoja, T., Bürkner, P. C., & Vehtari, A. (2022). Graphical test
    for discrete uniformity and its applications in goodness-of-fit evaluation
    and multiple sample comparison. Statistics and Computing, 32(2), 1-21.
    https://arxiv.org/abs/2103.10522

    [2] Lemos, Pablo, et al. "Sampling-based accuracy testing of posterior estimators
     for general inference." International Conference on Machine Learning. PMLR, 2023.
     https://proceedings.mlr.press/v202/lemos23a.html

    Parameters
    ----------
    estimates      : np.ndarray of shape (n_data_sets, n_post_draws, n_params)
        The posterior draws obtained from n_data_sets
    targets     : np.ndarray of shape (n_data_sets, n_params)
        The prior draws obtained for generating n_data_sets
    difference        : bool, optional, default: False
        If `True`, plots the ECDF difference.
        Enables a more dynamic visualization range.
    stacked           : bool, optional, default: False
        If `True`, all ECDFs will be plotted on the same plot.
        If `False`, each ECDF will have its own subplot,
        similar to the behavior of `calibration_histogram`.
    rank_type   : str, optional, default: 'fractional'
        If `fractional` (default), the ranks are computed as the fraction
        of posterior samples that are smaller than the prior.
        If `distance`, the ranks are computed as the fraction of posterior
        samples that are closer to a reference points (default here is the origin).
        You can pass a reference array in the same shape as the
        `estimates` array by setting `targets` in the ``ranks_kwargs``.
        This is motivated by [2].
    variable_keys       : list or None, optional, default: None
       Select keys from the dictionaries provided in estimates and targets.
       By default, select all keys.
    variable_names    : list or None, optional, default: None
        The parameter names for nice plot titles.
        Inferred if None. Only relevant if `stacked=False`.
    test_quantities   : dict or None, optional, default: None
        A dict that maps plot titles to functions that compute
        test quantities based on estimate/target draws.

        The dict keys are automatically added to ``variable_keys``
        and ``variable_names``.
        Test quantity functions are expected to accept a dict of draws with
        shape ``(batch_size, ...)`` as the first (typically only)
        positional argument and return an NumPy array of shape
        ``(batch_size,)``.
        The functions do not have to deal with an additional
        sample dimension, as appropriate reshaping is done internally.
    figsize           : tuple or None, optional, default: None
        The figure size passed to the matplotlib constructor.
        Inferred if None.
    label_fontsize    : int, optional, default: 16
        The font size of the y-label and y-label texts
    legend_fontsize   : int, optional, default: 14
        The font size of the legend text
    title_fontsize    : int, optional, default: 18
        The font size of the title text.
        Only relevant if `stacked=False`
    tick_fontsize     : int, optional, default: 12
        The font size of the axis ticklabels
    rank_ecdf_color   : str, optional, default: '#a34f4f'
        The color to use for the rank ECDFs
    fill_color        : str, optional, default: 'grey'
        The color of the fill arguments.
    num_row           : int, optional, default: None
        The number of rows for the subplots.
        Dynamically determined if None.
    num_col           : int, optional, default: None
        The number of columns for the subplots.
        Dynamically determined if None.
    **kwargs          : dict, optional, default: {}
        Keyword arguments can be passed to control the behavior of
        ECDF simultaneous band computation through the ``ecdf_bands_kwargs``
        dictionary. See `simultaneous_ecdf_bands` for keyword arguments.
        Moreover, additional keyword arguments can be passed to control the behavior of
        the rank computation through the ``ranks_kwargs`` dictionary.

    Returns
    -------
    f : plt.Figure - the figure instance for optional saving

    Raises
    ------
    ShapeError
        If there is a deviation form the expected shapes of `estimates`
        and `targets`.
    ValueError
        If an unknown `rank_type` is passed.
    """

    # Optionally, compute and prepend test quantities from draws
    if test_quantities is not None:
        test_quantities_estimates = {}
        test_quantities_targets = {}

        for key, test_quantity_fn in test_quantities.items():
            # Apply test_quantity_func to ground-truths
            tq_targets = test_quantity_fn(data=targets)
            test_quantities_targets[key] = np.expand_dims(tq_targets, axis=1)

            # # Flatten estimates for batch processing in test_quantity_fn, apply function, and restore shape
            num_conditions, num_samples = next(iter(estimates.values())).shape[:2]
            flattened_estimates = keras.tree.map_structure(lambda t: np.reshape(t, (-1, *t.shape[2:])), estimates)
            flat_tq_estimates = test_quantity_fn(data=flattened_estimates)
            test_quantities_estimates[key] = np.reshape(flat_tq_estimates, (num_conditions, num_samples, 1))

        # Add custom test quantities to variable keys and names for plotting
        # keys and names are set to the test_quantities dict keys
        test_quantities_names = list(test_quantities.keys())

        if variable_keys is None:
            variable_keys = list(estimates.keys())

        if isinstance(variable_names, list):
            variable_names = test_quantities_names + variable_names

        variable_keys = test_quantities_names + variable_keys
        estimates = test_quantities_estimates | estimates
        targets = test_quantities_targets | targets

    plot_data = prepare_plot_data(
        estimates=estimates,
        targets=targets,
        variable_keys=variable_keys,
        variable_names=variable_names,
        num_col=num_col,
        num_row=num_row,
        figsize=figsize,
        stacked=stacked,
    )

    estimates = plot_data.pop("estimates")
    targets = plot_data.pop("targets")

    if rank_type == "fractional":
        # Compute fractional ranks
        ranks = fractional_ranks(estimates, targets)
    elif rank_type == "distance":
        # Compute ranks based on distance to the origin
        ranks = distance_ranks(estimates, targets, stacked=stacked, **kwargs.pop("ranks_kwargs", {}))
    else:
        raise ValueError(f"Unknown rank type: {rank_type}. Use 'fractional' or 'distance'.")

    # Plot individual ecdf of parameters
    for j in range(ranks.shape[-1]):
        xx = np.repeat(np.sort(ranks[:, j]), 2)
        xx = np.pad(xx, (1, 1), constant_values=(0, 1))
        yy = np.linspace(0, 1, num=xx.shape[-1] // 2)
        yy = np.repeat(yy, 2)

        # Difference, if specified
        if difference:
            yy -= xx

        if stacked:
            if j == 0:
                if not isinstance(plot_data["axes"], np.ndarray):
                    plot_data["axes"] = np.array([plot_data["axes"]])  # in case of single axis
                plot_data["axes"][0].plot(xx, yy, color=rank_ecdf_color, alpha=0.95, label="Rank ECDFs")
            else:
                plot_data["axes"][0].plot(xx, yy, color=rank_ecdf_color, alpha=0.95)
        else:
            plot_data["axes"].flat[j].plot(xx, yy, color=rank_ecdf_color, alpha=0.95, label="Rank ECDF")

    # Compute uniform ECDF and bands
    alpha, z, L, U = simultaneous_ecdf_bands(estimates.shape[0], **kwargs.pop("ecdf_bands_kwargs", {}))

    # Difference, if specified
    if difference:
        L -= z
        U -= z
        ylab = "ECDF Difference"
    else:
        ylab = "ECDF"

    # Add simultaneous bounds
    if not stacked:
        titles = plot_data["variable_names"]
    elif rank_type in ["distance", "random"]:
        titles = ["Joint ECDFs"]
    else:
        titles = ["Stacked ECDFs"]

    for ax, title in zip(plot_data["axes"].flat, titles):
        ax.fill_between(z, L, U, color=fill_color, alpha=0.2, label=rf"{int((1 - alpha) * 100)}$\%$ Confidence Bands")
        ax.legend(fontsize=legend_fontsize, loc=legend_location)
        ax.set_title(title, fontsize=title_fontsize)

    prettify_subplots(plot_data["axes"], num_subplots=plot_data["num_variables"], tick_fontsize=tick_fontsize)

    add_titles_and_labels(
        plot_data["axes"],
        plot_data["num_row"],
        plot_data["num_col"],
        xlabel=f"{rank_type.capitalize()} rank statistic",
        ylabel=ylab,
        label_fontsize=label_fontsize,
    )

    plot_data["fig"].tight_layout()
    return plot_data["fig"]