Source code for bayesflow.diagnostics.metrics.calibration_log_gamma

from collections.abc import Callable, Mapping, Sequence

import numpy as np
from scipy.stats import binom

from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities



[docs]
def calibration_log_gamma(
    estimates: Mapping[str, np.ndarray] | np.ndarray,
    targets: Mapping[str, np.ndarray] | np.ndarray,
    variable_keys: Sequence[str] = None,
    variable_names: Sequence[str] = None,
    test_quantities: dict[str, Callable] = None,
    num_null_draws: int = 1000,
    quantile: float = 0.05,
):
    """
    Compute the log gamma discrepancy statistic to test posterior calibration,
    see [1] for additional information.
    Log gamma is log(gamma/gamma_null), where gamma_null is the 5th percentile of the
    null distribution under uniformity of ranks.
    That is, if adopting a hypothesis testing framework,then log_gamma < 0 implies
    a rejection of the hypothesis of uniform ranks at the 5% level.
    This diagnostic is typically more sensitive than the Kolmogorov-Smirnoff test or
    ChiSq test.

    [1]  Martin Modrák. Angie H. Moon. Shinyoung Kim. Paul Bürkner. Niko Huurre.
    Kateřina Faltejsková. Andrew Gelman. Aki Vehtari.
    "Simulation-Based Calibration Checking for Bayesian Computation:
    The Choice of Test Quantities Shapes Sensitivity."
    Bayesian Anal. 20 (2) 461 - 488, June 2025. https://doi.org/10.1214/23-BA1404

    Parameters
    ----------
    estimates  : np.ndarray of shape (num_datasets, num_draws, num_variables)
        The random draws from the approximate posteriors over ``num_datasets``
    targets : np.ndarray of shape (num_datasets, num_variables)
        The corresponding ground-truth values sampled from the prior
    variable_keys : Sequence[str], optional (default = None)
       Select keys from the dictionaries provided in estimates and targets.
       By default, select all keys.
    variable_names : Sequence[str], optional (default = None)
        Optional variable names to show in the output.
    test_quantities   : dict or None, optional, default: None
        A dict that maps plot titles to functions that compute
        test quantities based on estimate/target draws.

        The dict keys are automatically added to ``variable_keys``
        and ``variable_names``.
        Test quantity functions are expected to accept a dict of draws with
        shape ``(batch_size, ...)`` as the first (typically only)
        positional argument and return an NumPy array of shape
        ``(batch_size,)``.
        The functions do not have to deal with an additional
        sample dimension, as appropriate reshaping is done internally.
    quantile : float in (0, 1), optional, default 0.05
        The quantile from the null distribution to be used as a threshold.
        A lower quantile increases sensitivity to deviations from uniformity.

    Returns
    -------
    result : dict
        Dictionary containing:

        - "values" : float or np.ndarray
            The log gamma values per variable
        - "metric_name" : str
            The name of the metric ("Log Gamma").
        - "variable_names" : str
            The (inferred) variable names.
    """

    # Optionally, compute and prepend test quantities from draws
    if test_quantities is not None:
        updated_data = compute_test_quantities(
            targets=targets,
            estimates=estimates,
            variable_keys=variable_keys,
            variable_names=variable_names,
            test_quantities=test_quantities,
        )
        variable_names = updated_data["variable_names"]
        variable_keys = updated_data["variable_keys"]
        estimates = updated_data["estimates"]
        targets = updated_data["targets"]

    samples = dicts_to_arrays(
        estimates=estimates,
        targets=targets,
        variable_keys=variable_keys,
        variable_names=variable_names,
    )

    num_ranks = samples["estimates"].shape[0]
    num_post_draws = samples["estimates"].shape[1]

    # rank statistics
    ranks = np.sum(samples["estimates"] < samples["targets"][:, None], axis=1)

    # null distribution and threshold
    null_distribution = gamma_null_distribution(num_ranks, num_post_draws, num_null_draws)
    null_quantile = np.quantile(null_distribution, quantile)

    # compute log gamma for each parameter
    log_gammas = np.empty(ranks.shape[-1])

    for i in range(ranks.shape[-1]):
        gamma = gamma_discrepancy(ranks[:, i], num_post_draws=num_post_draws)
        log_gammas[i] = np.log(gamma / null_quantile)

    output = {
        "values": log_gammas,
        "metric_name": "Log Gamma",
        "variable_names": samples["estimates"].variable_names,
    }

    return output



def gamma_null_distribution(num_ranks: int, num_post_draws: int = 1000, num_null_draws: int = 1000) -> np.ndarray:
    """
    Computes the distribution of expected gamma values under uniformity of ranks.

    Parameters
    ----------
    num_ranks : int
        Number of ranks to use for each gamma.
    num_post_draws : int, optional, default 1000
        Number of posterior draws that were used to calculate the rank distribution.
    num_null_draws : int, optional, default 1000
        Number of returned gamma values under uniformity of ranks.

    Returns
    -------
    result : np.ndarray
        Array of shape (num_null_draws,) containing gamma values under uniformity of ranks.
    """
    z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1)
    gamma = np.empty(num_null_draws)

    # loop non-vectorized to reduce memory footprint
    for i in range(num_null_draws):
        u = np.random.uniform(size=num_ranks)
        F_z = np.mean(u[:, None] < z_i, axis=0)
        bin_1 = binom.cdf(num_ranks * F_z, num_ranks, z_i)
        bin_2 = 1 - binom.cdf(num_ranks * F_z - 1, num_ranks, z_i)

        gamma[i] = 2 * np.min(np.minimum(bin_1, bin_2))

    return gamma


def gamma_discrepancy(ranks: np.ndarray, num_post_draws: int = 100) -> float:
    """
    Quantifies deviation from uniformity by the likelihood of observing the
    most extreme point on the empirical CDF of the given rank distribution
    according to [1] (equation 7).

    [1]  Martin Modrák. Angie H. Moon. Shinyoung Kim. Paul Bürkner. Niko Huurre.
    Kateřina Faltejsková. Andrew Gelman. Aki Vehtari.
    "Simulation-Based Calibration Checking for Bayesian Computation:
    The Choice of Test Quantities Shapes Sensitivity."
    Bayesian Anal. 20 (2) 461 - 488, June 2025. https://doi.org/10.1214/23-BA1404

    Parameters
    ----------
    ranks : array of shape (num_ranks,)
        Empirical rank distribution
    num_post_draws : int, optional, default 100
        Number of posterior draws used to generate ranks.

    Returns
    -------
    result : float
        Gamma discrepancy values for each parameter.
    """
    num_ranks = len(ranks)

    # observed count of ranks smaller than i
    R_i = np.array([sum(ranks < i) for i in range(1, num_post_draws + 2)])

    # expected proportion of ranks smaller than i
    z_i = np.arange(1, num_post_draws + 2) / (num_post_draws + 1)

    bin_1 = binom.cdf(R_i, num_ranks, z_i)
    bin_2 = 1 - binom.cdf(R_i - 1, num_ranks, z_i)

    # likelihood of obtaining the most extreme point on the empirical CDF
    # if the rank distribution was indeed uniform
    return float(2 * np.min(np.minimum(bin_1, bin_2)))