Source code for bayesflow.diagnostics.metrics.posterior_z_score

from collections.abc import Sequence, Mapping, Callable
from typing import Any

import numpy as np

from ...utils.dict_utils import dicts_to_arrays, compute_test_quantities


def posterior_z_score(
    estimates: Mapping[str, np.ndarray] | np.ndarray,
    targets: Mapping[str, np.ndarray] | np.ndarray,
    variable_keys: Sequence[str] | None = None,
    variable_names: Sequence[str] | None = None,
    test_quantities: dict[str, Callable] | None = None,
    aggregation: Callable | None = np.median,
) -> dict[str, Any]:
    """
    Compute the posterior z-score for the given samples according to [1]:

        post_z_score = (posterior_mean - true_parameters) / posterior_std

    The score is adequate if it centers around zero and spreads roughly
    in the interval [-3, 3].

    [1] Schad, D. J., Betancourt, M., & Vasishth, S. (2021).
    Toward a principled Bayesian workflow in cognitive science.
    Psychological methods, 26(1), 103.

    Paper also available at https://arxiv.org/abs/1904.12765

    Parameters
    ----------
    estimates : np.ndarray of shape (num_datasets, num_draws_post, num_variables)
        Posterior samples, comprising `num_draws_post` random draws from the
        posterior distribution for each data set from `num_datasets`.
    targets : np.ndarray of shape (num_datasets, num_variables)
        Prior samples, comprising `num_datasets` ground truths.
    variable_keys : Sequence[str], optional (default = None)
        Select keys from the dictionaries provided in estimates and targets.
        By default, select all keys.
    variable_names : Sequence[str], optional (default = None)
        Optional variable names to show in the output.
    test_quantities : dict or None, optional, default: None
        A dict that maps plot titles to functions that compute test quantities
        based on estimate/target draws.

        The dict keys are automatically added to ``variable_keys`` and
        ``variable_names``. Test quantity functions are expected to accept a
        dict of draws with shape ``(batch_size, ...)`` as the first (typically
        only) positional argument and return a NumPy array of shape
        ``(batch_size,)``. The functions do not have to deal with an additional
        sample dimension, as appropriate reshaping is done internally.
    aggregation : callable or None, optional (default = np.median)
        Function used to aggregate the z-scores across data sets; it is called
        as ``aggregation(z_score, axis=0)``. Typically `np.mean` or
        `np.median`. If None is provided, the individual values are returned.

    Returns
    -------
    result : dict
        Dictionary containing:

        - "values" : float or np.ndarray
            The (optionally aggregated) posterior z-score per variable.
        - "metric_name" : str
            The name of the metric ("Posterior z-score").
        - "variable_names" : sequence of str
            The (inferred) variable names, read from the converted estimates.

    Notes
    -----
    Posterior z-score quantifies how far the posterior mean lies from the true
    generating parameter, in units of the posterior standard deviation. Values
    near 0 (in absolute terms) mean the posterior mean is close to the truth;
    large absolute values indicate substantial deviation. The sign shows the
    direction of the bias.

    The posterior standard deviation uses ``ddof=1``, so at least two posterior
    draws per data set are needed for a finite score; a zero standard deviation
    likewise yields non-finite values.
    """

    # Optionally, compute and prepend test quantities from draws
    if test_quantities is not None:
        updated_data = compute_test_quantities(
            targets=targets,
            estimates=estimates,
            variable_keys=variable_keys,
            variable_names=variable_names,
            test_quantities=test_quantities,
        )
        variable_names = updated_data["variable_names"]
        variable_keys = updated_data["variable_keys"]
        estimates = updated_data["estimates"]
        targets = updated_data["targets"]

    samples = dicts_to_arrays(
        estimates=estimates,
        targets=targets,
        variable_keys=variable_keys,
        variable_names=variable_names,
    )

    # Summarize the posterior draws (axis 1) for every data set and variable.
    post_means = samples["estimates"].mean(axis=1)
    post_stds = np.sqrt(samples["estimates"].var(axis=1, ddof=1))

    # Signed distance of the posterior mean from the ground truth, in posterior-std units.
    z_score = (post_means - samples["targets"]) / post_stds

    if aggregation is not None:
        # Collapse the data-set axis, leaving one value per variable.
        z_score = aggregation(z_score, axis=0)

    variable_names = samples["estimates"].variable_names
    return {"values": z_score, "metric_name": "Posterior z-score", "variable_names": variable_names}