# Source code for bayesflow.utils.classification.calibration_curve
import numpy as np
# [docs]
def calibration_curve(
targets: np.ndarray,
estimates: np.ndarray,
*,
pos_label: int | float | bool | str = 1,
num_bins: int = 5,
strategy: str = "uniform",
):
"""Compute true and predicted probabilities for a calibration curve.
The method assumes the inputs come from a binary classifier, and
discretize the [0, 1] interval into bins.
Code from: https://github.com/scikit-learn/scikit-learn/blob/98ed9dc73/sklearn/calibration.py#L927
Parameters
----------
targets : array-like of shape (n_samples,)
True targets.
estimates : array-like of shape (n_samples,)
Probabilities of the positive class.
pos_label : int, float, bool or str, default = 1
The label of the positive class.
num_bins : int, default=5
Number of bins to discretize the [0, 1] interval. A bigger number
requires more data. Bins with no samples (i.e. without
corresponding values in `estimates`) will not be returned, thus the
returned arrays may have less than `num_bins` values.
strategy : {'uniform', 'quantile'}, default='uniform'
Strategy used to define the widths of the bins.
uniform
The bins have identical widths.
quantile
The bins have the same number of samples and depend on `y_prob`.
Returns
-------
prob_true : ndarray of shape (num_bins,) or smaller
The proportion of samples whose class is the positive class, in each
bin (fraction of positives).
prob_pred : ndarray of shape (num_bins,) or smaller
The mean estimated probability in each bin.
References
----------
Alexandru Niculescu-Mizil and Rich Caruana (2005) Predicting Good
Probabilities With Supervised Learning, in Proceedings of the 22nd
International Conference on Machine Learning (ICML).
"""
if estimates.min() < 0 or estimates.max() > 1:
raise ValueError("y_prob has values outside [0, 1].")
labels = np.unique(targets)
if len(labels) > 2:
raise ValueError(f"Only binary classification is supported. Provided labels {labels}.")
targets = targets == pos_label
if strategy == "quantile": # Determine bin edges by distribution of data
quantiles = np.linspace(0, 1, num_bins + 1)
bins = np.percentile(estimates, quantiles * 100)
elif strategy == "uniform":
bins = np.linspace(0.0, 1.0, num_bins + 1)
else:
raise ValueError("Invalid entry to 'strategy' input. Strategy must be either 'quantile' or 'uniform'.")
binids = np.searchsorted(bins[1:-1], estimates)
bin_sums = np.bincount(binids, weights=estimates, minlength=len(bins))
bin_true = np.bincount(binids, weights=targets, minlength=len(bins))
bin_total = np.bincount(binids, minlength=len(bins))
nonzero = bin_total != 0
prob_true = bin_true[nonzero] / bin_total[nonzero]
prob_pred = bin_sums[nonzero] / bin_total[nonzero]
return prob_true, prob_pred