# Source code for trustlens.metrics.calibration

"""
trustlens.metrics.calibration.
==============================
Calibration metrics for probabilistic classifiers.

Calibration measures how well a model's predicted probabilities reflect
the true likelihood of outcomes. A perfectly calibrated model that predicts
80% confidence for a set of samples should be correct ~80% of the time.

Metrics implemented
-------------------
* ``brier_score``       — proper scoring rule for probabilistic forecasts
* ``expected_calibration_error`` — binned confidence vs accuracy gap
* ``reliability_curve``    — data for reliability (calibration) diagrams

References
----------
* Brier, G. W. (1950). Verification of forecasts expressed in terms of
  probability. Monthly Weather Review, 78(1), 1–3.
* Niculescu-Mizil, A., & Caruana, R. (2005). Predicting good probabilities
  with supervised learning. ICML.
* Guo, C., et al. (2017). On calibration of modern neural networks. ICML.
"""

from __future__ import annotations

import numpy as np

# ---------------------------------------------------------------------------
# Brier Score
# ---------------------------------------------------------------------------


def brier_score(
    y_true: np.ndarray,
    y_prob: np.ndarray,
) -> float:
    r"""
    Compute the Brier Score for a binary probabilistic classifier.

    The Brier Score is the mean squared difference between predicted
    probabilities and actual outcomes. Lower is better; a perfect
    forecaster scores 0.0, a constant 0.5 prediction scores 0.25.

    .. math::

        \text{BS} = \frac{1}{N} \sum_{i=1}^{N} \bigl(\hat{p}_i - y_i\bigr)^2

    Parameters
    ----------
    y_true : np.ndarray
        Binary ground-truth labels (0 or 1), shape (n_samples,).
    y_prob : np.ndarray
        Predicted probabilities for the positive class, shape (n_samples,).
        Every value must lie in [0, 1].

    Returns
    -------
    float
        Brier Score in [0, 1].

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` have different shapes, if the inputs
        are empty, if ``y_true`` contains values outside {0, 1}, or if
        ``y_prob`` contains values outside [0, 1] (including NaN).

    Examples
    --------
    >>> import numpy as np
    >>> from trustlens.metrics.calibration import brier_score
    >>> y_true = np.array([1, 0, 1, 1, 0])
    >>> y_prob = np.array([0.9, 0.1, 0.8, 0.7, 0.3])
    >>> round(brier_score(y_true, y_prob), 3)
    0.048
    """
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"Shape mismatch: y_true {y_true.shape} vs y_prob {y_prob.shape}."
        )

    # The mean of an empty array is NaN (plus a RuntimeWarning); fail loudly
    # instead of handing a NaN score back to the caller.
    if y_true.size == 0:
        raise ValueError("brier_score requires at least one sample.")

    unique_labels = np.unique(y_true)
    if not set(unique_labels.tolist()).issubset({0.0, 1.0}):
        raise ValueError(
            f"brier_score expects binary labels (0/1). Got unique values: {unique_labels}."
        )

    # The documented [0, 1] range of the score only holds for genuine
    # probabilities. NaN fails both comparisons and is rejected here too.
    if not np.all((y_prob >= 0.0) & (y_prob <= 1.0)):
        raise ValueError("brier_score expects probabilities in [0, 1].")

    return float(np.mean((y_prob - y_true) ** 2))


# ---------------------------------------------------------------------------
# Expected Calibration Error (ECE)
# ---------------------------------------------------------------------------


def expected_calibration_error(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    n_bins: int = 10,
    strategy: str = "uniform",
) -> float:
    r"""
    Compute the Expected Calibration Error (ECE).

    ECE measures the weighted average absolute difference between
    predicted confidence and actual accuracy across probability bins.

    .. math::

        \text{ECE} = \sum_{b=1}^{B} \frac{|\mathcal{B}_b|}{N}
        \left|\text{acc}(\mathcal{B}_b) - \text{conf}(\mathcal{B}_b)\right|

    Bins are half-open ``[lo, hi)`` except the last one, which also
    includes its right edge. Empty bins contribute nothing.

    Parameters
    ----------
    y_true : np.ndarray
        Binary ground-truth labels (0 or 1), shape (n_samples,).
    y_prob : np.ndarray
        Predicted probabilities for the positive class, shape (n_samples,).
        Every value must lie in [0, 1].
    n_bins : int
        Number of confidence bins. Must be at least 1. Default 10.
    strategy : str
        Binning strategy — ``"uniform"`` (equal-width) or ``"quantile"``
        (equal-frequency). Default ``"uniform"``. With ``"quantile"``,
        duplicate edges are merged, so fewer than ``n_bins`` bins may be
        used; if every prediction is identical a single bin holds all
        samples.

    Returns
    -------
    float
        ECE value in [0, 1]. Lower is better.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` have different shapes, if the inputs
        are empty, if ``n_bins`` is smaller than 1, if ``strategy`` is
        unknown, if ``y_true`` contains values outside {0, 1}, or if
        ``y_prob`` contains values outside [0, 1] (including NaN).

    Examples
    --------
    >>> import numpy as np
    >>> from trustlens.metrics.calibration import expected_calibration_error
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_prob = np.array([0.1, 0.2, 0.8, 0.9])
    >>> round(expected_calibration_error(y_true, y_prob, n_bins=2), 3)
    0.15
    """
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"Shape mismatch: y_true {y_true.shape} vs y_prob {y_prob.shape}."
        )
    # Reporting "perfectly calibrated" (0.0) for no data would be misleading.
    if y_true.size == 0:
        raise ValueError("expected_calibration_error requires at least one sample.")
    if n_bins < 1:
        raise ValueError(f"n_bins must be at least 1. Got {n_bins}.")

    unique_labels = np.unique(y_true)
    if not set(unique_labels.tolist()).issubset({0.0, 1.0}):
        raise ValueError(
            "expected_calibration_error expects binary labels (0/1). "
            f"Got unique values: {unique_labels}."
        )
    # Out-of-range (or NaN) values would otherwise fall outside every uniform
    # bin while still being counted in N, silently understating the error.
    if not np.all((y_prob >= 0.0) & (y_prob <= 1.0)):
        raise ValueError("expected_calibration_error expects probabilities in [0, 1].")

    # np.bincount needs 1-D input.
    y_true = y_true.ravel()
    y_prob = y_prob.ravel()

    if strategy == "uniform":
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    elif strategy == "quantile":
        bin_edges = np.quantile(y_prob, np.linspace(0.0, 1.0, n_bins + 1))
        # Collapse duplicate edges that arise when many samples share a value.
        bin_edges = np.unique(bin_edges)
    else:
        raise ValueError(f"Unknown strategy '{strategy}'. Use 'uniform' or 'quantile'.")

    # If all predictions are identical only one edge survives; treat that as
    # a single bin holding every sample instead of "no bins" (which would
    # report an ECE of 0 regardless of how miscalibrated the model is).
    n_bins_actual = max(len(bin_edges) - 1, 1)

    # Digitizing against the interior edges yields indices 0..n_bins_actual-1
    # with bins [lo, hi) and the right edge included in the last bin.
    bin_idx = np.digitize(y_prob, bin_edges[1:-1])
    prob_sum = np.bincount(bin_idx, weights=y_prob, minlength=n_bins_actual)
    true_sum = np.bincount(bin_idx, weights=y_true, minlength=n_bins_actual)

    # (n_b / N) * |acc_b - conf_b| == |sum(y_b) - sum(p_b)| / N, which needs no
    # per-bin division and is naturally zero for empty bins.
    return float(np.abs(true_sum - prob_sum).sum() / y_true.size)


# ---------------------------------------------------------------------------
# Reliability Curve
# ---------------------------------------------------------------------------


def reliability_curve(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    n_bins: int = 10,
    strategy: str = "uniform",
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute the reliability (calibration) curve data.

    Returns the fraction of positives, the mean predicted probability, and
    the sample count for each non-empty confidence bin. Use this data with
    ``trustlens.visualization.plot_reliability_diagram`` to render a
    calibration plot.

    Bins are half-open ``[lo, hi)`` except the last one, which also
    includes its right edge. Bins that receive no samples are omitted from
    all three returned arrays.

    Parameters
    ----------
    y_true : np.ndarray
        Binary ground-truth labels (0 or 1).
    y_prob : np.ndarray
        Predicted probabilities for the positive class. Every value must
        lie in [0, 1].
    n_bins : int
        Number of confidence bins. Must be at least 1. Default 10.
    strategy : str
        ``"uniform"`` (equal-width) or ``"quantile"`` (equal-frequency).
        Default ``"uniform"``. With ``"quantile"``, duplicate edges are
        merged; if every prediction is identical a single bin holds all
        samples.

    Returns
    -------
    fraction_of_positives : np.ndarray
        Actual fraction of positive samples in each non-empty bin.
    mean_predicted_value : np.ndarray
        Mean predicted probability in each non-empty bin.
    bin_counts : np.ndarray
        Number of samples in each non-empty bin.

        All three arrays are empty when the inputs are empty.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_prob`` have different shapes, if ``n_bins``
        is smaller than 1, if ``strategy`` is unknown, if ``y_true``
        contains values outside {0, 1}, or if ``y_prob`` contains values
        outside [0, 1] (including NaN).

    Examples
    --------
    >>> import numpy as np
    >>> y_true = np.array([0, 0, 1, 1])
    >>> y_prob = np.array([0.1, 0.2, 0.8, 0.9])
    >>> frac_pos, mean_pred, counts = reliability_curve(y_true, y_prob, n_bins=2)
    >>> frac_pos.tolist(), counts.tolist()
    ([0.0, 1.0], [2, 2])
    """
    y_true = np.asarray(y_true, dtype=float)
    y_prob = np.asarray(y_prob, dtype=float)

    if y_true.shape != y_prob.shape:
        raise ValueError(
            f"Shape mismatch: y_true {y_true.shape} vs y_prob {y_prob.shape}."
        )
    if n_bins < 1:
        raise ValueError(f"n_bins must be at least 1. Got {n_bins}.")
    if strategy not in ("uniform", "quantile"):
        raise ValueError(f"Unknown strategy '{strategy}'. Use 'uniform' or 'quantile'.")

    unique_labels = np.unique(y_true)
    if not set(unique_labels.tolist()).issubset({0.0, 1.0}):
        raise ValueError(
            f"reliability_curve expects binary labels (0/1). Got unique values: {unique_labels}."
        )
    if not np.all((y_prob >= 0.0) & (y_prob <= 1.0)):
        raise ValueError("reliability_curve expects probabilities in [0, 1].")

    # np.bincount needs 1-D input.
    y_true = y_true.ravel()
    y_prob = y_prob.ravel()

    # No samples means no populated bins; return early so the quantile
    # branch never calls np.quantile on an empty array.
    if y_prob.size == 0:
        return np.empty(0, dtype=float), np.empty(0, dtype=float), np.empty(0, dtype=int)

    if strategy == "uniform":
        bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    else:
        bin_edges = np.quantile(y_prob, np.linspace(0.0, 1.0, n_bins + 1))
        # Collapse duplicate edges that arise when many samples share the
        # same value (like a majority-class predictor at 0.0 or 1.0).
        bin_edges = np.unique(bin_edges)

    # If every prediction is identical only one edge survives. Fall back to a
    # single bin; otherwise the bin count would be 0 and the bin indices
    # would go negative, crashing np.bincount.
    n_bins_actual = max(len(bin_edges) - 1, 1)

    # Digitizing against the interior edges yields indices 0..n_bins_actual-1
    # with bins [lo, hi) and the right edge included in the last bin.
    bin_idx = np.digitize(y_prob, bin_edges[1:-1])

    counts = np.bincount(bin_idx, minlength=n_bins_actual)
    prob_sum = np.bincount(bin_idx, weights=y_prob, minlength=n_bins_actual)
    true_sum = np.bincount(bin_idx, weights=y_true, minlength=n_bins_actual)

    # Select populated bins first so the divisions never see a zero count.
    active = counts > 0
    active_counts = counts[active]
    return (
        true_sum[active] / active_counts,
        prob_sum[active] / active_counts,
        active_counts.astype(int),
    )