# Source code for trustlens.trust_score

"""
trustlens.trust_score.
======================
The TrustLens Trust Score — a single 0–100 composite measure of model
trustworthiness.

Why a single score?
-------------------
Practitioners face "metric overload": ECE, Brier Score, silhouette scores,
confidence gaps — great individually but hard to act on as a whole.

The Trust Score distils all TrustLens analysis into one instantly readable
number:

 * **< 40** — Serious issues. Do not deploy.
 * **40–60** — Moderate trust. Investigate flagged dimensions.
 * **60–80** — Good. Minor improvements recommended.
 * **80–100** — High trust. Model is production-ready.

Formula
-------
The Trust Score is a weighted sum of four normalized sub-scores (0–100 each):

 TrustScore = w_cal * CalibrationScore
       + w_fail * FailureScore
       + w_bias * BiasScore
       + w_rep * RepresentationScore

Default weights (tuned to reflect deployment risk):
 w_cal = 0.35  (calibration matters most — drives overconfidence risk)
 w_fail = 0.30  (failure patterns drive safety risk)
 w_bias = 0.25  (bias drives fairness/regulatory risk)
 w_rep = 0.10  (representation is a bonus signal; not always available)

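If a dimension is unavailable (e.g., no embeddings → no representation score),
its weight is redistributed proportionally to the other available dimensions.

Weak-dimension penalties (low failure score, ECE above 0.05, subgroup
performance gaps or severe fairness violations) are then subtracted from the
weighted sum, capped at 35 points in total. Independently of the numeric
score, certain blocking conditions (very low failure score, severe fairness
violation, ECE above 0.1) force the grade to D.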

Sub-score Normalization
-----------------------
All sub-scores are normalized to [0, 100]:

 * CalibrationScore = 100 × (1 - clip(BS + 1.5×ECE, 0, 1))
   - Brier Score and ECE are both in [0, 1]; lower is better.
   - Perfect calibration → 100. The score reaches 0 once BS + 1.5×ECE ≥ 1.

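 * FailureScore = 100 × (0.8 × clip(confidence_gap, 0, 1)
                         + 0.2 × (1 - clip(error_rate, 0, 1)))
   - Confidence gap in [0, 1] (clipped); higher is better. Overall accuracy
     contributes the remaining 20%.
   - A model that is highly confident *only* when correct, and makes no
     errors → 100.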

 * BiasScore = 100 × (1 - clip(bias_penalty, 0, 1))
   - bias_penalty = 0.5 × clip((imbalance_ratio - 1) / 19, 0, 1)
           + 0.5 × clip(max_subgroup_gap, 0, 1)
   - Perfectly balanced dataset, zero subgroup gap → 100.

 * RepresentationScore = 100 × clip(0.5 + 0.5 × silhouette, 0, 1)
   - Silhouette ∈ [-1, 1]; mapped to [0, 100].
   - Perfect separation → 100. Total overlap → 0.

References
----------
* Brier (1950), Guo et al. (2017) — calibration
* Hardt et al. (2016) — fairness
* Rousseeuw (1987) — silhouette
"""

from __future__ import annotations

from dataclasses import dataclass, field

import numpy as np

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Default per-dimension weights. compute_trust_score() copies this mapping,
# overlays any caller-supplied weights, and renormalizes over the dimensions
# that are actually present in the results.
_DEFAULT_WEIGHTS: dict[str, float] = {
    "calibration": 0.35,
    "failure": 0.30,
    "bias": 0.25,
    "representation": 0.10,
}

# (minimum score, grade, verdict) rows ordered from the highest threshold to
# the lowest; the first row whose threshold the final score meets is used.
_GRADE_THRESHOLDS = [
    (80, "A", "High Trust - production-ready"),
    (60, "B", "Good Trust - minor issues to address"),
    (40, "C", "Moderate Trust - investigate flagged dimensions"),
    (0, "D", "Low Trust - serious issues, do not deploy"),
]

# Upper bounds, in score points, on the penalties that compute_trust_score()
# subtracts from the weighted sum.
_MAX_PENALTY_FAILURE = 20.0  # applied when the failure sub-score is below 60
_MAX_PENALTY_CALIBRATION = 15.0  # applied when ECE > 0.05; maximum at ECE = 0.15
_MAX_PENALTY_FAIRNESS = 15.0  # subgroup gap > 0.05, or in full for a severe violation
_MAX_TOTAL_PENALTY = 35.0  # cap on the sum of all penalties


# ---------------------------------------------------------------------------
# Sub-score computers
# ---------------------------------------------------------------------------


def _calibration_score(cal_data: dict) -> float:
    """
    Compute calibration sub-score (0–100).

    CalibScore = 100 × (1 − clip(BS + 1.5×ECE, 0, 1))

    Parameters
    ----------
    cal_data : dict
        Calibration results. Reads the ``"brier_score"`` and ``"ece"`` keys.

    Returns
    -------
    float
        Score in [0, 100]; higher is better.

    Notes
    -----
    A metric that is missing, ``None``, non-numeric or NaN is replaced by the
    neutral default 0.5 so that one unusable value cannot crash the scoring
    or turn the whole score into NaN.
    """
    neutral = 0.5

    def _metric(key: str) -> float:
        # Same default as a missing key, also used for None / junk / NaN.
        try:
            value = float(cal_data.get(key, neutral))
        except (TypeError, ValueError):
            return neutral
        return neutral if np.isnan(value) else value

    composite = _metric("brier_score") + 1.5 * _metric("ece")
    return 100.0 * (1.0 - float(np.clip(composite, 0.0, 1.0)))


def _failure_score(fail_data: dict) -> float:
    """
    Compute failure sub-score (0–100).

    FailScore = 100 × clip(0.8 × clip(gap, 0, 1) + 0.2 × (1 − clip(err, 0, 1)), 0, 1)

    ``gap`` is ``fail_data["confidence_gap"]["gap"]`` and ``err`` is
    ``fail_data["misclassification_summary"]["__overall__"]["overall_error_rate"]``.
    A large gap means the model is confident when right and uncertain when
    wrong — the ideal behaviour; overall accuracy contributes the remaining
    20%.

    Parameters
    ----------
    fail_data : dict
        Failure-analysis results.

    Returns
    -------
    float
        Score in [0, 100]; higher is better.

    Notes
    -----
    A value that is missing, ``None``, non-numeric or NaN falls back to the
    missing-key default (gap 0.0, error rate 0.5).
    """

    def _number(raw, default: float) -> float:
        # Unusable leaf values behave like a missing key.
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return default
        return default if np.isnan(value) else value

    gap_data = fail_data.get("confidence_gap", {})
    gap = _number(gap_data.get("gap", 0.0), 0.0)

    misc = fail_data.get("misclassification_summary", {})
    overall = misc.get("__overall__", {})
    error_rate = _number(overall.get("overall_error_rate", 0.5), 0.5)

    # Combine: gap contribution (80%) + accuracy contribution (20%)
    gap_score = float(np.clip(gap, 0.0, 1.0))
    acc_score = 1.0 - float(np.clip(error_rate, 0.0, 1.0))
    score = 0.8 * gap_score + 0.2 * acc_score
    return 100.0 * float(np.clip(score, 0.0, 1.0))


def _bias_score(bias_data: dict) -> float:
    """
    Compute bias sub-score (0–100).

    BiasScore = 100 × (1 − clip(bias_penalty, 0, 1))
    bias_penalty = 0.5 × clip((imbalance_ratio − 1)/19, 0, 1)
           + 0.5 × clip(max_subgroup_performance_gap, 0, 1)

    Parameters
    ----------
    bias_data : dict
        Bias results. Reads ``class_imbalance.imbalance_ratio`` and, for each
        entry of ``subgroup_performance``, ``__summary__.performance_gap``.

    Returns
    -------
    float
        Score in [0, 100]; higher is better.

    Notes
    -----
    Subgroup entries that are not dicts are skipped, and values that are
    ``None``, non-numeric or NaN fall back to the missing-key default
    (ratio 1.0, gap 0.0), so malformed entries cannot crash the scoring.
    """

    def _number(raw, default: float) -> float:
        # Unusable leaf values behave like a missing key.
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return default
        return default if np.isnan(value) else value

    imbalance = bias_data.get("class_imbalance", {})
    ratio = _number(imbalance.get("imbalance_ratio", 1.0), 1.0)
    # A ratio of 1 (balanced) costs nothing; a ratio of 20 or more costs the maximum.
    imbalance_penalty = float(np.clip((ratio - 1.0) / 19.0, 0.0, 1.0))

    # Subgroup performance gap (worst across all sensitive features)
    max_gap = 0.0
    for feat_data in bias_data.get("subgroup_performance", {}).values():
        if not isinstance(feat_data, dict):
            continue
        summary = feat_data.get("__summary__", {})
        max_gap = max(max_gap, _number(summary.get("performance_gap", 0.0), 0.0))

    subgroup_penalty = float(np.clip(max_gap, 0.0, 1.0))

    bias_penalty = 0.5 * imbalance_penalty + 0.5 * subgroup_penalty
    return 100.0 * (1.0 - float(np.clip(bias_penalty, 0.0, 1.0)))


def _representation_score(rep_data: dict) -> float:
    """
    Map the silhouette score onto a 0–100 representation sub-score.

    RepScore = 100 × clip(0.5 + 0.5 × silhouette, 0, 1)

    The silhouette is read from ``rep_data["separability"]["silhouette_score"]``;
    a missing or NaN value is treated as 0.0, which yields the midpoint 50.
    """
    silhouette = float(rep_data.get("separability", {}).get("silhouette_score", 0.0))
    # NaN falls back to the neutral silhouette value.
    effective = 0.0 if np.isnan(silhouette) else silhouette
    unit_interval = np.clip(0.5 + 0.5 * effective, 0.0, 1.0)
    return 100.0 * float(unit_interval)


# ---------------------------------------------------------------------------
# TrustScoreResult dataclass
# ---------------------------------------------------------------------------


@dataclass
class TrustScoreResult:
    """
    Structured result from the Trust Score computation.

    Attributes
    ----------
    score : int
        Overall Trust Score in [0, 100].
    grade : str
        Letter grade: A / B / C / D.
    verdict : str
        Plain-English deployment recommendation.
    sub_scores : dict
        Per-dimension scores in [0, 100].
    weights_used : dict
        Actual weights used (after redistribution for missing dimensions).
    breakdown : dict
        Weighted contribution of each dimension to the final score.
    penalties_applied : dict
        Points deducted per penalty category (``compute_trust_score`` uses
        the keys ``"Failure"``, ``"Calibration"`` and ``"Fairness"``).
    base_score : int
        Weighted score in [0, 100] before penalties were subtracted.
    is_blocked : bool
        True when a blocking condition forced the grade to D.
    """

    score: int
    grade: str
    verdict: str
    sub_scores: dict[str, float] = field(default_factory=dict)
    weights_used: dict[str, float] = field(default_factory=dict)
    breakdown: dict[str, float] = field(default_factory=dict)
    penalties_applied: dict[str, float] = field(default_factory=dict)
    base_score: int = 0
    is_blocked: bool = False

    def __str__(self) -> str:
        """Return a plain-text summary: score, verdict and one line per dimension."""
        lines = [
            f"Trust Score: {self.score}/100 [{self.grade}]",
            f"Assessment : {self.verdict}",
            "\nDimension Breakdown:",
        ]
        lines.extend(
            f" - {dim:<18} {score:5.1f}/100" for dim, score in self.sub_scores.items()
        )
        return "\n".join(lines)

    def __repr__(self) -> str:
        """Return a compact representation showing only score and grade."""
        return f"TrustScoreResult(score={self.score}, grade={self.grade!r})"

    def _repr_html_(self) -> str:
        """Rich HTML representation for Jupyter notebooks."""
        from html import escape

        from trustlens.visualization.summary_plot import _C, _color_for_grade, _color_for_score

        gc = _color_for_grade(self.grade)
        gray = _C["gray"]
        dark = _C["dark"]
        # Verdicts can contain characters such as '>' and quotes (for example
        # "ECE > 0.1"), so free text is escaped before being placed in markup.
        verdict = escape(str(self.verdict))

        parts = [
            f"""
        <div style="font-family: 'Segoe UI', Roboto, Helvetica, Arial, sans-serif; max-width: 450px; padding: 20px; border-radius: 12px; border: 1px solid {gc}40; background-color: #ffffff; box-shadow: 0 4px 12px rgba(0,0,0,0.05); margin: 10px 0;">
            <div style="display: flex; align-items: center; justify-content: space-between; margin-bottom: 15px;">
                <div style="font-size: 14px; font-weight: 600; color: {gray}; text-transform: uppercase; letter-spacing: 0.5px;">
                    Trust Analysis Result
                </div>
                <div style="padding: 4px 12px; border-radius: 20px; background-color: {gc}; color: white; font-size: 13px; font-weight: 700;">
                    GRADE {self.grade}
                </div>
            </div>
            <div style="display: flex; align-items: baseline; margin-bottom: 8px;">
                <span style="font-size: 48px; font-weight: 800; color: {gc}; line-height: 1;">{self.score}</span>
                <span style="font-size: 20px; font-weight: 600; color: {gray}; margin-left: 4px;">/100</span>
            </div>
            <div style="font-size: 16px; font-weight: 600; color: {dark}; margin-bottom: 20px;">
                {verdict}
            </div>
            <div style="border-top: 1px solid #f0f0f0; padding-top: 15px;">
                <div style="font-size: 12px; font-weight: 700; color: {gray}; margin: 12px 0 8px 0; text-transform: uppercase;">
                    Dimension Breakdown
                </div>
        """
        ]

        for dim, score in self.sub_scores.items():
            sc = _color_for_score(score)
            label = escape(str(dim).capitalize())
            parts.append(
                f"""
                <div style="margin-bottom: 10px;">
                    <div style="display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 4px;">
                        <span style="color: {dark}; font-weight: 500;">{label}</span>
                        <span style="color: {sc}; font-weight: 700;">{score:.1f}</span>
                    </div>
                    <div style="width: 100%; height: 6px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
                        <div style="width: {score}%; height: 100%; background-color: {sc}; border-radius: 3px;"></div>
                    </div>
                </div>
            """
            )

        parts.append(
            """
            </div>
        </div>
        """
        )
        return "".join(parts)
def _score_bar(score: float, width: int = 12) -> str:
    """Return empty string (ASCII bars removed for professional output)."""
    return ""


# ---------------------------------------------------------------------------
# Main computation function
# ---------------------------------------------------------------------------


def _coerce_float(value) -> float | None:
    """Return ``float(value)``, or None when value is None or not numeric."""
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def compute_trust_score(
    results: dict,
    weights: dict[str, float] | None = None,
) -> TrustScoreResult:
    """
    Compute the overall Trust Score from a TrustReport's results dict.

    The score is the weighted sum of the available sub-scores, minus
    weak-dimension penalties (capped at ``_MAX_TOTAL_PENALTY``), clipped to
    [0, 100]. Blocking conditions force grade ``"D"`` regardless of the
    numeric score.

    Parameters
    ----------
    results : dict
        The ``TrustReport.results`` dictionary.
    weights : dict, optional
        Custom dimension weights, overlaid on the defaults. Keys:
        ``"calibration"``, ``"failure"``, ``"bias"``, ``"representation"``.
        The weights of the dimensions present in ``results`` are renormalized
        to sum to 1.0. If None, uses default weights.

    Returns
    -------
    TrustScoreResult
        Structured score result with per-dimension breakdown.

    Examples
    --------
    >>> from trustlens.trust_score import compute_trust_score
    >>> result = compute_trust_score(report.results)
    >>> print(result)
    >>> print(result.score)  # e.g. 74
    >>> print(result.grade)  # e.g. 'B'
    """
    w = dict(_DEFAULT_WEIGHTS)
    if weights:
        w.update(weights)

    # ------------------------------------------------------------------
    # 1. Compute available sub-scores
    # ------------------------------------------------------------------
    scorers = (
        ("calibration", _calibration_score),
        ("failure", _failure_score),
        ("bias", _bias_score),
        ("representation", _representation_score),
    )
    sub_scores: dict[str, float] = {
        dim: scorer(results[dim]) for dim, scorer in scorers if dim in results
    }

    # ------------------------------------------------------------------
    # 2. Redistribute weights for missing dimensions
    # ------------------------------------------------------------------
    active_dims = [d for d in w if d in sub_scores]
    total_active_weight = sum(w[d] for d in active_dims)

    weights_used: dict[str, float]
    if total_active_weight > 0:
        weights_used = {d: w[d] / total_active_weight for d in active_dims}
    else:
        # Fallback: equal weights
        weights_used = {d: 1.0 / len(active_dims) for d in active_dims}

    # ------------------------------------------------------------------
    # 3. Weighted sum and Weak-Dimension Penalties → final score
    # ------------------------------------------------------------------
    raw_score = sum(sub_scores[d] * weights_used[d] for d in active_dims)

    total_penalty = 0.0
    penalties_applied: dict[str, float] = {}

    # Scaled failure penalty (if under 60.0, apply linearly up to _MAX_PENALTY_FAILURE)
    failure_score = sub_scores.get("failure", 100.0)
    if failure_score < 60.0:
        penalty = _MAX_PENALTY_FAILURE * ((60.0 - failure_score) / 60.0)
        actual_p = float(np.clip(penalty, 0.0, _MAX_PENALTY_FAILURE))
        total_penalty += actual_p
        penalties_applied["Failure"] = round(actual_p, 1)

    # ECE is coerced once and reused by the penalty and the blocker checks,
    # so a None or non-numeric value is treated the same way in both places.
    calibration_data = results.get("calibration", {})
    if not isinstance(calibration_data, dict):
        calibration_data = {}
    ece = _coerce_float(calibration_data.get("ece"))

    # Scaled calibration penalty (if ece > 0.05, apply linearly)
    if ece is not None and ece > 0.05:
        # ECE=0.15 gives max penalty
        penalty = _MAX_PENALTY_CALIBRATION * ((ece - 0.05) / 0.10)
        actual_p = float(np.clip(penalty, 0.0, _MAX_PENALTY_CALIBRATION))
        total_penalty += actual_p
        penalties_applied["Calibration"] = round(actual_p, 1)

    bias_has_severe_violation = False
    max_gap = 0.0
    bias_module = results.get("bias", {})

    # Consolidate subgroup and equalized_odds into a single fairness penalty
    for feat_data in bias_module.get("subgroup_performance", {}).values():
        if not isinstance(feat_data, dict):
            continue
        gap_val = _coerce_float(feat_data.get("__summary__", {}).get("performance_gap", 0.0))
        if gap_val is None:
            continue
        max_gap = max(max_gap, gap_val)
        if gap_val > 0.15:
            bias_has_severe_violation = True

    for val in bias_module.get("equalized_odds", {}).values():
        if not isinstance(val, dict):
            continue
        summary = val.get("__summary__", {})
        if summary.get("tpr_violation") == "severe" or summary.get("fpr_violation") == "severe":
            bias_has_severe_violation = True
            break

    if bias_has_severe_violation:
        actual_p = float(_MAX_PENALTY_FAIRNESS)
        total_penalty += actual_p
        penalties_applied["Fairness"] = round(actual_p, 1)
    elif max_gap > 0.05:
        # Scale penalty based on gap from 0.05 up to 0.15
        penalty = _MAX_PENALTY_FAIRNESS * ((max_gap - 0.05) / 0.10)
        actual_p = float(np.clip(penalty, 0.0, _MAX_PENALTY_FAIRNESS))
        total_penalty += actual_p
        penalties_applied["Fairness"] = round(actual_p, 1)

    # Cap total penalty to preserve general score variance
    if total_penalty > _MAX_TOTAL_PENALTY:
        scale = _MAX_TOTAL_PENALTY / total_penalty
        for k in penalties_applied:
            penalties_applied[k] = round(penalties_applied[k] * scale, 1)
        total_penalty = float(_MAX_TOTAL_PENALTY)

    base_score = int(round(float(np.clip(raw_score, 0.0, 100.0))))
    raw_score -= total_penalty
    final_score = int(round(float(np.clip(raw_score, 0.0, 100.0))))

    breakdown = {d: round(sub_scores[d] * weights_used[d], 2) for d in active_dims}

    # ------------------------------------------------------------------
    # 4. Assign grade & Check Blockers
    # ------------------------------------------------------------------
    # Unusable values behave like missing ones (0.0) instead of raising a
    # TypeError when compared with the thresholds below.
    conf_gap = _coerce_float(
        results.get("failure", {}).get("confidence_gap", {}).get("gap", 0.0)
    )
    if conf_gap is None:
        conf_gap = 0.0
    ece_val = ece if ece is not None else 0.0

    is_confidently_wrong = failure_score < 50.0 and ece_val > 0.15 and conf_gap < 0.05

    is_blocked = False
    block_reason = ""

    # Hierarchy: Failure > Fairness > Calibration
    if is_confidently_wrong:
        is_blocked = True
        block_reason = (
            "Blocked by 'confidently wrong' behavior (mismatched confidence-weighted errors)"
        )
    elif failure_score < 40.0:
        is_blocked = True
        block_reason = (
            "Blocked by high diagnostic risk (misaligned confidence-weighted error distribution)"
        )
    elif bias_has_severe_violation:
        is_blocked = True
        block_reason = "Blocked by severe fairness violations"
    elif ece_val > 0.1:
        is_blocked = True
        block_reason = "Blocked due to poor calibration (ECE > 0.1)"

    if is_blocked:
        grade = "D"
        verdict = f"Low Trust - {block_reason}"
    else:
        grade, verdict = "D", "Low Trust - serious issues"
        for threshold, g, v in _GRADE_THRESHOLDS:
            if final_score >= threshold:
                grade, verdict = g, v
                break

    return TrustScoreResult(
        score=final_score,
        grade=grade,
        verdict=verdict,
        sub_scores={d: round(sub_scores[d], 1) for d in active_dims},
        weights_used={d: round(weights_used[d], 3) for d in active_dims},
        breakdown=breakdown,
        penalties_applied=penalties_applied,
        base_score=base_score,
        is_blocked=is_blocked,
    )