# Source code for trustlens.trust_score

"""
trustlens.trust_score.
======================
The TrustLens Trust Score — a single 0–100 composite measure of model
trustworthiness.

Why a single score?
-------------------
Practitioners face "metric overload": ECE, Brier Score, silhouette scores,
confidence gaps — great individually but hard to act on as a whole.

The Trust Score distils all TrustLens analysis into one instantly readable
number:

 * **< 40** — Serious issues. Do not deploy.
 * **40–60** — Moderate trust. Investigate flagged dimensions.
 * **60–80** — Good. Minor improvements recommended.
 * **80–100** — High trust. Model is production-ready.

Formula
-------
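The base Trust Score is a weighted sum of four normalized sub-scores (0–100 each):

 TrustScore = w_cal * CalibrationScore
       + w_fail * FailureScore
       + w_bias * BiasScore
       + w_rep * RepresentationScore

Penalties for a weak failure score, a high ECE, or a large subgroup gap
(capped at 35 points in total) are then subtracted from this base score, and
hard blockers (e.g. severe fairness violations or ECE > 0.1) force grade D
regardless of the numeric score.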

Default weights (tuned to reflect deployment risk):
 w_cal = 0.35  (calibration matters most — drives overconfidence risk)
 w_fail = 0.30  (failure patterns drive safety risk)
 w_bias = 0.25  (bias drives fairness/regulatory risk)
 w_rep = 0.10  (representation is a bonus signal; not always available)

If a dimension is unavailable (e.g., no embeddings → no representation score),
its weight is redistributed proportionally to the other available dimensions.

Sub-score Normalization
-----------------------
All sub-scores are normalized to [0, 100]:

 * CalibrationScore = 100 × (1 - clip(BS + 1.5×ECE, 0, 1))
   - Brier Score and ECE are both in [0, 1]; lower is better.
   - Perfect calibration → 100. Any BS + 1.5×ECE ≥ 1 → 0.

 * FailureScore = 100 × (0.8 × clip(confidence_gap, 0, 1)
             + 0.2 × (1 - clip(error_rate, 0, 1)))
   - Confidence gap in [0, 1] (clipped); higher is better.
   - A model that is highly confident *only* when correct and makes no
     errors → 100.

 * BiasScore = 100 × (1 - clip(bias_penalty, 0, 1))
   - bias_penalty = 0.5 × clip((imbalance_ratio - 1) / 19, 0, 1)
           + 0.5 × clip(subgroup_gap, 0, 1)
   - subgroup_gap is the largest performance gap across sensitive features.
   - Perfectly balanced dataset (ratio 1), zero subgroup gap → 100.

 * RepresentationScore = 100 × clip(0.5 + 0.5 × silhouette, 0, 1)
   - Silhouette ∈ [-1, 1]; mapped linearly to [0, 100].
   - Perfect separation (silhouette = 1) → 100. Overlapping clusters
     (silhouette ≈ 0) → 50. Silhouette = -1 → 0.

References
----------
* Brier (1950), Guo et al. (2017) — calibration
* Hardt et al. (2016) — fairness
* Rousseeuw (1987) — silhouette
"""

from __future__ import annotations

from dataclasses import dataclass, field

import numpy as np

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Default per-dimension weights (they sum to 1.0). compute_trust_score copies
# this dict, overlays any caller-supplied weights, and renormalizes over the
# dimensions that are actually present in the results.
_DEFAULT_WEIGHTS: dict[str, float] = {
    "calibration": 0.35,
    "failure": 0.30,
    "bias": 0.25,
    "representation": 0.10,
}

# (minimum score, grade, verdict) rows ordered from the highest threshold to
# the lowest; compute_trust_score picks the first row whose threshold the
# final score meets, so the order matters.
_GRADE_THRESHOLDS = [
    (80, "A", "High Trust - production-ready"),
    (60, "B", "Good Trust - minor issues to address"),
    (40, "C", "Moderate Trust - investigate flagged dimensions"),
    (0, "D", "Low Trust - serious issues, do not deploy"),
]

# Penalty caps, in score points, subtracted from the weighted sum by
# compute_trust_score.
# Failure: scales linearly as the failure sub-score drops from 60 to 0.
_MAX_PENALTY_FAILURE = 20.0
# Calibration: scales linearly for ECE between 0.05 and 0.15.
_MAX_PENALTY_CALIBRATION = 15.0
# Fairness: scales linearly for a subgroup gap between 0.05 and 0.15; applied
# in full for any severe violation.
_MAX_PENALTY_FAIRNESS = 15.0
# Ceiling on the sum of all penalties; individual penalties are scaled down
# proportionally when their total exceeds it.
_MAX_TOTAL_PENALTY = 35.0


# ---------------------------------------------------------------------------
# Sub-score computers
# ---------------------------------------------------------------------------


def _calibration_score(cal_data: dict) -> float:
    """
    Compute calibration sub-score (0–100).

    CalibScore = 100 × (1 − clip(BS + 1.5×ECE, 0, 1))

    Parameters
    ----------
    cal_data : dict
      Calibration results. Reads the ``"brier_score"`` and ``"ece"`` keys.

    Returns
    -------
    float
      Score in [0, 100]; 100 when both metrics are 0, and 0 as soon as
      ``BS + 1.5×ECE`` reaches 1.

    Notes
    -----
    A metric that is missing, ``None`` or NaN is replaced by the pessimistic
    default of 0.5, so an unavailable metric lowers the score instead of
    raising or producing a NaN score.
    """

    def _metric(key: str) -> float:
        # Treat None / NaN exactly like a missing key (default 0.5).
        raw = cal_data.get(key)
        if raw is None:
            return 0.5
        value = float(raw)
        return 0.5 if np.isnan(value) else value

    composite = _metric("brier_score") + 1.5 * _metric("ece")
    return 100.0 * (1.0 - float(np.clip(composite, 0.0, 1.0)))


def _failure_score(fail_data: dict) -> float:
    """
    Compute failure sub-score (0–100).

    FailScore = 100 × clip(0.8 × clip(gap, 0, 1)
                           + 0.2 × (1 − clip(error_rate, 0, 1)), 0, 1)

    The confidence gap carries 80% of the score: a large gap means the model
    is confident when right and uncertain when wrong. The remaining 20%
    rewards a low overall error rate.

    Parameters
    ----------
    fail_data : dict
      Failure-analysis results. Reads ``confidence_gap["gap"]`` (default 0.0)
      and ``misclassification_summary["__overall__"]["overall_error_rate"]``
      (default 0.5).

    Returns
    -------
    float
      Score in [0, 100].
    """
    raw_gap = float(fail_data.get("confidence_gap", {}).get("gap", 0.0))

    overall_stats = fail_data.get("misclassification_summary", {}).get("__overall__", {})
    raw_error_rate = float(overall_stats.get("overall_error_rate", 0.5))

    # Both ingredients are squeezed into [0, 1] before blending.
    gap_term = float(np.clip(raw_gap, 0.0, 1.0))
    accuracy_term = 1.0 - float(np.clip(raw_error_rate, 0.0, 1.0))

    blended = float(np.clip(0.8 * gap_term + 0.2 * accuracy_term, 0.0, 1.0))
    return 100.0 * blended


def _bias_score(bias_data: dict) -> float:
    """
    Compute bias sub-score (0–100).

    BiasScore = 100 × (1 − clip(bias_penalty, 0, 1))
    bias_penalty = 0.5 × clip((imbalance_ratio − 1) / 19, 0, 1)
           + 0.5 × clip(max_subgroup_performance_gap, 0, 1)

    Parameters
    ----------
    bias_data : dict
      Bias-analysis results. Reads ``class_imbalance["imbalance_ratio"]``
      (default 1.0) and, for every entry of ``subgroup_performance``, its
      ``["__summary__"]["performance_gap"]`` (default 0.0).

    Returns
    -------
    float
      Score in [0, 100]; 100 for a ratio of 1 and no subgroup gap.

    Notes
    -----
    Subgroup entries that are not dicts, and gaps that are ``None`` or not
    numeric, are skipped rather than raising, matching how
    ``compute_trust_score`` reads the same data.
    """
    imbalance = bias_data.get("class_imbalance", {})
    ratio = float(imbalance.get("imbalance_ratio", 1.0))
    # A ratio of 1 (balanced) costs nothing; a ratio of 20 or more costs the maximum.
    imbalance_penalty = float(np.clip((ratio - 1.0) / 19.0, 0.0, 1.0))

    # Subgroup performance gap (worst across all sensitive features)
    max_gap = 0.0
    for feat_data in bias_data.get("subgroup_performance", {}).values():
        if not isinstance(feat_data, dict):
            continue
        gap = feat_data.get("__summary__", {}).get("performance_gap", 0.0)
        if gap is None:
            continue
        try:
            max_gap = max(max_gap, float(gap))
        except (TypeError, ValueError):
            # Unparseable gap: ignore this feature instead of failing the score.
            continue

    subgroup_penalty = float(np.clip(max_gap, 0.0, 1.0))

    bias_penalty = 0.5 * imbalance_penalty + 0.5 * subgroup_penalty
    return 100.0 * (1.0 - float(np.clip(bias_penalty, 0.0, 1.0)))


def _representation_score(rep_data: dict) -> float:
    """
    Compute representation sub-score (0–100).

    RepScore = 100 × clip(0.5 + 0.5 × silhouette, 0, 1)

    Parameters
    ----------
    rep_data : dict
      Representation results. Reads ``separability["silhouette_score"]``
      (default 0.0).

    Returns
    -------
    float
      Score in [0, 100]; a silhouette of 0, or a NaN silhouette, gives 50.
    """
    silhouette = float(rep_data.get("separability", {}).get("silhouette_score", 0.0))

    # A NaN silhouette is treated as the neutral value 0.
    silhouette = 0.0 if np.isnan(silhouette) else silhouette

    unit_score = np.clip(0.5 + 0.5 * silhouette, 0.0, 1.0)
    return 100.0 * float(unit_score)


# ---------------------------------------------------------------------------
# TrustScoreResult dataclass
# ---------------------------------------------------------------------------



# [docs]
@dataclass
class TrustScoreResult:
    """
    Structured result from the Trust Score computation.

    Attributes
    ----------
    score : int
      Overall Trust Score in [0, 100].
    grade : str
      Letter grade: A / B / C / D.
    verdict : str
      Plain-English deployment recommendation.
    sub_scores : dict
      Per-dimension scores in [0, 100].
    weights_used : dict
      Actual weights used (after redistribution for missing dimensions).
    breakdown : dict
      Weighted contribution of each dimension to the final score.
    penalties_applied : dict
      Penalty points subtracted from the weighted sum, keyed by category.
    base_score : int
      Weighted-sum score before penalties were subtracted.
    is_blocked : bool
      True when a blocker forced the grade regardless of the numeric score.
    """

    score: int
    grade: str
    verdict: str
    sub_scores: dict[str, float] = field(default_factory=dict)
    weights_used: dict[str, float] = field(default_factory=dict)
    breakdown: dict[str, float] = field(default_factory=dict)
    penalties_applied: dict[str, float] = field(default_factory=dict)
    base_score: int = 0
    is_blocked: bool = False

    def __str__(self) -> str:
        """Return a plain-text summary: headline, verdict, and one line per dimension."""
        lines = [
            f"Trust Score: {self.score}/100 [{self.grade}]",
            f"Assessment : {self.verdict}",
            "\nDimension Breakdown:",
        ]
        for dim, score in self.sub_scores.items():
            lines.append(f"  - {dim:<18} {score:5.1f}/100")
        return "\n".join(lines)

    def __repr__(self) -> str:
        """Return a compact representation showing only score and grade."""
        return f"TrustScoreResult(score={self.score}, grade={self.grade!r})"

    def _repr_html_(self) -> str:
        """Rich HTML representation for Jupyter notebooks."""
        # Imported lazily so the plotting helpers are only needed when a
        # notebook actually renders the result.
        from trustlens.visualization.summary_plot import _C, _color_for_grade, _color_for_score

        gc = _color_for_grade(self.grade)

        # NOTE: verdict and dimension names are interpolated without HTML escaping.
        html = f"""
        <div style="font-family: 'Segoe UI', Roboto, Helvetica, Arial, sans-serif;
                    max-width: 450px; padding: 20px; border-radius: 12px;
                    border: 1px solid {gc}40; background-color: #ffffff;
                    box-shadow: 0 4px 12px rgba(0,0,0,0.05); margin: 10px 0;">
            <div style="display: flex; align-items: center; justify-content: space-between; margin-bottom: 15px;">
                <div style="font-size: 14px; font-weight: 600; color: {_C["gray"]}; text-transform: uppercase; letter-spacing: 0.5px;">
                    Trust Analysis Result
                </div>
                <div style="padding: 4px 12px; border-radius: 20px; background-color: {gc}; color: white;
                            font-size: 13px; font-weight: 700;">
                    GRADE {self.grade}
                </div>
            </div>

            <div style="display: flex; align-items: baseline; margin-bottom: 8px;">
                <span style="font-size: 48px; font-weight: 800; color: {gc}; line-height: 1;">{self.score}</span>
                <span style="font-size: 20px; font-weight: 600; color: {_C["gray"]}; margin-left: 4px;">/100</span>
            </div>

            <div style="font-size: 16px; font-weight: 600; color: {_C["dark"]}; margin-bottom: 20px;">
                {self.verdict}
            </div>

            <div style="border-top: 1px solid #f0f0f0; padding-top: 15px;">
                <div style="font-size: 12px; font-weight: 700; color: {_C["gray"]}; margin: 12px 0 8px 0; text-transform: uppercase;">
                    Dimension Breakdown
                </div>
        """

        # One labelled progress bar per dimension; the bar width is the score in percent.
        for dim, score in self.sub_scores.items():
            sc = _color_for_score(score)
            html += f"""
                <div style="margin-bottom: 10px;">
                    <div style="display: flex; justify-content: space-between; font-size: 13px; margin-bottom: 4px;">
                        <span style="color: {_C["dark"]}; font-weight: 500;">{dim.capitalize()}</span>
                        <span style="color: {sc}; font-weight: 700;">{score:.1f}</span>
                    </div>
                    <div style="width: 100%; height: 6px; background-color: #f0f0f0; border-radius: 3px; overflow: hidden;">
                        <div style="width: {score}%; height: 100%; background-color: {sc}; border-radius: 3px;"></div>
                    </div>
                </div>
            """

        html += """
            </div>
        </div>
        """
        return html



def _score_bar(score: float, width: int = 12) -> str:
    """
    Return an empty string in place of the former ASCII score bar.

    Both ``score`` and ``width`` are accepted but ignored, so existing call
    sites keep working while the output stays free of ASCII bars.
    """
    return ""


# ---------------------------------------------------------------------------
# Main computation function
# ---------------------------------------------------------------------------


def _float_or_default(value: object, default: float) -> float:
    """Return ``float(value)``, or *default* when *value* is None or not numeric."""
    if value is None:
        return default
    try:
        return float(value)
    except (TypeError, ValueError):
        return default


def compute_trust_score(
    results: dict,
    weights: dict[str, float] | None = None,
) -> TrustScoreResult:
    """
    Compute the overall Trust Score from a TrustReport's results dict.

    The score is the weighted sum of the available sub-scores, minus
    penalties for a weak failure score, a high ECE and large fairness gaps
    (the penalties are capped in total). Independently of the numeric score,
    a blocker (confidently-wrong behaviour, a very low failure score, a
    severe fairness violation, or ECE > 0.1) forces grade ``"D"``.

    Parameters
    ----------
    results : dict
      The ``TrustReport.results`` dictionary. Only the ``"calibration"``,
      ``"failure"``, ``"bias"`` and ``"representation"`` entries are read;
      a missing entry removes that dimension from the score.
    weights : dict, optional
      Custom dimension weights. Keys: ``"calibration"``, ``"failure"``,
      ``"bias"``, ``"representation"``. They are overlaid on the default
      weights and then renormalized over the dimensions present in
      ``results``, so they do not have to sum to 1.0.
      If None, uses default weights.

    Returns
    -------
    TrustScoreResult
      Structured score result with per-dimension breakdown, the penalties
      applied, the pre-penalty base score and the blocked flag.

    Examples
    --------
    >>> from trustlens.trust_score import compute_trust_score
    >>> result = compute_trust_score(report.results)
    >>> print(result)
    >>> print(result.score)  # e.g. 74
    >>> print(result.grade)  # e.g. 'B'
    """
    w = dict(_DEFAULT_WEIGHTS)
    if weights:
        w.update(weights)

    # ------------------------------------------------------------------
    # 1. Compute available sub-scores
    # ------------------------------------------------------------------
    sub_scores: dict[str, float] = {}

    if "calibration" in results:
        sub_scores["calibration"] = _calibration_score(results["calibration"])

    if "failure" in results:
        sub_scores["failure"] = _failure_score(results["failure"])

    if "bias" in results:
        sub_scores["bias"] = _bias_score(results["bias"])

    if "representation" in results:
        sub_scores["representation"] = _representation_score(results["representation"])

    # ------------------------------------------------------------------
    # 2. Redistribute weights for missing dimensions
    # ------------------------------------------------------------------
    active_dims = [d for d in w if d in sub_scores]
    total_active_weight = sum(w[d] for d in active_dims)

    weights_used: dict[str, float] = {}
    if total_active_weight > 0:
        for dim in active_dims:
            weights_used[dim] = w[dim] / total_active_weight
    else:
        # Fallback: equal weights (the loop body only runs when dims exist)
        for dim in active_dims:
            weights_used[dim] = 1.0 / len(active_dims)

    # ------------------------------------------------------------------
    # 3. Weighted sum and Weak-Dimension Penalties → final score
    # ------------------------------------------------------------------
    raw_score = sum(sub_scores[d] * weights_used[d] for d in active_dims)
    total_penalty = 0.0
    penalties_applied: dict[str, float] = {}

    # Scaled failure penalty (if under 60.0, apply linearly up to _MAX_PENALTY_FAILURE)
    failure_score = sub_scores.get("failure", 100.0)
    if failure_score < 60.0:
        penalty = _MAX_PENALTY_FAILURE * ((60.0 - failure_score) / 60.0)
        actual_p = float(np.clip(penalty, 0.0, _MAX_PENALTY_FAILURE))
        total_penalty += actual_p
        penalties_applied["Failure"] = round(actual_p, 1)

    # Parse ECE once; a missing, None or non-numeric ECE counts as 0.0 both for
    # the penalty here and for the blocker checks in step 4.
    calibration_data = results.get("calibration", {})
    ece_val = (
        _float_or_default(calibration_data.get("ece"), 0.0)
        if isinstance(calibration_data, dict)
        else 0.0
    )

    # Scaled calibration penalty (if ece > 0.05, apply linearly)
    if ece_val > 0.05:
        # ECE=0.15 gives max penalty
        penalty = _MAX_PENALTY_CALIBRATION * ((ece_val - 0.05) / 0.10)
        actual_p = float(np.clip(penalty, 0.0, _MAX_PENALTY_CALIBRATION))
        total_penalty += actual_p
        penalties_applied["Calibration"] = round(actual_p, 1)

    bias_has_severe_violation = False
    max_gap = 0.0
    bias_module = results.get("bias", {})

    # Consolidate subgroup and equalized_odds into a single fairness penalty
    for feat_data in bias_module.get("subgroup_performance", {}).values():
        if not isinstance(feat_data, dict):
            continue
        gap_val = _float_or_default(
            feat_data.get("__summary__", {}).get("performance_gap", 0.0), 0.0
        )
        max_gap = max(max_gap, gap_val)
        if gap_val > 0.15:
            bias_has_severe_violation = True

    for val in bias_module.get("equalized_odds", {}).values():
        if not isinstance(val, dict):
            continue
        summary = val.get("__summary__", {})
        if summary.get("tpr_violation") == "severe" or summary.get("fpr_violation") == "severe":
            bias_has_severe_violation = True
            break

    if bias_has_severe_violation:
        actual_p = float(_MAX_PENALTY_FAIRNESS)
        total_penalty += actual_p
        penalties_applied["Fairness"] = round(actual_p, 1)
    elif max_gap > 0.05:
        # Scale penalty based on gap from 0.05 up to 0.15
        penalty = _MAX_PENALTY_FAIRNESS * ((max_gap - 0.05) / 0.10)
        actual_p = float(np.clip(penalty, 0.0, _MAX_PENALTY_FAIRNESS))
        total_penalty += actual_p
        penalties_applied["Fairness"] = round(actual_p, 1)

    # Cap total penalty to preserve general score variance
    if total_penalty > _MAX_TOTAL_PENALTY:
        scale = _MAX_TOTAL_PENALTY / total_penalty
        for k in penalties_applied:
            penalties_applied[k] = round(penalties_applied[k] * scale, 1)
        total_penalty = float(_MAX_TOTAL_PENALTY)

    # base_score is the weighted sum before penalties are subtracted.
    base_score = int(round(float(np.clip(raw_score, 0.0, 100.0))))
    raw_score -= total_penalty

    final_score = int(round(float(np.clip(raw_score, 0.0, 100.0))))

    breakdown = {d: round(sub_scores[d] * weights_used[d], 2) for d in active_dims}

    # ------------------------------------------------------------------
    # 4. Assign grade & Check Blockers
    # ------------------------------------------------------------------
    # Coerce the gap the same way as ECE so a None or string value cannot
    # raise TypeError in the comparisons below.
    conf_gap = _float_or_default(
        results.get("failure", {}).get("confidence_gap", {}).get("gap", 0.0), 0.0
    )
    is_confidently_wrong = failure_score < 50.0 and ece_val > 0.15 and conf_gap < 0.05

    is_blocked = False
    block_reason = ""
    # Hierarchy: Failure > Fairness > Calibration
    if is_confidently_wrong:
        is_blocked = True
        block_reason = (
            "Blocked by 'confidently wrong' behavior (mismatched confidence-weighted errors)"
        )
    elif failure_score < 40.0:
        is_blocked = True
        block_reason = (
            "Blocked by high diagnostic risk (misaligned confidence-weighted error distribution)"
        )

    elif bias_has_severe_violation:
        is_blocked = True
        block_reason = "Blocked by severe fairness violations"
    elif ece_val > 0.1:
        is_blocked = True
        block_reason = "Blocked due to poor calibration (ECE > 0.1)"

    if is_blocked:
        # A blocker overrides the numeric grade; the score itself is still reported.
        grade = "D"
        verdict = f"Low Trust - {block_reason}"
    else:
        grade, verdict = "D", "Low Trust - serious issues"
        for threshold, g, v in _GRADE_THRESHOLDS:
            if final_score >= threshold:
                grade, verdict = g, v
                break

    return TrustScoreResult(
        score=final_score,
        grade=grade,
        verdict=verdict,
        sub_scores={d: round(sub_scores[d], 1) for d in active_dims},
        weights_used={d: round(weights_used[d], 3) for d in active_dims},
        breakdown=breakdown,
        penalties_applied=penalties_applied,
        base_score=base_score,
        is_blocked=is_blocked,
    )