diff --git a/README.md b/README.md index 306443d..5e62300 100644 --- a/README.md +++ b/README.md @@ -251,6 +251,30 @@ judgearena \ | `--n_bootstraps` | `20` | Bootstrap samples for ELO confidence intervals | | `--swap_mode` | `fixed` | `fixed`: single judge pass; `both`: correct for position bias | | `--result_folder` | `results` | Directory where annotations and results are saved | +| `--soft-elo` | off | Use continuous judge preferences (soft Bradley-Terry) instead of hard win/loss/tie labels | +| `--soft-elo-temperature` | `0.3` | Initial softmax temperature for `--soft-elo`; overridden if `--calibrate-temperature` succeeds | +| `--calibrate-temperature` | off | MLE-calibrate the score-to-preference temperature against human arena annotations (requires `--soft-elo`) | +| `--calibration-size` | all | Number of human battles to sample for calibration (requires `--calibrate-temperature`) | + +### Soft-ELO & temperature calibration + +By default, judge scores are discretised to hard win/loss/tie labels. Passing `--soft-elo` instead converts the raw score +difference into a continuous preference via a softmax, which is then fed into a soft Bradley-Terry model. + +To let the data choose the best temperature automatically, add `--calibrate-temperature`. +JudgeArena will run the judge on a sample of human-annotated arena battles, fit the temperature $T^*$ by MLE, and +use it for the full evaluation: + +```bash +judgearena \ + --task elo-lmarena-100k \ + --model_A Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \ + --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ + --n_instructions 200 \ + --soft-elo \ + --calibrate-temperature \ + --calibration-size 300 +``` ### Output diff --git a/judgearena/cli.py b/judgearena/cli.py index eb94c83..42840e9 100644 --- a/judgearena/cli.py +++ b/judgearena/cli.py @@ -113,6 +113,32 @@ def _build_parser() -> argparse.ArgumentParser: default=None, help="[elo] Model anchored at 1000 ELO (ratings are reported relative to it).", ) + parser.add_argument( + "--soft-elo", + action="store_true", + help="[elo] Use continuous judge preferences as soft BT targets instead of " + "discretising to hard win/loss/tie.", + ) + parser.add_argument( + "--soft-elo-temperature", + type=float, + default=0.3, + help="[elo] Initial PairScore temperature for --soft-elo. " + "Overridden by --calibrate-temperature if calibration succeeds.", + ) + parser.add_argument( + "--calibrate-temperature", + action="store_true", + help="[elo] MLE-fit the PairScore temperature against human-labeled arena " + "battles before the main run. Requires --soft-elo.", + ) + parser.add_argument( + "--calibration-size", + type=int, + default=None, + help="[elo] Number of human arena battles to sample for temperature " + "calibration. Defaults to all. 
Requires --calibrate-temperature.", + ) add_common_arguments(parser) return parser @@ -191,6 +217,10 @@ def _build_elo_args( n_bootstraps=args.n_bootstraps, seed=args.seed, baseline_model=args.baseline_model, + soft_elo=args.soft_elo, + soft_elo_temperature=args.soft_elo_temperature, + calibrate_temperature=args.calibrate_temperature, + calibration_size=args.calibration_size, judge_model=args.judge_model, n_instructions=args.n_instructions, provide_explanation=args.provide_explanation, diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py index 51ba6e2..1b17637 100644 --- a/judgearena/estimate_elo_ratings.py +++ b/judgearena/estimate_elo_ratings.py @@ -8,7 +8,7 @@ from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe from judgearena.cli_common import BaseCliArgs -from judgearena.evaluate import judge_and_parse_prefs +from judgearena.evaluate import PairScore, calibrate_temperature, judge_and_parse_prefs from judgearena.generate import generate_instructions from judgearena.log import get_logger from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model @@ -33,120 +33,142 @@ class CliEloArgs(BaseCliArgs): n_bootstraps: int = 20 seed: int = 0 baseline_model: str | None = None + soft_elo: bool = False + soft_elo_temperature: float = 0.3 + calibrate_temperature: bool = False + calibration_size: int | None = None -def compute_bradley_terry( +def _winner_to_pref(winner: str) -> float | None: + """Convert a hard winner label to a continuous preference value.""" + if winner == "model_a": + return 0.0 + elif winner == "model_b": + return 1.0 + elif winner in ("tie", "tie (bothbad)"): + return 0.5 + return None + + +def _is_nan_pref(p) -> bool: + return p is None or (isinstance(p, float) and np.isnan(p)) + + +def fit_bradley_terry( df: pd.DataFrame, - winner_col: str, + pref_col: str = "pref", scale: float = 400, base: float = 10, init_rating: float = 1000, baseline_model: str | None = None, baseline_rating: float = 1000, ) -> dict[str, float]: - """ - Compute Bradley-Terry ratings using MLE (logistic regression). - - This method fits a Bradley-Terry model to pairwise comparison data using - maximum likelihood estimation via logistic regression. - - Args: - df: DataFrame with columns 'model_a', 'model_b', and the winner column - winner_col: Name of the column containing the winner - scale: Scale factor for ELO conversion (default 400) - base: Base for logarithm in ELO formula (default 10) - init_rating: Initial rating offset (default 1000) - baseline_model: Model to anchor at baseline_rating - baseline_rating: Rating to assign to the baseline model - - Returns: - Dictionary mapping model names to their Bradley-Terry ratings - """ - # Get all unique models - all_models = sorted(set(df["model_a"].unique()) | set(df["model_b"].unique())) - - # Create pivot tables for wins - ptbl_a_win = pd.pivot_table( - df[df[winner_col] == "model_a"], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) + """Fit Bradley-Terry ratings via weighted logistic regression. - ptbl_b_win = pd.pivot_table( - df[df[winner_col] == "model_b"], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) + Each row in *df* is a battle with columns ``model_a``, ``model_b`` and + ``pref_col`` ∈ [0, 1] where 0 means A wins, 1 means B wins, 0.5 is a tie. + Hard win/loss/tie labels are the special case ``pref ∈ {0, 0.5, 1}``. 
- # Handle ties - if sum(df[winner_col].isin(["tie", "tie (bothbad)"])) == 0: - ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models) - else: - ptbl_tie = pd.pivot_table( - df[df[winner_col].isin(["tie", "tie (bothbad)"])], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) - ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0) - ptbl_tie = ptbl_tie + ptbl_tie.T + The soft cross-entropy for a battle is decomposed into two weighted + hard-label rows so sklearn's ``LogisticRegression`` can be reused: - # Reindex all pivot tables to have consistent dimensions - ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0) - ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0) + Y=1, weight = (1 − pref) · count (evidence A wins) + Y=0, weight = pref · count (evidence B wins) - # Combined win matrix (ties count as 0.5 for each) - ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie + Identical ``(model_a, model_b, pref)`` triples are aggregated first so + the design matrix stays small when prefs are quantised (e.g. human + arena labels) and untouched when prefs are continuous floats. + """ + df = df.dropna(subset=[pref_col]) + if df.empty: + return {} - models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index) + grouped = ( + df.groupby(["model_a", "model_b", pref_col]).size().reset_index(name="count") + ) + all_models = sorted(set(grouped["model_a"]) | set(grouped["model_b"])) + models = pd.Series(np.arange(len(all_models)), index=all_models) p = len(models) - X = np.zeros([p * (p - 1) * 2, p]) - Y = np.zeros(p * (p - 1) * 2) - - cur_row = 0 - sample_weights = [] - for m_a in ptbl_win.index: - for m_b in ptbl_win.columns: - if m_a == m_b: - continue - # Skip if nan or no battles between this pair - w_ab = ptbl_win.loc[m_a, m_b] - w_ba = ptbl_win.loc[m_b, m_a] - if np.isnan(w_ab) or np.isnan(w_ba): - continue - if w_ab == 0 and w_ba == 0: - continue - X[cur_row, models[m_a]] = +np.log(base) - X[cur_row, models[m_b]] = -np.log(base) - Y[cur_row] = 1.0 - sample_weights.append(w_ab) - - X[cur_row + 1, models[m_a]] = np.log(base) - X[cur_row + 1, models[m_b]] = -np.log(base) - Y[cur_row + 1] = 0.0 - sample_weights.append(w_ba) - cur_row += 2 - - X = X[:cur_row] - Y = Y[:cur_row] + + m_a_idx = grouped["model_a"].map(models).to_numpy() + m_b_idx = grouped["model_b"].map(models).to_numpy() + prefs = grouped[pref_col].to_numpy(dtype=float) + counts = grouped["count"].to_numpy(dtype=float) + n = len(grouped) + + log_base = np.log(base) + X = np.zeros((2 * n, p)) + top = np.arange(n) + bot = n + top + X[top, m_a_idx] = +log_base + X[top, m_b_idx] = -log_base + X[bot, m_a_idx] = +log_base + X[bot, m_b_idx] = -log_base + + Y = np.concatenate([np.ones(n), np.zeros(n)]) + sample_weights = np.concatenate([(1.0 - prefs) * counts, prefs * counts]) + + # Keep zero-weight rows so sklearn LR always sees both Y classes — when + # every pref collapses to 0 or 1 the missing-class rows contribute nothing + # to the loss but stop the solver from raising on n_classes < 2. 
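+    #
+    # Worked example (hypothetical numbers): a grouped row with pref = 0.3 and
+    # count = 4 becomes a Y=1 row with weight (1 - 0.3) * 4 = 2.8 and a Y=0 row
+    # with weight 0.3 * 4 = 1.2, reproducing 4 times the soft cross-entropy
+    # -(0.7 * log(p) + 0.3 * log(1 - p)) where p is the fitted P(A beats B).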
+ if sample_weights.sum() == 0: + return {} lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000) lr.fit(X, Y, sample_weight=sample_weights) elo_scores = scale * lr.coef_[0] + init_rating - # Normalize to baseline model if specified if baseline_model is not None and baseline_model in models.index: elo_scores += baseline_rating - elo_scores[models[baseline_model]] return dict(pd.Series(elo_scores, index=models.index)) +def _prefs_to_battle_results( + prefs, + our_model_is_position_a, + opponent_models, + model_name: str, +) -> pd.DataFrame: + """Map per-battle judge prefs into model-name-level battle rows. + + The judge prompt placed our model at position A or B independently per + battle. Here we re-orient each row so ``model_a``/``model_b`` carry + the actual model names and ``pref`` is consistent with that ordering + (``pref=0`` ⇒ ``model_a`` wins). ``pref_hard`` is the quantised + {0, 0.5, 1} version used by the non-soft Bradley-Terry fit. + """ + records = [] + for pref, is_pos_a, opp in zip( + prefs, our_model_is_position_a, opponent_models, strict=True + ): + if _is_nan_pref(pref) or pref == 0.5: + winner = "tie" + elif pref < 0.5: + winner = "model_a" + else: + winner = "model_b" + + if is_pos_a: + rec = { + "model_a": model_name, + "model_b": opp, + "winner": winner, + "pref": pref, + } + else: + rec = { + "model_a": opp, + "model_b": model_name, + "winner": winner, + "pref": None if _is_nan_pref(pref) else 1.0 - pref, + } + rec["pref_hard"] = _winner_to_pref(winner) + records.append(rec) + return pd.DataFrame(records) + + def main(args: CliEloArgs) -> dict: rng = np.random.default_rng(args.seed) @@ -280,7 +302,7 @@ def run_judge() -> pd.DataFrame: max_tokens=args.max_out_tokens_judge, **judge_extra_kwargs, ) - annotations, _, prefs = judge_and_parse_prefs( + annotations, annotations_reversed, prefs = judge_and_parse_prefs( judge_chat_model=judge_chat_model, instructions=instructions.tolist(), completions_A=completions_A, @@ -290,16 +312,33 @@ def run_judge() -> pd.DataFrame: truncate_input_chars=args.truncate_all_input_chars, use_tqdm=use_tqdm, ) + if annotations_reversed is None: + row_annotations = list(annotations) + row_use_model_a = use_model_a_as_opponent + row_our_pos_a = our_model_is_position_a + row_opponents = list(opponent_models) + else: + # swap_mode="both": dataframe carries 2n rows (AB then BA). + # Position metadata is duplicated; prefs are already oriented + # consistently by judge_and_parse_prefs as [pref_AB, 1 - pref_BA]. 
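+            # For example, with n = 2 battles the frame rows are
+            # [b0_AB, b1_AB, b0_BA, b1_BA]; prefs[n + i] already equals
+            # 1 - pref_BA for battle i, so its position metadata is reused as-is.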
+ row_annotations = list(annotations) + list(annotations_reversed) + row_use_model_a = np.concatenate( + [use_model_a_as_opponent, use_model_a_as_opponent] + ) + row_our_pos_a = np.concatenate( + [our_model_is_position_a, our_model_is_position_a] + ) + row_opponents = list(opponent_models) + list(opponent_models) return pd.DataFrame( { - "judge_completion": [a.judge_completion for a in annotations], - "instruction": [a.instruction for a in annotations], - "completion_A": [a.completion_A for a in annotations], - "completion_B": [a.completion_B for a in annotations], + "judge_completion": [a.judge_completion for a in row_annotations], + "instruction": [a.instruction for a in row_annotations], + "completion_A": [a.completion_A for a in row_annotations], + "completion_B": [a.completion_B for a in row_annotations], "pref": prefs, - "use_model_a_as_opponent": use_model_a_as_opponent, - "our_model_is_position_a": our_model_is_position_a, - "opponent_model": opponent_models, + "use_model_a_as_opponent": row_use_model_a, + "our_model_is_position_a": row_our_pos_a, + "opponent_model": row_opponents, } ) @@ -318,30 +357,11 @@ def run_judge() -> pd.DataFrame: logger.debug("First judge output:\n%s", df_judge["judge_completion"].iloc[0][:500]) - # Map preferences back to model-name-level battle results + # Map preferences back to model-name-level battle results. model_name = args.model - battle_results = [] - for pref, is_pos_a, opp_model in zip( - prefs, our_model_is_position_a, opponent_models, strict=True - ): - if pref is None or pref == 0.5: - winner = "tie" - elif pref < 0.5: - winner = "model_a" - else: - winner = "model_b" - - if is_pos_a: - battle_results.append( - {"model_a": model_name, "model_b": opp_model, "winner": winner} - ) - else: - battle_results.append( - {"model_a": opp_model, "model_b": model_name, "winner": winner} - ) - - # LLM-judge battle results for our model - df_llm_judge = pd.DataFrame(battle_results) + df_llm_judge = _prefs_to_battle_results( + prefs, our_model_is_position_a, opponent_models, model_name + ) # Normalize prefs so pref < 0.5 always means our model wins, then summarise prefs_normalized = pd.Series( @@ -357,12 +377,15 @@ def run_judge() -> pd.DataFrame: winrate = summary["winrate"] print(f"\n=== Results for {model_name} ===") - print(f"Battles: {n} | Wins: {our_wins} | Losses: {our_losses} | Ties: {our_ties}") + print( + f"Battles: {len(df_llm_judge)} | Wins: {our_wins} | " + f"Losses: {our_losses} | Ties: {our_ties}" + ) print(f"Win rate: {winrate:.2%}") # Combine LLM-judge battles with human-annotated arena battles, # keeping only arena models with at least 500 human battles - df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]] + df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]].copy() human_battle_counts = pd.concat( [df_arena["model_a"], df_arena["model_b"]] ).value_counts() @@ -371,16 +394,153 @@ def run_judge() -> pd.DataFrame: df_arena["model_a"].isin(well_represented) & df_arena["model_b"].isin(well_represented) ] + # Add pref column to arena battles (hard labels → 0.0 / 1.0 / 0.5). + # Human labels are already hard, so pref_hard == pref. 
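+    # (_winner_to_pref maps "model_a" -> 0.0, "model_b" -> 1.0, "tie"/"tie (bothbad)"
+    # -> 0.5; any other label becomes None and is dropped by fit_bradley_terry.)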
+ df_arena["pref"] = df_arena["winner"].map(_winner_to_pref) + df_arena["pref_hard"] = df_arena["pref"] + df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True) - # Bootstrap Bradley-Terry ELO ratings + # Compute human-only BT ratings as ground-truth reference + human_elo = fit_bradley_terry( + df_arena, pref_col="pref_hard", baseline_model=args.baseline_model + ) + + # --- Temperature calibration (optional) --- + # Run the judge on a random subset of human arena battles that already + # have ground-truth winner labels so we can fit T* via MLE. + calibrated_temperature: float | None = None + if args.calibrate_temperature: + if not args.soft_elo: + logger.warning( + "--calibrate-temperature has no effect without --soft-elo; skipping." + ) + else: + logger.info("Calibrating PairScore temperature against human annotations.") + # Sample calibration battles from the already-loaded arena battles. + # Use the same judge to score them so scores and labels are comparable. + _cal_n = ( + min(args.calibration_size, len(df_arena)) + if args.calibration_size is not None + else len(df_arena) + ) + # Keep the original df_arena_all index so we can look up the full + # conversation rows below; reset_index would point at non-existent + # 0..N labels in df_arena_all. + cal_battles = df_arena.sample( + n=_cal_n, random_state=int(rng.integers(0, 2**31)) + ) + + cal_instructions = [ + _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][0]) + for i in cal_battles.index + ] + cal_completions_a = [ + _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][1]) + for i in cal_battles.index + ] + cal_completions_b = [ + _extract_instruction_text(df_arena_all.loc[i, "conversation_b"][1]) + for i in cal_battles.index + ] + + judge_chat_model_cal = make_model( + model=args.judge_model, + max_tokens=args.max_out_tokens_judge, + **judge_extra_kwargs, + ) + cal_annotations, _, cal_prefs = judge_and_parse_prefs( + judge_chat_model=judge_chat_model_cal, + instructions=cal_instructions, + completions_A=cal_completions_a, + completions_B=cal_completions_b, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + ) + + # Build (delta_s, y) pairs from calibration battles. + # delta_s = score_A - score_B (raw, using default T=1 to extract scores) + raw_parser = PairScore(temperature=1.0) + delta_s_cal = [] + y_cal = [] + for ann, human_winner in zip( + cal_annotations, cal_battles["winner"].tolist(), strict=True + ): + sa = raw_parser.get_regexp_match( + ann.judge_completion.lower(), r'score.*?a[":\s*\n]*(-?\d+)' + ) + sb = raw_parser.get_regexp_match( + ann.judge_completion.lower(), r'score.*?b[":\s*\n]*(-?\d+)' + ) + if sa is None or sb is None: + continue + human_pref = _winner_to_pref(human_winner) + if human_pref is None or human_pref == 0.5: + continue # skip ties and missing + delta_s_cal.append(sa - sb) + y_cal.append(1.0 - human_pref) # pref=0 → A wins → y=1 + + if len(delta_s_cal) < 10: + logger.warning( + "Only %d valid calibration pairs (need ≥10); keeping default temperature.", + len(delta_s_cal), + ) + else: + calibrated_temperature = calibrate_temperature( + np.array(delta_s_cal), np.array(y_cal) + ) + logger.info( + "Calibration pairs: %d T* = %.4f (default was %s)", + len(delta_s_cal), + calibrated_temperature, + args.soft_elo_temperature, + ) + + # Build the score parser used for the main evaluation run. 
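+    # PairScore turns the two judge scores into a softmax preference at temperature T
+    # (effectively σ(T · (s_B − s_A))): pref < 0.5 favours the position-A completion,
+    # and a larger T pushes prefs towards hard 0/1 labels.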
+ score_parser = PairScore( + temperature=calibrated_temperature + if calibrated_temperature is not None + else args.soft_elo_temperature + ) + + # If we calibrated the temperature, the prefs stored in df_judge were + # computed with the default T=0.3. Re-parse them with the new parser so + # the soft-ELO bootstrap uses calibrated preferences. + if calibrated_temperature is not None: + new_prefs_ab = pd.Series( + [score_parser.parse_model_raw(c) for c in df_judge["judge_completion"]] + ) + prefs = new_prefs_ab.tolist() + + def _none_to_nan(x): + return float("nan") if x is None else x + + if args.swap_mode == "both": + # df_judge contains AB and BA annotations interleaved; the original + # run_judge() already combined them — we just need to re-parse the + # stored completions in the same order. + n_half = len(df_judge) // 2 + prefs_ab = new_prefs_ab[:n_half].apply(_none_to_nan) + prefs_ba = new_prefs_ab[n_half:].apply(_none_to_nan).reset_index(drop=True) + prefs = pd.concat([prefs_ab, 1 - prefs_ba]).reset_index(drop=True).tolist() + + # Rebuild battle_results with calibrated prefs + df_llm_judge = _prefs_to_battle_results( + prefs, our_model_is_position_a, opponent_models, model_name + ) + df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True) + n_bootstraps = args.n_bootstraps + use_soft = args.soft_elo n_llm = len(df_llm_judge) n_human = len(df_arena) - print(f"\n=== ELO Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===") + method_label = "Soft-ELO" if use_soft else "ELO" print( - f"Estimating ELO Ratings with {n_llm} LLM-judges for model {model_name} " + f"\n=== {method_label} Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===" + ) + print( + f"Estimating {method_label} Ratings with {n_llm} LLM-judges for model {model_name} " f"and {n_human} human annotations for other models. Number of battles is indicated in parenthesis and " f"confidence intervals are reported by computing ELO on {n_bootstraps} samples of instructions." 
) @@ -391,13 +551,14 @@ def run_judge() -> pd.DataFrame: battle_counts[row["model_a"]] = battle_counts.get(row["model_a"], 0) + 1 battle_counts[row["model_b"]] = battle_counts.get(row["model_b"], 0) + 1 + pref_col = "pref" if use_soft else "pref_hard" bootstrap_ratings: list[dict[str, float]] = [] for _ in range(n_bootstraps): df_sample = df_results.sample( n=len(df_results), replace=True, random_state=int(rng.integers(0, 2**31)) ) - ratings = compute_bradley_terry( - df_sample, winner_col="winner", baseline_model=args.baseline_model + ratings = fit_bradley_terry( + df_sample, pref_col=pref_col, baseline_model=args.baseline_model ) bootstrap_ratings.append(ratings) @@ -414,11 +575,26 @@ def run_judge() -> pd.DataFrame: suffix = " <-----" if m == model_name else "" count = battle_counts.get(m, 0) print(f" {m} ({count}){suffix}: {np.mean(vals):.1f} ± {np.std(vals):.1f}") + + # MAE vs human-only ELO for overlapping arena models + overlap = [m for m in all_model_names if m in human_elo and m != model_name] + if overlap: + abs_errors = [abs(mean_ratings[m] - human_elo[m]) for m in overlap] + mae = np.mean(abs_errors) + print(f"\n MAE vs Human-ELO ({len(overlap)} arena models): {mae:.1f}") + else: + mae = np.nan + print("\n No overlapping arena models to compute MAE.") else: print(" Not enough data to compute ELO ratings.") + mae = np.nan return { **summary, "bootstrap_ratings": bootstrap_ratings, + "human_elo": human_elo, + "mae_vs_human": mae, "model_name": model_name, + "method": method_label, + "calibrated_temperature": calibrated_temperature, } diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py index 7eb8599..863e74a 100644 --- a/judgearena/evaluate.py +++ b/judgearena/evaluate.py @@ -8,6 +8,7 @@ import pandas as pd from langchain_core.language_models.llms import LLM from langchain_core.prompts import ChatPromptTemplate +from scipy.optimize import minimize_scalar from judgearena.instruction_dataset import load_instructions from judgearena.instruction_dataset.arena_hard import ( @@ -29,9 +30,9 @@ class PairScore: - def __init__(self): + def __init__(self, temperature: float = 0.3): super(PairScore).__init__() - self.temperature = 0.3 + self.temperature = temperature def preference_from_scores(self, score_a: float, score_b: float) -> float: return 1 - np.exp(self.temperature * score_a) / ( @@ -59,6 +60,58 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1): return float(m.group(group_index).strip(" ")) +def calibrate_temperature( + delta_s: np.ndarray, + y: np.ndarray, + bounds: tuple[float, float] = (-10.0, 10.0), +) -> float: + """Find the MLE temperature T* for the model P(A>B) = σ(T·Δs). + + The log-likelihood is: + + L(T) = Σ_i [ y_i·log σ(T·Δs_i) + (1−y_i)·log σ(−T·Δs_i) ] + = Σ_i log σ(T · (2y_i − 1) · Δs_i) + + This is concave in T (single global maximum) so ``minimize_scalar`` with + the 'bounded' method is guaranteed to converge. + + Args: + delta_s: Score differences ``s_A − s_B`` for each battle, shape (N,). + y: Observed hard labels (1 = A was preferred, 0 = B was preferred, + 0.5 = tie). Ties contribute zero gradient and are skipped. + bounds: Search interval for T (default −10 to +10). + + Returns: + The calibrated temperature T*. + """ + delta_s = np.asarray(delta_s, dtype=float) + y = np.asarray(y, dtype=float) + + # Skip ties (y == 0.5) — they carry no directional information. 
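+    # Example with made-up numbers: delta_s = [2, -1, 3] and y = [1, 0, 1] agree in
+    # sign on every battle, so z > 0 throughout, the likelihood increases with T and
+    # the bounded search returns (close to) the upper bound; mixed-sign z values
+    # yield a finite interior optimum instead.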
+ non_tie = y != 0.5 + delta_s = delta_s[non_tie] + y = y[non_tie] + + if len(delta_s) == 0: + raise ValueError( + "No non-tie observations available for temperature calibration." + ) + + # z_i = (2y_i − 1) · Δs_i (positive when the score difference agrees with the outcome) + z = (2 * y - 1) * delta_s + + def neg_log_likelihood(T: float) -> float: + # log σ(T·z) = −log(1 + exp(−T·z)) = −logaddexp(0, −T·z) + return float(np.sum(np.logaddexp(0.0, -T * z))) + + result = minimize_scalar( + neg_log_likelihood, + bounds=bounds, + method="bounded", + ) + return float(result.x) + + _COMPLETION_LABEL_SINGLE = "Answer" _COMPLETION_LABEL_MULTI_TURN = "Conversation with User" _EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement" @@ -366,6 +419,7 @@ def judge_and_parse_prefs( user_prompt_template: str | None = None, truncate_input_chars: int = 8192, use_tqdm: bool = False, + score_parser: "PairScore | None" = None, ) -> tuple[list[JudgeAnnotation], list[JudgeAnnotation] | None, pd.Series]: """Run judge annotation and parse preferences, handling swap_mode='both'. @@ -413,7 +467,8 @@ def judge_and_parse_prefs( def _none_to_nan(x): return float("nan") if x is None else x - score_parser = PairScore() + if score_parser is None: + score_parser = PairScore() prefs = pd.Series( [score_parser.parse_model_raw(a.judge_completion) for a in annotations] ) diff --git a/tests/test_estimate_elo_ratings.py b/tests/test_estimate_elo_ratings.py index 83f9c8a..e055fdc 100644 --- a/tests/test_estimate_elo_ratings.py +++ b/tests/test_estimate_elo_ratings.py @@ -5,7 +5,12 @@ import pytest import judgearena.estimate_elo_ratings as estimate_elo_ratings -from judgearena.estimate_elo_ratings import CliEloArgs, compute_bradley_terry, main +from judgearena.estimate_elo_ratings import ( + CliEloArgs, + _winner_to_pref, + fit_bradley_terry, + main, +) from judgearena.evaluate import JudgeAnnotation, judge_and_parse_prefs from judgearena.utils import make_model @@ -89,7 +94,13 @@ def _default_args(**kwargs) -> CliEloArgs: return CliEloArgs(**defaults) -# --- compute_bradley_terry unit tests --- +# --- fit_bradley_terry unit tests --- + + +def _records_with_pref(records: list[dict]) -> pd.DataFrame: + df = pd.DataFrame(records) + df["pref"] = df["winner"].map(_winner_to_pref) + return df def test_bradley_terry_clear_winner(): @@ -97,23 +108,22 @@ def test_bradley_terry_clear_winner(): records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 10 + [ {"model_a": "B", "model_b": "A", "winner": "model_b"} ] * 10 - ratings = compute_bradley_terry(pd.DataFrame(records), winner_col="winner") + ratings = fit_bradley_terry(_records_with_pref(records)) assert ratings["A"] > ratings["B"] def test_bradley_terry_all_ties(): """All ties → ratings should be equal.""" records = [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 20 - ratings = compute_bradley_terry(pd.DataFrame(records), winner_col="winner") + ratings = fit_bradley_terry(_records_with_pref(records)) assert abs(ratings["A"] - ratings["B"]) < 1.0 def test_bradley_terry_baseline(): """Baseline model is anchored at baseline_rating.""" records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 10 - ratings = compute_bradley_terry( - pd.DataFrame(records), - winner_col="winner", + ratings = fit_bradley_terry( + _records_with_pref(records), baseline_model="B", baseline_rating=1000, ) @@ -121,6 +131,22 @@ def test_bradley_terry_baseline(): assert ratings["A"] > 1000.0 +def test_bradley_terry_soft_matches_hard(): + """Soft prefs ∈ {0, 0.5, 1} must 
give the same fit as hard winner labels."""
+    records = (
+        [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 7
+        + [{"model_a": "A", "model_b": "B", "winner": "model_b"}] * 3
+        + [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 2
+    )
+    df = _records_with_pref(records)
+    hard = fit_bradley_terry(df, pref_col="pref")
+    # Refitting on a float copy of the same prefs (quantised values, continuous
+    # dtype) must reproduce the hard-label fit.
+    df["pref_soft"] = df["pref"].astype(float)
+    soft = fit_bradley_terry(df, pref_col="pref_soft")
+    assert hard["A"] == pytest.approx(soft["A"], abs=1e-3)
+    assert hard["B"] == pytest.approx(soft["B"], abs=1e-3)
+
+
 # --- main() integration tests ---
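+
+
+# Sketch of an extra unit check (assumes synthetic data is acceptable here): when
+# labels are sampled from the calibration model itself, P(A wins) = σ(T_true · Δs),
+# calibrate_temperature should recover a value close to T_true. numpy is imported
+# locally in case the module header does not already import it.
+def test_calibrate_temperature_recovers_known_value():
+    import numpy as np
+
+    from judgearena.evaluate import calibrate_temperature
+
+    rng = np.random.default_rng(0)
+    t_true = 0.5
+    delta_s = rng.normal(0.0, 3.0, size=2000)
+    p_a_wins = 1.0 / (1.0 + np.exp(-t_true * delta_s))
+    y = (rng.random(len(delta_s)) < p_a_wins).astype(float)  # 1 = A preferred, 0 = B
+    t_star = calibrate_temperature(delta_s, y)
+    assert t_star == pytest.approx(t_true, abs=0.1)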