From af4bced4b02c7bffb0f1a267509afe12a0b4689e Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Tue, 14 Apr 2026 15:33:03 +0200
Subject: [PATCH 1/8] Add soft elo

---
 judgearena/estimate_elo_ratings.py | 142 +++++++++++++++++++++++++++--
 judgearena/evaluate.py             |   4 +-
 judgearena/utils.py                |  10 +-
 3 files changed, 140 insertions(+), 16 deletions(-)

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index d7dfbd7..03e2224 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -31,6 +31,7 @@ class CliEloArgs(BaseCliArgs):
     n_bootstraps: int = 20
     seed: int = 0
     baseline_model: str | None = None
+    soft_elo: bool = False
 
     @classmethod
     def parse_args(cls):
@@ -83,6 +84,12 @@ def parse_args(cls):
             help="Model name to anchor at 1000 ELO. All other ratings are expressed relative to this model. "
             "Must be one of the models present in the arena battles. If not set, ratings are not anchored.",
         )
+        parser.add_argument(
+            "--soft-elo",
+            action="store_true",
+            help="Use continuous judge preferences as soft labels for BT fitting "
+            "instead of discretising to hard win/loss/tie.",
+        )
         add_common_arguments(parser)
         args = parser.parse_args()
 
@@ -94,6 +101,7 @@ def parse_args(cls):
             n_bootstraps=args.n_bootstraps,
             seed=args.seed,
             baseline_model=args.baseline_model,
+            soft_elo=args.soft_elo,
             judge_model=args.judge_model,
             n_instructions=args.n_instructions,
             provide_explanation=args.provide_explanation,
@@ -221,6 +229,87 @@ def compute_bradley_terry(
     return dict(pd.Series(elo_scores, index=models.index))
 
 
+def compute_soft_bradley_terry(
+    df: pd.DataFrame,
+    pref_col: str = "pref",
+    scale: float = 400,
+    base: float = 10,
+    init_rating: float = 1000,
+    baseline_model: str | None = None,
+    baseline_rating: float = 1000,
+) -> dict[str, float]:
+    """Compute Bradley-Terry ratings from continuous (soft) preferences.
+
+    Each row in *df* is a single battle with columns ``model_a``, ``model_b``,
+    and *pref_col* ∈ [0, 1] where 0 → A wins, 1 → B wins, 0.5 → tie.
+
+    The soft cross-entropy for a single battle is decomposed into two
+    weighted hard-label rows so that sklearn ``LogisticRegression`` can be
+    reused:
+
+        row 1: Y=1, weight = 1 - pref (evidence for A winning)
+        row 2: Y=0, weight = pref (evidence for B winning)
+    """
+    df = df.dropna(subset=[pref_col]).copy()
+    if df.empty:
+        return {}
+
+    all_models = sorted(set(df["model_a"].unique()) | set(df["model_b"].unique()))
+    models = pd.Series(np.arange(len(all_models)), index=all_models)
+    p = len(models)
+
+    n_battles = len(df)
+    X = np.zeros([2 * n_battles, p])
+    Y = np.zeros(2 * n_battles)
+    sample_weights = np.zeros(2 * n_battles)
+
+    for idx, (_, row) in enumerate(df.iterrows()):
+        m_a = row["model_a"]
+        m_b = row["model_b"]
+        pref = row[pref_col]
+
+        # Row for "A wins" evidence
+        X[2 * idx, models[m_a]] = +np.log(base)
+        X[2 * idx, models[m_b]] = -np.log(base)
+        Y[2 * idx] = 1.0
+        sample_weights[2 * idx] = 1.0 - pref
+
+        # Row for "B wins" evidence
+        X[2 * idx + 1, models[m_a]] = +np.log(base)
+        X[2 * idx + 1, models[m_b]] = -np.log(base)
+        Y[2 * idx + 1] = 0.0
+        sample_weights[2 * idx + 1] = pref
+
+    # Drop rows with zero weight (pure wins have one side = 0)
+    nonzero = sample_weights > 0
+    X = X[nonzero]
+    Y = Y[nonzero]
+    sample_weights = sample_weights[nonzero]
+
+    if len(X) == 0:
+        return {}
+
+    lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000)
+    lr.fit(X, Y, sample_weight=sample_weights)
+    elo_scores = scale * lr.coef_[0] + init_rating
+
+    if baseline_model is not None and baseline_model in models.index:
+        elo_scores += baseline_rating - elo_scores[models[baseline_model]]
+
+    return dict(pd.Series(elo_scores, index=models.index))
+
+
+def _winner_to_pref(winner: str) -> float | None:
+    """Convert a hard winner label to a continuous preference value."""
+    if winner == "model_a":
+        return 0.0
+    elif winner == "model_b":
+        return 1.0
+    elif winner in ("tie", "tie (bothbad)"):
+        return 0.5
+    return None
+
+
 def main(args: CliEloArgs | None = None) -> dict:
     if args is None:
         args = CliEloArgs.parse_args()
@@ -392,7 +481,8 @@ def run_judge() -> pd.DataFrame:
 
     print(f"First judge output:\n{df_judge['judge_completion'].iloc[0][:500]}\n")
 
-    # Map preferences back to model-name-level battle results
+    # Map preferences back to model-name-level battle results.
+    # Build both hard labels (winner) and continuous prefs for each battle.
     model_name = args.model
     battle_results = []
    for pref, is_pos_a, opp_model in zip(
@@ -405,13 +495,16 @@ def run_judge() -> pd.DataFrame:
         else:
             winner = "model_b"
 
+        # Continuous pref is relative to judge positions (A/B).
+        # Remap so that model_a column in the DataFrame always corresponds
+        # to pref=0 and model_b to pref=1.
         if is_pos_a:
             battle_results.append(
-                {"model_a": model_name, "model_b": opp_model, "winner": winner}
+                {"model_a": model_name, "model_b": opp_model, "winner": winner, "pref": pref}
             )
         else:
             battle_results.append(
-                {"model_a": opp_model, "model_b": model_name, "winner": winner}
+                {"model_a": opp_model, "model_b": model_name, "winner": winner, "pref": 1.0 - pref if pref is not None else None}
             )
 
     # LLM-judge battle results for our model
@@ -436,7 +529,7 @@ def run_judge() -> pd.DataFrame:
 
     # Combine LLM-judge battles with human-annotated arena battles,
     # keeping only arena models with at least 500 human battles
-    df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]]
+    df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]].copy()
     human_battle_counts = pd.concat(
         [df_arena["model_a"], df_arena["model_b"]]
     ).value_counts()
@@ -445,16 +538,26 @@ def run_judge() -> pd.DataFrame:
         df_arena["model_a"].isin(well_represented)
         & df_arena["model_b"].isin(well_represented)
     ]
+    # Add pref column to arena battles (hard labels → 0.0 / 1.0 / 0.5)
+    df_arena["pref"] = df_arena["winner"].map(_winner_to_pref)
+
     df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)
 
+    # Compute human-only BT ratings as ground-truth reference
+    human_elo = compute_bradley_terry(
+        df_arena, winner_col="winner", baseline_model=args.baseline_model
+    )
+
     # Bootstrap Bradley-Terry ELO ratings
     n_bootstraps = args.n_bootstraps
+    use_soft = args.soft_elo
     n_llm = len(df_llm_judge)
     n_human = len(df_arena)
-    print(f"\n=== ELO Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===")
+    method_label = "Soft-ELO" if use_soft else "ELO"
+    print(f"\n=== {method_label} Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===")
     print(
-        f"Estimating ELO Ratings with {n_llm} LLM-judges for model {model_name} "
+        f"Estimating {method_label} Ratings with {n_llm} LLM-judges for model {model_name} "
         f"and {n_human} human annotations for other models. Number of battles is indicated in parenthesis and "
         f"confidence intervals are reported by computing ELO on {n_bootstraps} samples of instructions."
     )
 
@@ -470,9 +573,14 @@ def run_judge() -> pd.DataFrame:
         df_sample = df_results.sample(
             n=len(df_results), replace=True, random_state=int(rng.integers(0, 2**31))
         )
-        ratings = compute_bradley_terry(
-            df_sample, winner_col="winner", baseline_model=args.baseline_model
-        )
+        if use_soft:
+            ratings = compute_soft_bradley_terry(
+                df_sample, pref_col="pref", baseline_model=args.baseline_model
+            )
+        else:
+            ratings = compute_bradley_terry(
+                df_sample, winner_col="winner", baseline_model=args.baseline_model
+            )
         bootstrap_ratings.append(ratings)
 
     if bootstrap_ratings:
@@ -488,13 +596,29 @@ def run_judge() -> pd.DataFrame:
             suffix = " <-----" if m == model_name else ""
             count = battle_counts.get(m, 0)
             print(f" {m} ({count}){suffix}: {np.mean(vals):.1f} ± {np.std(vals):.1f}")
+
+        # MAE vs human-only ELO for overlapping arena models
+        overlap = [m for m in all_model_names if m in human_elo and m != model_name]
+        if overlap:
+            abs_errors = [abs(mean_ratings[m] - human_elo[m]) for m in overlap]
+            mae = np.mean(abs_errors)
+            print(
+                f"\n MAE vs Human-ELO ({len(overlap)} arena models): {mae:.1f}"
+            )
+        else:
+            mae = np.nan
+            print("\n No overlapping arena models to compute MAE.")
     else:
         print(" Not enough data to compute ELO ratings.")
+        mae = np.nan
 
     return {
         **summary,
         "bootstrap_ratings": bootstrap_ratings,
+        "human_elo": human_elo,
+        "mae_vs_human": mae,
         "model_name": model_name,
+        "method": method_label,
     }

diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py
index de1c1c1..cd8b700 100644
--- a/judgearena/evaluate.py
+++ b/judgearena/evaluate.py
@@ -26,9 +26,9 @@ class PairScore:
-    def __init__(self):
+    def __init__(self, temperature: float = 0.3):
         super(PairScore).__init__()
-        self.temperature = 0.3
+        self.temperature = temperature
 
     def preference_from_scores(self, score_a: float, score_b: float) -> float:
         return 1 - np.exp(self.temperature * score_a) / (

diff --git a/judgearena/utils.py b/judgearena/utils.py
index b6b0b8d..57ca21f 100644
--- a/judgearena/utils.py
+++ b/judgearena/utils.py
@@ -13,11 +13,6 @@
 from langchain_openai import ChatOpenAI
 from tqdm.asyncio import tqdm
 
-from judgearena.instruction_dataset.arena_hard import (
-    download_arena_hard,
-    is_arena_hard_dataset,
-)
-
 
 def _data_root_path() -> Path:
     raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
@@ -449,6 +444,11 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):
 
 
 def download_all():
+    from judgearena.instruction_dataset.arena_hard import (
+        download_arena_hard,
+        is_arena_hard_dataset,
+    )
+
     print(f"Downloading all dataset in {data_root}")
     local_path_tables = data_root / "tables"
     for dataset in [

From 898b1e4e07484b9bdfd64dfe38a05aeaeea16be3 Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Fri, 24 Apr 2026 10:58:27 +0200
Subject: [PATCH 2/8] Add temperature calibration

---
 judgearena/estimate_elo_ratings.py | 159 ++++++++++++++++++++++++++++-
 judgearena/evaluate.py             |  55 +++++++++-
 2 files changed, 211 insertions(+), 3 deletions(-)

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 03e2224..4f9d10e 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -9,7 +9,7 @@
 
 from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
 from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs
-from judgearena.evaluate import judge_and_parse_prefs
+from judgearena.evaluate import judge_and_parse_prefs, calibrate_temperature, PairScore
 from judgearena.generate import generate_instructions
 from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model
 
@@ -32,6 +32,8 @@ class CliEloArgs(BaseCliArgs):
     seed: int = 0
     baseline_model: str | None = None
     soft_elo: bool = False
+    calibrate_temperature: bool = False
+    calibration_size: int | None = None
 
     @classmethod
     def parse_args(cls):
@@ -90,6 +92,19 @@ def parse_args(cls):
             help="Use continuous judge preferences as soft labels for BT fitting "
             "instead of discretising to hard win/loss/tie.",
         )
+        parser.add_argument(
+            "--calibrate-temperature",
+            action="store_true",
+            help="Calibrate the PairScore temperature T against available human-annotated "
+            "arena battles before running soft-ELO. Requires --soft-elo.",
+        )
+        parser.add_argument(
+            "--calibration-size",
+            type=int,
+            default=None,
+            help="Number of human arena battles to sample for temperature calibration. "
+            "Defaults to all available battles. Requires --calibrate-temperature.",
+        )
         add_common_arguments(parser)
         args = parser.parse_args()
 
@@ -102,6 +117,8 @@ def parse_args(cls):
             seed=args.seed,
             baseline_model=args.baseline_model,
             soft_elo=args.soft_elo,
+            calibrate_temperature=args.calibrate_temperature,
+            calibration_size=args.calibration_size,
             judge_model=args.judge_model,
             n_instructions=args.n_instructions,
             provide_explanation=args.provide_explanation,
@@ -548,7 +565,144 @@ def run_judge() -> pd.DataFrame:
         df_arena, winner_col="winner", baseline_model=args.baseline_model
     )
 
-    # Bootstrap Bradley-Terry ELO ratings
+    # --- Temperature calibration (optional) ---
+    # Run the judge on a random subset of human arena battles that already
+    # have ground-truth winner labels so we can fit T* via MLE.
+    calibrated_temperature: float | None = None
+    if args.calibrate_temperature:
+        if not args.soft_elo:
+            print(
+                "Warning: --calibrate-temperature has no effect without --soft-elo; skipping."
+            )
+        else:
+            print("\n=== Calibrating PairScore temperature against human annotations ===")
+            # Sample calibration battles from the already-loaded arena battles.
+            # Use the same judge to score them so scores and labels are comparable.
+            _cal_n = (
+                min(args.calibration_size, len(df_arena))
+                if args.calibration_size is not None
+                else len(df_arena)
+            )
+            cal_battles = df_arena.sample(
+                n=_cal_n, random_state=int(rng.integers(0, 2**31))
+            ).reset_index(drop=True)
+
+            cal_instructions = [
+                _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][0])
+                for i in cal_battles.index
+            ]
+            cal_completions_a = [
+                _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][1])
+                for i in cal_battles.index
+            ]
+            cal_completions_b = [
+                _extract_instruction_text(df_arena_all.loc[i, "conversation_b"][1])
+                for i in cal_battles.index
+            ]
+
+            judge_chat_model_cal = make_model(
+                model=args.judge_model,
+                max_tokens=args.max_out_tokens_judge,
+                **judge_extra_kwargs,
+            )
+            cal_annotations, _, cal_prefs = judge_and_parse_prefs(
+                judge_chat_model=judge_chat_model_cal,
+                instructions=cal_instructions,
+                completions_A=cal_completions_a,
+                completions_B=cal_completions_b,
+                swap_mode=args.swap_mode,
+                truncate_input_chars=args.truncate_all_input_chars,
+            )
+
+            # Build (delta_s, y) pairs from calibration battles.
+            # delta_s = score_A - score_B (raw, using default T=1 to extract scores)
+            raw_parser = PairScore(temperature=1.0)
+            delta_s_cal = []
+            y_cal = []
+            for ann, human_winner in zip(
+                cal_annotations, cal_battles["winner"].tolist(), strict=True
+            ):
+                sa = raw_parser.get_regexp_match(
+                    ann.judge_completion.lower(), r'score.*?a[":\s*\n]*(-?\d+)'
+                )
+                sb = raw_parser.get_regexp_match(
+                    ann.judge_completion.lower(), r'score.*?b[":\s*\n]*(-?\d+)'
+                )
+                if sa is None or sb is None:
+                    continue
+                human_pref = _winner_to_pref(human_winner)
+                if human_pref is None or human_pref == 0.5:
+                    continue  # skip ties and missing
+                delta_s_cal.append(sa - sb)
+                y_cal.append(1.0 - human_pref)  # pref=0 → A wins → y=1
+
+            if len(delta_s_cal) < 10:
+                print(
+                    f" Only {len(delta_s_cal)} valid calibration pairs (need ≥10); "
+                    "keeping default temperature."
+                )
+            else:
+                calibrated_temperature = calibrate_temperature(
+                    np.array(delta_s_cal), np.array(y_cal)
+                )
+                print(
+                    f" Calibration pairs: {len(delta_s_cal)}"
+                    f" T* = {calibrated_temperature:.4f} (default was 0.3)"
+                )
+
+    # Build the score parser used for the main evaluation run.
+    score_parser = PairScore(
+        temperature=calibrated_temperature if calibrated_temperature is not None else 0.3
+    )
+
+    # If we calibrated the temperature, the prefs stored in df_judge were
+    # computed with the default T=0.3. Re-parse them with the new parser so
+    # the soft-ELO bootstrap uses calibrated preferences.
+    if calibrated_temperature is not None:
+        new_prefs_ab = pd.Series(
+            [score_parser.parse_model_raw(c) for c in df_judge["judge_completion"]]
+        )
+        prefs = new_prefs_ab.tolist()
+
+        def _none_to_nan(x):
+            return float("nan") if x is None else x
+
+        if args.swap_mode == "both":
+            # df_judge contains AB and BA annotations interleaved; the original
+            # run_judge() already combined them — we just need to re-parse the
+            # stored completions in the same order.
+            n_half = len(df_judge) // 2
+            prefs_ab = new_prefs_ab[:n_half].apply(_none_to_nan)
+            prefs_ba = new_prefs_ab[n_half:].apply(_none_to_nan).reset_index(drop=True)
+            prefs = pd.concat([prefs_ab, 1 - prefs_ba]).reset_index(drop=True).tolist()
+
+        # Rebuild battle_results with calibrated prefs
+        battle_results = []
+        for pref, is_pos_a, opp_model in zip(
+            prefs, our_model_is_position_a, opponent_models, strict=True
+        ):
+            if pref is None or (isinstance(pref, float) and np.isnan(pref)) or pref == 0.5:
+                winner = "tie"
+            elif pref < 0.5:
+                winner = "model_a"
+            else:
+                winner = "model_b"
+            if is_pos_a:
+                battle_results.append(
+                    {"model_a": model_name, "model_b": opp_model, "winner": winner, "pref": pref}
+                )
+            else:
+                battle_results.append(
+                    {
+                        "model_a": opp_model,
+                        "model_b": model_name,
+                        "winner": winner,
+                        "pref": 1.0 - pref if (pref is not None and not (isinstance(pref, float) and np.isnan(pref))) else None,
+                    }
+                )
+        df_llm_judge = pd.DataFrame(battle_results)
+        df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)
+
     n_bootstraps = args.n_bootstraps
     use_soft = args.soft_elo
@@ -619,6 +773,7 @@ def _none_to_nan(x):
         "mae_vs_human": mae,
         "model_name": model_name,
         "method": method_label,
+        "calibrated_temperature": calibrated_temperature,
     }

diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py
index cd8b700..1bd9440 100644
--- a/judgearena/evaluate.py
+++ b/judgearena/evaluate.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+from scipy.optimize import minimize_scalar
 from langchain_core.language_models.llms import LLM
 from langchain_core.prompts import ChatPromptTemplate
 
@@ -56,6 +57,56 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
         return float(m.group(group_index).strip(" "))
 
 
+def calibrate_temperature(
+    delta_s: np.ndarray,
+    y: np.ndarray,
+    bounds: tuple[float, float] = (-10.0, 10.0),
+) -> float:
+    """Find the MLE temperature T* for the model P(A>B) = σ(T·Δs).
+
+    The log-likelihood is:
+
+        L(T) = Σ_i [ y_i·log σ(T·Δs_i) + (1−y_i)·log σ(−T·Δs_i) ]
+             = Σ_i log σ(T · (2y_i − 1) · Δs_i)
+
+    This is concave in T (single global maximum) so ``minimize_scalar`` with
+    the 'bounded' method is guaranteed to converge.
+
+    Args:
+        delta_s: Score differences ``s_A − s_B`` for each battle, shape (N,).
+        y: Observed hard labels (1 = A was preferred, 0 = B was preferred,
+            0.5 = tie). Ties contribute zero gradient and are skipped.
+        bounds: Search interval for T (default −10 to +10).
+
+    Returns:
+        The calibrated temperature T*.
+    """
+    delta_s = np.asarray(delta_s, dtype=float)
+    y = np.asarray(y, dtype=float)
+
+    # Skip ties (y == 0.5) — they carry no directional information.
+    non_tie = y != 0.5
+    delta_s = delta_s[non_tie]
+    y = y[non_tie]
+
+    if len(delta_s) == 0:
+        raise ValueError("No non-tie observations available for temperature calibration.")
+
+    # z_i = (2y_i − 1) · Δs_i (positive when the score difference agrees with the outcome)
+    z = (2 * y - 1) * delta_s
+
+    def neg_log_likelihood(T: float) -> float:
+        # log σ(T·z) = −log(1 + exp(−T·z)) = −logaddexp(0, −T·z)
+        return float(np.sum(np.logaddexp(0.0, -T * z)))
+
+    result = minimize_scalar(
+        neg_log_likelihood,
+        bounds=bounds,
+        method="bounded",
+    )
+    return float(result.x)
+
+
 _COMPLETION_LABEL_SINGLE = "Answer"
 _COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
 _EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
@@ -363,6 +414,7 @@ judge_and_parse_prefs(
     user_prompt_template: str | None = None,
     truncate_input_chars: int = 8192,
     use_tqdm: bool = False,
+    score_parser: "PairScore | None" = None,
 ) -> tuple[list[JudgeAnnotation], list[JudgeAnnotation] | None, pd.Series]:
     """Run judge annotation and parse preferences, handling swap_mode='both'.
 
@@ -407,7 +459,8 @@ def judge_and_parse_prefs(
     def _none_to_nan(x):
         return float("nan") if x is None else x
 
-    score_parser = PairScore()
+    if score_parser is None:
+        score_parser = PairScore()
     prefs = pd.Series(
         [score_parser.parse_model_raw(a.judge_completion) for a in annotations]
     )

From e4498b6a979dbbf6b64cb99dd8bfd32d92288492 Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Fri, 24 Apr 2026 11:01:00 +0200
Subject: [PATCH 3/8] Update README for soft-elo support

---
 README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/README.md b/README.md
index 508ac9f..3d7d59b 100644
--- a/README.md
+++ b/README.md
@@ -245,6 +245,29 @@ uv run python judgearena/estimate_elo_ratings.py \
 | `--n_bootstraps` | `20` | Bootstrap samples for ELO confidence intervals |
 | `--swap_mode` | `fixed` | `fixed`: single judge pass; `both`: correct for position bias |
 | `--result_folder` | `results` | Directory where annotations and results are saved |
+| `--soft-elo` | off | Use continuous judge preferences (soft Bradley-Terry) instead of hard win/loss/tie labels |
+| `--calibrate-temperature` | off | MLE-calibrate the score-to-preference temperature against human arena annotations (requires `--soft-elo`) |
+| `--calibration-size` | all | Number of human battles to sample for calibration (requires `--calibrate-temperature`) |
+
+### Soft-ELO & temperature calibration
+
+By default, judge scores are discretised to hard win/loss/tie labels. Passing `--soft-elo` instead converts the raw score
+difference into a continuous preference via a softmax, which is then fed into a soft Bradley-Terry model.
+
+To let the data choose the best temperature automatically, add `--calibrate-temperature`.
+JudgeArena will run the judge on a sample of human-annotated arena battles, fit the temperature $T^*$ by MLE, and
+use it for the full evaluation:
+
+```bash
+judgearena-elo \
+  --arena LMArena-100k \
+  --model Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \
+  --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
+  --n_instructions 200 \
+  --soft-elo \
+  --calibrate-temperature \
+  --calibration-size 300
+```
 
 ### Output

From 6b401e8e8e4fd34308eb9ec266bdb1409affd40c Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Wed, 29 Apr 2026 14:57:50 +0200
Subject: [PATCH 4/8] Update temperature

---
 README.md                          |  1 +
 judgearena/estimate_elo_ratings.py | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3d7d59b..f8d409d 100644
--- a/README.md
+++ b/README.md
@@ -246,6 +246,7 @@ uv run python judgearena/estimate_elo_ratings.py \
 | `--swap_mode` | `fixed` | `fixed`: single judge pass; `both`: correct for position bias |
 | `--result_folder` | `results` | Directory where annotations and results are saved |
 | `--soft-elo` | off | Use continuous judge preferences (soft Bradley-Terry) instead of hard win/loss/tie labels |
+| `--soft-elo-temperature` | `0.3` | Initial softmax temperature for `--soft-elo`; overridden if `--calibrate-temperature` succeeds |
 | `--calibrate-temperature` | off | MLE-calibrate the score-to-preference temperature against human arena annotations (requires `--soft-elo`) |
 | `--calibration-size` | all | Number of human battles to sample for calibration (requires `--calibrate-temperature`) |

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 4f9d10e..44fb709 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -32,6 +32,7 @@ class CliEloArgs(BaseCliArgs):
     seed: int = 0
     baseline_model: str | None = None
     soft_elo: bool = False
+    soft_elo_temperature: float = 0.3
     calibrate_temperature: bool = False
     calibration_size: int | None = None
 
@@ -92,6 +93,13 @@ def parse_args(cls):
             help="Use continuous judge preferences as soft labels for BT fitting "
             "instead of discretising to hard win/loss/tie.",
         )
+        parser.add_argument(
+            "--soft-elo-temperature",
+            type=float,
+            default=0.3,
+            help="Initial PairScore temperature used by --soft-elo (default: 0.3). "
+            "Overridden by --calibrate-temperature if calibration succeeds.",
+        )
         parser.add_argument(
             "--calibrate-temperature",
             action="store_true",
@@ -117,6 +125,7 @@ def parse_args(cls):
             seed=args.seed,
             baseline_model=args.baseline_model,
             soft_elo=args.soft_elo,
+            soft_elo_temperature=args.soft_elo_temperature,
             calibrate_temperature=args.calibrate_temperature,
             calibration_size=args.calibration_size,
             judge_model=args.judge_model,
@@ -647,12 +656,12 @@ def run_judge() -> pd.DataFrame:
                 )
                 print(
                     f" Calibration pairs: {len(delta_s_cal)}"
-                    f" T* = {calibrated_temperature:.4f} (default was 0.3)"
+                    f" T* = {calibrated_temperature:.4f} (default was {args.soft_elo_temperature})"
                 )
 
     # Build the score parser used for the main evaluation run.
     score_parser = PairScore(
-        temperature=calibrated_temperature if calibrated_temperature is not None else 0.3
+        temperature=calibrated_temperature if calibrated_temperature is not None else args.soft_elo_temperature
     )

From 995db21f8b0ee5901b341ccecf7cc01eb8fe7056 Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Tue, 12 May 2026 13:50:03 +0200
Subject: [PATCH 5/8] Update CLI to unify elo computation

---
 judgearena/cli.py                  |  30 +++
 judgearena/estimate_elo_ratings.py | 350 +++++++++++------------------
 tests/test_estimate_elo_ratings.py |  38 +++-
 3 files changed, 186 insertions(+), 232 deletions(-)

diff --git a/judgearena/cli.py b/judgearena/cli.py
index eb94c83..42840e9 100644
--- a/judgearena/cli.py
+++ b/judgearena/cli.py
@@ -113,6 +113,32 @@ def _build_parser() -> argparse.ArgumentParser:
         default=None,
         help="[elo] Model anchored at 1000 ELO (ratings are reported relative to it).",
     )
+    parser.add_argument(
+        "--soft-elo",
+        action="store_true",
+        help="[elo] Use continuous judge preferences as soft BT targets instead of "
+        "discretising to hard win/loss/tie.",
+    )
+    parser.add_argument(
+        "--soft-elo-temperature",
+        type=float,
+        default=0.3,
+        help="[elo] Initial PairScore temperature for --soft-elo. "
+        "Overridden by --calibrate-temperature if calibration succeeds.",
+    )
+    parser.add_argument(
+        "--calibrate-temperature",
+        action="store_true",
+        help="[elo] MLE-fit the PairScore temperature against human-labeled arena "
+        "battles before the main run. Requires --soft-elo.",
+    )
+    parser.add_argument(
+        "--calibration-size",
+        type=int,
+        default=None,
+        help="[elo] Number of human arena battles to sample for temperature "
+        "calibration. Defaults to all. Requires --calibrate-temperature.",
+    )
     add_common_arguments(parser)
     return parser
 
@@ -191,6 +217,10 @@ def _build_elo_args(
         n_bootstraps=args.n_bootstraps,
         seed=args.seed,
         baseline_model=args.baseline_model,
+        soft_elo=args.soft_elo,
+        soft_elo_temperature=args.soft_elo_temperature,
+        calibrate_temperature=args.calibrate_temperature,
+        calibration_size=args.calibration_size,
         judge_model=args.judge_model,
         n_instructions=args.n_instructions,
         provide_explanation=args.provide_explanation,
diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index ad703e2..3d80c7d 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -1,4 +1,5 @@
 import hashlib
+import argparse
 from dataclasses import dataclass
 from functools import partial
@@ -145,119 +146,22 @@ def parse_args(cls):
         )
 
 
-def compute_bradley_terry(
-    df: pd.DataFrame,
-    winner_col: str,
-    scale: float = 400,
-    base: float = 10,
-    init_rating: float = 1000,
-    baseline_model: str | None = None,
-    baseline_rating: float = 1000,
-) -> dict[str, float]:
-    """
-    Compute Bradley-Terry ratings using MLE (logistic regression).
-
-    This method fits a Bradley-Terry model to pairwise comparison data using
-    maximum likelihood estimation via logistic regression.
-
-    Args:
-        df: DataFrame with columns 'model_a', 'model_b', and the winner column
-        winner_col: Name of the column containing the winner
-        scale: Scale factor for ELO conversion (default 400)
-        base: Base for logarithm in ELO formula (default 10)
-        init_rating: Initial rating offset (default 1000)
-        baseline_model: Model to anchor at baseline_rating
-        baseline_rating: Rating to assign to the baseline model
-
-    Returns:
-        Dictionary mapping model names to their Bradley-Terry ratings
-    """
-    # Get all unique models
-    all_models = sorted(set(df["model_a"].unique()) | set(df["model_b"].unique()))
-
-    # Create pivot tables for wins
-    ptbl_a_win = pd.pivot_table(
-        df[df[winner_col] == "model_a"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-
-    ptbl_b_win = pd.pivot_table(
-        df[df[winner_col] == "model_b"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-
-    # Handle ties
-    if sum(df[winner_col].isin(["tie", "tie (bothbad)"])) == 0:
-        ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models)
-    else:
-        ptbl_tie = pd.pivot_table(
-            df[df[winner_col].isin(["tie", "tie (bothbad)"])],
-            index="model_a",
-            columns="model_b",
-            aggfunc="size",
-            fill_value=0,
-        )
-        ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0)
-        ptbl_tie = ptbl_tie + ptbl_tie.T
-
-    # Reindex all pivot tables to have consistent dimensions
-    ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0)
-    ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0)
-
-    # Combined win matrix (ties count as 0.5 for each)
-    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
-
-    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
-
-    p = len(models)
-    X = np.zeros([p * (p - 1) * 2, p])
-    Y = np.zeros(p * (p - 1) * 2)
-
-    cur_row = 0
-    sample_weights = []
-    for m_a in ptbl_win.index:
-        for m_b in ptbl_win.columns:
-            if m_a == m_b:
-                continue
-            # Skip if nan or no battles between this pair
-            w_ab = ptbl_win.loc[m_a, m_b]
-            w_ba = ptbl_win.loc[m_b, m_a]
-            if np.isnan(w_ab) or np.isnan(w_ba):
-                continue
-            if w_ab == 0 and w_ba == 0:
-                continue
-            X[cur_row, models[m_a]] = +np.log(base)
-            X[cur_row, models[m_b]] = -np.log(base)
-            Y[cur_row] = 1.0
-            sample_weights.append(w_ab)
-
-            X[cur_row + 1, models[m_a]] = np.log(base)
-            X[cur_row + 1, models[m_b]] = -np.log(base)
-            Y[cur_row + 1] = 0.0
-            sample_weights.append(w_ba)
-            cur_row += 2
-
-    X = X[:cur_row]
-    Y = Y[:cur_row]
-
-    lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000)
-    lr.fit(X, Y, sample_weight=sample_weights)
-    elo_scores = scale * lr.coef_[0] + init_rating
-
-    # Normalize to baseline model if specified
-    if baseline_model is not None and baseline_model in models.index:
-        elo_scores += baseline_rating - elo_scores[models[baseline_model]]
-
-    return dict(pd.Series(elo_scores, index=models.index))
+def _winner_to_pref(winner: str) -> float | None:
+    """Convert a hard winner label to a continuous preference value."""
+    if winner == "model_a":
+        return 0.0
+    elif winner == "model_b":
+        return 1.0
+    elif winner in ("tie", "tie (bothbad)"):
+        return 0.5
+    return None
 
 
-def compute_soft_bradley_terry(
+def _is_nan_pref(p) -> bool:
+    return p is None or (isinstance(p, float) and np.isnan(p))
+
+
+def fit_bradley_terry(
     df: pd.DataFrame,
     pref_col: str = "pref",
     scale: float = 400,
     base: float = 10,
     init_rating: float = 1000,
     baseline_model: str | None = None,
     baseline_rating: float = 1000,
 ) -> dict[str, float]:
-    """Compute Bradley-Terry ratings from continuous (soft) preferences.
+    """Fit Bradley-Terry ratings via weighted logistic regression.
 
-    Each row in *df* is a single battle with columns ``model_a``, ``model_b``,
-    and *pref_col* ∈ [0, 1] where 0 → A wins, 1 → B wins, 0.5 → tie.
+    Each row in *df* is a battle with columns ``model_a``, ``model_b`` and
+    ``pref_col`` ∈ [0, 1] where 0 means A wins, 1 means B wins, 0.5 is a tie.
+    Hard win/loss/tie labels are the special case ``pref ∈ {0, 0.5, 1}``.
 
-    The soft cross-entropy for a single battle is decomposed into two
-    weighted hard-label rows so that sklearn ``LogisticRegression`` can be
-    reused:
+    The soft cross-entropy for a battle is decomposed into two weighted
+    hard-label rows so sklearn's ``LogisticRegression`` can be reused:
 
-        row 1: Y=1, weight = 1 - pref (evidence for A winning)
-        row 2: Y=0, weight = pref (evidence for B winning)
+        Y=1, weight = (1 − pref) · count (evidence A wins)
+        Y=0, weight = pref · count (evidence B wins)
+
+    Identical ``(model_a, model_b, pref)`` triples are aggregated first so
+    the design matrix stays small when prefs are quantised (e.g. human
+    arena labels) and untouched when prefs are continuous floats.
     """
-    df = df.dropna(subset=[pref_col]).copy()
+    df = df.dropna(subset=[pref_col])
     if df.empty:
         return {}
 
-    all_models = sorted(set(df["model_a"].unique()) | set(df["model_b"].unique()))
+    grouped = (
+        df.groupby(["model_a", "model_b", pref_col])
+        .size()
+        .reset_index(name="count")
+    )
+
+    all_models = sorted(set(grouped["model_a"]) | set(grouped["model_b"]))
     models = pd.Series(np.arange(len(all_models)), index=all_models)
     p = len(models)
 
-    n_battles = len(df)
-    X = np.zeros([2 * n_battles, p])
-    Y = np.zeros(2 * n_battles)
-    sample_weights = np.zeros(2 * n_battles)
-
-    for idx, (_, row) in enumerate(df.iterrows()):
-        m_a = row["model_a"]
-        m_b = row["model_b"]
-        pref = row[pref_col]
-
-        # Row for "A wins" evidence
-        X[2 * idx, models[m_a]] = +np.log(base)
-        X[2 * idx, models[m_b]] = -np.log(base)
-        Y[2 * idx] = 1.0
-        sample_weights[2 * idx] = 1.0 - pref
-
-        # Row for "B wins" evidence
-        X[2 * idx + 1, models[m_a]] = +np.log(base)
-        X[2 * idx + 1, models[m_b]] = -np.log(base)
-        Y[2 * idx + 1] = 0.0
-        sample_weights[2 * idx + 1] = pref
-
-    # Drop rows with zero weight (pure wins have one side = 0)
+    m_a_idx = grouped["model_a"].map(models).to_numpy()
+    m_b_idx = grouped["model_b"].map(models).to_numpy()
+    prefs = grouped[pref_col].to_numpy(dtype=float)
+    counts = grouped["count"].to_numpy(dtype=float)
+    n = len(grouped)
+
+    log_base = np.log(base)
+    X = np.zeros((2 * n, p))
+    top = np.arange(n)
+    bot = n + top
+    X[top, m_a_idx] = +log_base
+    X[top, m_b_idx] = -log_base
+    X[bot, m_a_idx] = +log_base
+    X[bot, m_b_idx] = -log_base
+
+    Y = np.concatenate([np.ones(n), np.zeros(n)])
+    sample_weights = np.concatenate([(1.0 - prefs) * counts, prefs * counts])
+
     nonzero = sample_weights > 0
+    if not nonzero.any():
+        return {}
     X = X[nonzero]
     Y = Y[nonzero]
     sample_weights = sample_weights[nonzero]
 
-    if len(X) == 0:
-        return {}
-
     lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000)
     lr.fit(X, Y, sample_weight=sample_weights)
     elo_scores = scale * lr.coef_[0] + init_rating
@@ -327,15 +235,48 @@ def compute_soft_bradley_terry(
     return dict(pd.Series(elo_scores, index=models.index))
 
 
-def _winner_to_pref(winner: str) -> float | None:
-    """Convert a hard winner label to a continuous preference value."""
-    if winner == "model_a":
-        return 0.0
-    elif winner == "model_b":
-        return 1.0
-    elif winner in ("tie", "tie (bothbad)"):
-        return 0.5
-    return None
+def _prefs_to_battle_results(
+    prefs,
+    our_model_is_position_a,
+    opponent_models,
+    model_name: str,
+) -> pd.DataFrame:
+    """Map per-battle judge prefs into model-name-level battle rows.
+
+    The judge prompt placed our model at position A or B independently per
+    battle. Here we re-orient each row so ``model_a``/``model_b`` carry
+    the actual model names and ``pref`` is consistent with that ordering
+    (``pref=0`` ⇒ ``model_a`` wins). ``pref_hard`` is the quantised
+    {0, 0.5, 1} version used by the non-soft Bradley-Terry fit.
+    """
+    records = []
+    for pref, is_pos_a, opp in zip(
+        prefs, our_model_is_position_a, opponent_models, strict=True
+    ):
+        if _is_nan_pref(pref) or pref == 0.5:
+            winner = "tie"
+        elif pref < 0.5:
+            winner = "model_a"
+        else:
+            winner = "model_b"
+
+        if is_pos_a:
+            rec = {
+                "model_a": model_name,
+                "model_b": opp,
+                "winner": winner,
+                "pref": pref,
+            }
+        else:
+            rec = {
+                "model_a": opp,
+                "model_b": model_name,
+                "winner": winner,
+                "pref": None if _is_nan_pref(pref) else 1.0 - pref,
+            }
+        rec["pref_hard"] = _winner_to_pref(winner)
+        records.append(rec)
+    return pd.DataFrame(records)
 
 
 def main(args: CliEloArgs | None = None) -> dict:
@@ -513,33 +454,10 @@ def run_judge() -> pd.DataFrame:
 
     logger.debug("First judge output:\n%s", df_judge["judge_completion"].iloc[0][:500])
 
     # Map preferences back to model-name-level battle results.
-    # Build both hard labels (winner) and continuous prefs for each battle.
     model_name = args.model
-    battle_results = []
-    for pref, is_pos_a, opp_model in zip(
-        prefs, our_model_is_position_a, opponent_models, strict=True
-    ):
-        if pref is None or pref == 0.5:
-            winner = "tie"
-        elif pref < 0.5:
-            winner = "model_a"
-        else:
-            winner = "model_b"
-
-        # Continuous pref is relative to judge positions (A/B).
-        # Remap so that model_a column in the DataFrame always corresponds
-        # to pref=0 and model_b to pref=1.
-        if is_pos_a:
-            battle_results.append(
-                {"model_a": model_name, "model_b": opp_model, "winner": winner, "pref": pref}
-            )
-        else:
-            battle_results.append(
-                {"model_a": opp_model, "model_b": model_name, "winner": winner, "pref": 1.0 - pref if pref is not None else None}
-            )
-
-    # LLM-judge battle results for our model
-    df_llm_judge = pd.DataFrame(battle_results)
+    df_llm_judge = _prefs_to_battle_results(
+        prefs, our_model_is_position_a, opponent_models, model_name
+    )
 
     # Normalize prefs so pref < 0.5 always means our model wins, then summarise
     prefs_normalized = pd.Series(
@@ -569,14 +487,16 @@ def run_judge() -> pd.DataFrame:
         df_arena["model_a"].isin(well_represented)
         & df_arena["model_b"].isin(well_represented)
     ]
-    # Add pref column to arena battles (hard labels → 0.0 / 1.0 / 0.5)
+    # Add pref column to arena battles (hard labels → 0.0 / 1.0 / 0.5).
+    # Human labels are already hard, so pref_hard == pref.
     df_arena["pref"] = df_arena["winner"].map(_winner_to_pref)
+    df_arena["pref_hard"] = df_arena["pref"]
 
     df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)
 
     # Compute human-only BT ratings as ground-truth reference
-    human_elo = compute_bradley_terry(
-        df_arena, winner_col="winner", baseline_model=args.baseline_model
+    human_elo = fit_bradley_terry(
+        df_arena, pref_col="pref_hard", baseline_model=args.baseline_model
     )
 
     # --- Temperature calibration (optional) ---
@@ -585,11 +505,11 @@ def run_judge() -> pd.DataFrame:
     calibrated_temperature: float | None = None
     if args.calibrate_temperature:
         if not args.soft_elo:
-            print(
-                "Warning: --calibrate-temperature has no effect without --soft-elo; skipping."
+            logger.warning(
+                "--calibrate-temperature has no effect without --soft-elo; skipping."
             )
         else:
-            print("\n=== Calibrating PairScore temperature against human annotations ===")
+            logger.info("Calibrating PairScore temperature against human annotations.")
             # Sample calibration battles from the already-loaded arena battles.
             # Use the same judge to score them so scores and labels are comparable.
             _cal_n = (
@@ -597,9 +517,12 @@ def run_judge() -> pd.DataFrame:
                 if args.calibration_size is not None
                 else len(df_arena)
             )
+            # Keep the original df_arena_all index so we can look up the full
+            # conversation rows below; reset_index would re-label the sample
+            # 0..N, and those labels no longer line up with df_arena_all.
             cal_battles = df_arena.sample(
                 n=_cal_n, random_state=int(rng.integers(0, 2**31))
-            ).reset_index(drop=True)
+            )
 
             cal_instructions = [
                 _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][0])
@@ -651,17 +574,19 @@ def run_judge() -> pd.DataFrame:
                 y_cal.append(1.0 - human_pref)  # pref=0 → A wins → y=1
 
             if len(delta_s_cal) < 10:
-                print(
-                    f" Only {len(delta_s_cal)} valid calibration pairs (need ≥10); "
-                    "keeping default temperature."
+                logger.warning(
+                    "Only %d valid calibration pairs (need ≥10); keeping default temperature.",
+                    len(delta_s_cal),
                 )
             else:
                 calibrated_temperature = calibrate_temperature(
                     np.array(delta_s_cal), np.array(y_cal)
                 )
-                print(
-                    f" Calibration pairs: {len(delta_s_cal)}"
-                    f" T* = {calibrated_temperature:.4f} (default was {args.soft_elo_temperature})"
+                logger.info(
+                    "Calibration pairs: %d T* = %.4f (default was %s)",
+                    len(delta_s_cal),
+                    calibrated_temperature,
+                    args.soft_elo_temperature,
                 )
 
     # Build the score parser used for the main evaluation run.
@@ -691,30 +616,9 @@ def _none_to_nan(x):
             prefs = pd.concat([prefs_ab, 1 - prefs_ba]).reset_index(drop=True).tolist()
 
         # Rebuild battle_results with calibrated prefs
-        battle_results = []
-        for pref, is_pos_a, opp_model in zip(
-            prefs, our_model_is_position_a, opponent_models, strict=True
-        ):
-            if pref is None or (isinstance(pref, float) and np.isnan(pref)) or pref == 0.5:
-                winner = "tie"
-            elif pref < 0.5:
-                winner = "model_a"
-            else:
-                winner = "model_b"
-            if is_pos_a:
-                battle_results.append(
-                    {"model_a": model_name, "model_b": opp_model, "winner": winner, "pref": pref}
-                )
-            else:
-                battle_results.append(
-                    {
-                        "model_a": opp_model,
-                        "model_b": model_name,
-                        "winner": winner,
-                        "pref": 1.0 - pref if (pref is not None and not (isinstance(pref, float) and np.isnan(pref))) else None,
-                    }
-                )
-        df_llm_judge = pd.DataFrame(battle_results)
+        df_llm_judge = _prefs_to_battle_results(
+            prefs, our_model_is_position_a, opponent_models, model_name
+        )
         df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)
 
     n_bootstraps = args.n_bootstraps
     use_soft = args.soft_elo
@@ -736,19 +640,15 @@ def _none_to_nan(x):
         battle_counts[row["model_a"]] = battle_counts.get(row["model_a"], 0) + 1
         battle_counts[row["model_b"]] = battle_counts.get(row["model_b"], 0) + 1
 
+    pref_col = "pref" if use_soft else "pref_hard"
     bootstrap_ratings: list[dict[str, float]] = []
     for _ in range(n_bootstraps):
         df_sample = df_results.sample(
             n=len(df_results), replace=True, random_state=int(rng.integers(0, 2**31))
         )
-        if use_soft:
-            ratings = compute_soft_bradley_terry(
-                df_sample, pref_col="pref", baseline_model=args.baseline_model
-            )
-        else:
-            ratings = compute_bradley_terry(
-                df_sample, winner_col="winner", baseline_model=args.baseline_model
-            )
+        ratings = fit_bradley_terry(
+            df_sample, pref_col=pref_col, baseline_model=args.baseline_model
+        )
         bootstrap_ratings.append(ratings)
 
     if bootstrap_ratings:
diff --git a/tests/test_estimate_elo_ratings.py b/tests/test_estimate_elo_ratings.py
index 83f9c8a..c2be142 100644
--- a/tests/test_estimate_elo_ratings.py
+++ b/tests/test_estimate_elo_ratings.py
@@ -5,7 +5,12 @@
 import pytest
 
 import judgearena.estimate_elo_ratings as estimate_elo_ratings
-from judgearena.estimate_elo_ratings import CliEloArgs, compute_bradley_terry, main
+from judgearena.estimate_elo_ratings import (
+    CliEloArgs,
+    _winner_to_pref,
+    fit_bradley_terry,
+    main,
+)
 from judgearena.evaluate import JudgeAnnotation, judge_and_parse_prefs
 from judgearena.utils import make_model
 
@@ -89,7 +94,13 @@ def _default_args(**kwargs) -> CliEloArgs:
     return CliEloArgs(**defaults)
 
 
-# --- compute_bradley_terry unit tests ---
+# --- fit_bradley_terry unit tests ---
+
+
+def _records_with_pref(records: list[dict]) -> pd.DataFrame:
+    df = pd.DataFrame(records)
+    df["pref"] = df["winner"].map(_winner_to_pref)
+    return df
 
 
 def test_bradley_terry_clear_winner():
@@ -97,23 +108,22 @@ def test_bradley_terry_clear_winner():
     """Model A wins all battles → A should have higher rating."""
     records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 10 + [
         {"model_a": "B", "model_b": "A", "winner": "model_b"}
     ] * 10
-    ratings = compute_bradley_terry(pd.DataFrame(records), winner_col="winner")
+    ratings = fit_bradley_terry(_records_with_pref(records))
     assert ratings["A"] > ratings["B"]
 
 
 def test_bradley_terry_all_ties():
     """All ties → ratings should be equal."""
     records = [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 20
-    ratings = compute_bradley_terry(pd.DataFrame(records), winner_col="winner")
+    ratings = fit_bradley_terry(_records_with_pref(records))
     assert abs(ratings["A"] - ratings["B"]) < 1.0
 
 
 def test_bradley_terry_baseline():
     """Baseline model is anchored at baseline_rating."""
     records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 10
-    ratings = compute_bradley_terry(
-        pd.DataFrame(records),
-        winner_col="winner",
+    ratings = fit_bradley_terry(
+        _records_with_pref(records),
         baseline_model="B",
         baseline_rating=1000,
     )
     assert ratings["B"] == 1000.0
     assert ratings["A"] > 1000.0
 
 
+def test_bradley_terry_soft_matches_hard():
+    """Soft prefs ∈ {0, 0.5, 1} must give the same fit as hard winner labels."""
+    records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 7 + [
+        {"model_a": "A", "model_b": "B", "winner": "model_b"}
+    ] * 3 + [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 2
+    df = _records_with_pref(records)
+    hard = fit_bradley_terry(df, pref_col="pref")
+    # Passing the same column twice (continuous == quantised here) must match.
+    df["pref_soft"] = df["pref"].astype(float)
+    soft = fit_bradley_terry(df, pref_col="pref_soft")
+    assert hard["A"] == pytest.approx(soft["A"], abs=1e-3)
+    assert hard["B"] == pytest.approx(soft["B"], abs=1e-3)
+
+
 # --- main() integration tests ---

From b3571167b2ec823f82bef22a5c0b2a0d4e2869f8 Mon Sep 17 00:00:00 2001
From: bora kargi
Date: Tue, 12 May 2026 14:26:14 +0200
Subject: [PATCH 6/8] Remove duplication

---
 README.md                          |   6 +-
 judgearena/estimate_elo_ratings.py | 114 +----------------------------
 2 files changed, 5 insertions(+), 115 deletions(-)

diff --git a/README.md b/README.md
index 1d8146b..5e62300 100644
--- a/README.md
+++ b/README.md
@@ -266,9 +266,9 @@ JudgeArena will run the judge on a sample of human-annotated arena battles, fit
 use it for the full evaluation:
 
 ```bash
-judgearena-elo \
-  --arena LMArena-100k \
-  --model Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \
+judgearena \
+  --task elo-lmarena-100k \
+  --model_A Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \
   --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
   --n_instructions 200 \
   --soft-elo \
diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 3d80c7d..503277d 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -1,5 +1,4 @@
 import hashlib
-import argparse
 from dataclasses import dataclass
 from functools import partial
@@ -8,7 +7,7 @@
 from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
-from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs
+from judgearena.cli_common import BaseCliArgs
 from judgearena.evaluate import judge_and_parse_prefs, calibrate_temperature, PairScore
 from judgearena.generate import generate_instructions
@@ -39,112 +38,6 @@ class CliEloArgs(BaseCliArgs):
     calibrate_temperature: bool = False
     calibration_size: int | None = None
 
-    @classmethod
-    def parse_args(cls):
-        parser = argparse.ArgumentParser(
-            prog="Estimate ELO rating for a model on an Arena (LMArena-100k, LMArena-140k, or ComparIA) with LLM judges",
-        )
-        parser.add_argument(
-            "--arena",
-            help="The arena to use. Battles are sampled from this Arena. If not passed use concatenation from all Arena. "
" - "Passing LMArena leads to loading the union of `LMArena-100k` and `LMArena-140k`", - choices=["LMArena-100k", "LMArena-140k", "ComparIA", "LMArena"], - required=False, - ) - parser.add_argument( - "--model", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--languages", - nargs="+", - default=None, - help='List of language codes to evaluate, e.g. "en fr de" (default: all languages)', - ) - parser.add_argument( - "--n_instructions_per_language", - type=int, - required=False, - help="Maximum number of instructions to keep per language.", - ) - parser.add_argument( - "--n_bootstraps", - type=int, - required=False, - default=20, - help="Number of bootstrap samples for ELO confidence intervals. Default is 20.", - ) - parser.add_argument( - "--seed", - type=int, - required=False, - default=0, - help="Random seed for reproducibility. Default is 0.", - ) - parser.add_argument( - "--baseline_model", - type=str, - required=False, - default=None, - help="Model name to anchor at 1000 ELO. All other ratings are expressed relative to this model. " - "Must be one of the models present in the arena battles. If not set, ratings are not anchored.", - ) - parser.add_argument( - "--soft-elo", - action="store_true", - help="Use continuous judge preferences as soft labels for BT fitting " - "instead of discretising to hard win/loss/tie.", - ) - parser.add_argument( - "--soft-elo-temperature", - type=float, - default=0.3, - help="Initial PairScore temperature used by --soft-elo (default: 0.3). " - "Overridden by --calibrate-temperature if calibration succeeds.", - ) - parser.add_argument( - "--calibrate-temperature", - action="store_true", - help="Calibrate the PairScore temperature T against available human-annotated " - "arena battles before running soft-ELO. Requires --soft-elo.", - ) - parser.add_argument( - "--calibration-size", - type=int, - default=None, - help="Number of human arena battles to sample for temperature calibration. " - "Defaults to all available battles. 
-        )
-        add_common_arguments(parser)
-        args = parser.parse_args()
-
-        return cls(
-            arena=args.arena,
-            model=args.model,
-            n_instructions_per_language=args.n_instructions_per_language,
-            languages=args.languages,
-            n_bootstraps=args.n_bootstraps,
-            seed=args.seed,
-            baseline_model=args.baseline_model,
-            soft_elo=args.soft_elo,
-            soft_elo_temperature=args.soft_elo_temperature,
-            calibrate_temperature=args.calibrate_temperature,
-            calibration_size=args.calibration_size,
-            judge_model=args.judge_model,
-            n_instructions=args.n_instructions,
-            provide_explanation=args.provide_explanation,
-            swap_mode=args.swap_mode,
-            ignore_cache=args.ignore_cache,
-            truncate_all_input_chars=args.truncate_all_input_chars,
-            max_out_tokens_models=args.max_out_tokens_models,
-            max_out_tokens_judge=args.max_out_tokens_judge,
-            max_model_len=args.max_model_len,
-            chat_template=args.chat_template,
-            result_folder=args.result_folder,
-            engine_kwargs=parse_engine_kwargs(args.engine_kwargs),
-        )
-
 
 def _winner_to_pref(winner: str) -> float | None:
     """Convert a hard winner label to a continuous preference value."""
@@ -279,10 +172,7 @@ def _prefs_to_battle_results(
     return pd.DataFrame(records)
 
 
-def main(args: CliEloArgs | None = None) -> dict:
-    if args is None:
-        args = CliEloArgs.parse_args()
-
+def main(args: CliEloArgs) -> dict:
     rng = np.random.default_rng(args.seed)
 
     # Step 1: Load arena battles

From be53e8c24cc4a45e0a2454ef3f3f775cbbdd5cfd Mon Sep 17 00:00:00 2001
From: bora kargi
Date: Tue, 12 May 2026 15:09:01 +0200
Subject: [PATCH 7/8] Fix an edge case when all labels are the same

---
 judgearena/estimate_elo_ratings.py | 47 +++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 503277d..9e6b0c7 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -111,12 +111,11 @@ def fit_bradley_terry(
     Y = np.concatenate([np.ones(n), np.zeros(n)])
     sample_weights = np.concatenate([(1.0 - prefs) * counts, prefs * counts])
 
-    nonzero = sample_weights > 0
-    if not nonzero.any():
+    # Keep zero-weight rows so sklearn LR always sees both Y classes — when
+    # every pref collapses to 0 or 1 the missing-class rows contribute nothing
+    # to the loss but stop the solver from raising on n_classes < 2.
+    if sample_weights.sum() == 0:
         return {}
-    X = X[nonzero]
-    Y = Y[nonzero]
-    sample_weights = sample_weights[nonzero]
 
     lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000)
     lr.fit(X, Y, sample_weight=sample_weights)
@@ -305,7 +304,7 @@ def run_judge() -> pd.DataFrame:
         max_tokens=args.max_out_tokens_judge,
         **judge_extra_kwargs,
     )
-    annotations, _, prefs = judge_and_parse_prefs(
+    annotations, annotations_reversed, prefs = judge_and_parse_prefs(
         judge_chat_model=judge_chat_model,
         instructions=instructions.tolist(),
         completions_A=completions_A,
@@ -315,16 +314,33 @@ def run_judge() -> pd.DataFrame:
         truncate_input_chars=args.truncate_all_input_chars,
         use_tqdm=use_tqdm,
     )
+    if annotations_reversed is None:
+        row_annotations = list(annotations)
+        row_use_model_a = use_model_a_as_opponent
+        row_our_pos_a = our_model_is_position_a
+        row_opponents = list(opponent_models)
+    else:
+        # swap_mode="both": dataframe carries 2n rows (AB then BA).
+        # Position metadata is duplicated; prefs are already oriented
+        # consistently by judge_and_parse_prefs as [pref_AB, 1 - pref_BA].
+        row_annotations = list(annotations) + list(annotations_reversed)
+        row_use_model_a = np.concatenate(
+            [use_model_a_as_opponent, use_model_a_as_opponent]
+        )
+        row_our_pos_a = np.concatenate(
+            [our_model_is_position_a, our_model_is_position_a]
+        )
+        row_opponents = list(opponent_models) + list(opponent_models)
 
     return pd.DataFrame(
         {
-            "judge_completion": [a.judge_completion for a in annotations],
-            "instruction": [a.instruction for a in annotations],
-            "completion_A": [a.completion_A for a in annotations],
-            "completion_B": [a.completion_B for a in annotations],
+            "judge_completion": [a.judge_completion for a in row_annotations],
+            "instruction": [a.instruction for a in row_annotations],
+            "completion_A": [a.completion_A for a in row_annotations],
+            "completion_B": [a.completion_B for a in row_annotations],
             "pref": prefs,
-            "use_model_a_as_opponent": use_model_a_as_opponent,
-            "our_model_is_position_a": our_model_is_position_a,
-            "opponent_model": opponent_models,
+            "use_model_a_as_opponent": row_use_model_a,
+            "our_model_is_position_a": row_our_pos_a,
+            "opponent_model": row_opponents,
         }
     )
 
@@ -363,7 +379,10 @@ def run_judge() -> pd.DataFrame:
     winrate = summary["winrate"]
 
     print(f"\n=== Results for {model_name} ===")
-    print(f"Battles: {n} | Wins: {our_wins} | Losses: {our_losses} | Ties: {our_ties}")
+    print(
+        f"Battles: {len(df_llm_judge)} | Wins: {our_wins} | "
+        f"Losses: {our_losses} | Ties: {our_ties}"
+    )
     print(f"Win rate: {winrate:.2%}")
 
     # Combine LLM-judge battles with human-annotated arena battles,

From 61f1f84ae604ef3d7a6fda8aa6c161a13d749fea Mon Sep 17 00:00:00 2001
From: bora kargi
Date: Tue, 12 May 2026 15:14:07 +0200
Subject: [PATCH 8/8] ruff fix

---
 judgearena/estimate_elo_ratings.py | 18 +++++++++---------
 judgearena/evaluate.py             |  6 ++++--
 tests/test_estimate_elo_ratings.py |  8 +++++---
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 9e6b0c7..1b17637 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -8,7 +8,7 @@
 from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
 from judgearena.cli_common import BaseCliArgs
-from judgearena.evaluate import judge_and_parse_prefs, calibrate_temperature, PairScore
+from judgearena.evaluate import PairScore, calibrate_temperature, judge_and_parse_prefs
 from judgearena.generate import generate_instructions
 from judgearena.log import get_logger
 from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model
@@ -84,9 +84,7 @@ def fit_bradley_terry(
         return {}
 
     grouped = (
-        df.groupby(["model_a", "model_b", pref_col])
-        .size()
-        .reset_index(name="count")
+        df.groupby(["model_a", "model_b", pref_col]).size().reset_index(name="count")
     )
 
     all_models = sorted(set(grouped["model_a"]) | set(grouped["model_b"]))
@@ -498,7 +498,9 @@ def run_judge() -> pd.DataFrame:
 
     # Build the score parser used for the main evaluation run.
     score_parser = PairScore(
-        temperature=calibrated_temperature if calibrated_temperature is not None else args.soft_elo_temperature
+        temperature=calibrated_temperature
+        if calibrated_temperature is not None
+        else args.soft_elo_temperature
     )
 
     # If we calibrated the temperature, the prefs stored in df_judge were
@@ -536,7 +536,9 @@ def _none_to_nan(x):
     n_llm = len(df_llm_judge)
     n_human = len(df_arena)
     method_label = "Soft-ELO" if use_soft else "ELO"
-    print(f"\n=== {method_label} Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===")
+    print(
+        f"\n=== {method_label} Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ==="
+    )
     print(
         f"Estimating {method_label} Ratings with {n_llm} LLM-judges for model {model_name} "
         f"and {n_human} human annotations for other models. Number of battles is indicated in parenthesis and "
@@ -579,9 +581,7 @@ def _none_to_nan(x):
         if overlap:
             abs_errors = [abs(mean_ratings[m] - human_elo[m]) for m in overlap]
             mae = np.mean(abs_errors)
-            print(
-                f"\n MAE vs Human-ELO ({len(overlap)} arena models): {mae:.1f}"
-            )
+            print(f"\n MAE vs Human-ELO ({len(overlap)} arena models): {mae:.1f}")
         else:
             mae = np.nan
             print("\n No overlapping arena models to compute MAE.")
diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py
index 2695abc..863e74a 100644
--- a/judgearena/evaluate.py
+++ b/judgearena/evaluate.py
@@ -6,9 +6,9 @@
 
 import numpy as np
 import pandas as pd
-from scipy.optimize import minimize_scalar
 from langchain_core.language_models.llms import LLM
 from langchain_core.prompts import ChatPromptTemplate
+from scipy.optimize import minimize_scalar
 
 from judgearena.instruction_dataset import load_instructions
 from judgearena.instruction_dataset.arena_hard import (
@@ -93,7 +93,9 @@ def calibrate_temperature(
     y = y[non_tie]
 
     if len(delta_s) == 0:
-        raise ValueError("No non-tie observations available for temperature calibration.")
+        raise ValueError(
+            "No non-tie observations available for temperature calibration."
+        )
 
     # z_i = (2y_i − 1) · Δs_i (positive when the score difference agrees with the outcome)
     z = (2 * y - 1) * delta_s
diff --git a/tests/test_estimate_elo_ratings.py b/tests/test_estimate_elo_ratings.py
index c2be142..e055fdc 100644
--- a/tests/test_estimate_elo_ratings.py
+++ b/tests/test_estimate_elo_ratings.py
@@ -133,9 +133,11 @@ def test_bradley_terry_baseline():
 
 def test_bradley_terry_soft_matches_hard():
     """Soft prefs ∈ {0, 0.5, 1} must give the same fit as hard winner labels."""
-    records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 7 + [
-        {"model_a": "A", "model_b": "B", "winner": "model_b"}
-    ] * 3 + [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 2
+    records = (
+        [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 7
+        + [{"model_a": "A", "model_b": "B", "winner": "model_b"}] * 3
+        + [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 2
+    )
     df = _records_with_pref(records)
     hard = fit_bradley_terry(df, pref_col="pref")
     # Passing the same column twice (continuous == quantised here) must match.