From af4bced4b02c7bffb0f1a267509afe12a0b4689e Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Tue, 14 Apr 2026 15:33:03 +0200
Subject: [PATCH 1/8] Add soft elo

---
 judgearena/estimate_elo_ratings.py | 142 +++++++++++++++++++++++++++--
 judgearena/evaluate.py             |   4 +-
 judgearena/utils.py                |  10 +-
 3 files changed, 140 insertions(+), 16 deletions(-)

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index d7dfbd7..03e2224 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -31,6 +31,7 @@ class CliEloArgs(BaseCliArgs):
     n_bootstraps: int = 20
     seed: int = 0
     baseline_model: str | None = None
+    soft_elo: bool = False
 
     @classmethod
     def parse_args(cls):
@@ -83,6 +84,12 @@ def parse_args(cls):
             help="Model name to anchor at 1000 ELO. All other ratings are expressed relative to this model. "
             "Must be one of the models present in the arena battles. If not set, ratings are not anchored.",
         )
+        parser.add_argument(
+            "--soft-elo",
+            action="store_true",
+            help="Use continuous judge preferences as soft labels for BT fitting "
+            "instead of discretising to hard win/loss/tie.",
+        )
         add_common_arguments(parser)
         args = parser.parse_args()
 
@@ -94,6 +101,7 @@ def parse_args(cls):
             n_bootstraps=args.n_bootstraps,
             seed=args.seed,
             baseline_model=args.baseline_model,
+            soft_elo=args.soft_elo,
             judge_model=args.judge_model,
             n_instructions=args.n_instructions,
             provide_explanation=args.provide_explanation,
@@ -221,6 +229,87 @@ def compute_bradley_terry(
     return dict(pd.Series(elo_scores, index=models.index))
 
 
+def compute_soft_bradley_terry(
+    df: pd.DataFrame,
+    pref_col: str = "pref",
+    scale: float = 400,
+    base: float = 10,
+    init_rating: float = 1000,
+    baseline_model: str | None = None,
+    baseline_rating: float = 1000,
+) -> dict[str, float]:
+    """Compute Bradley-Terry ratings from continuous (soft) preferences.
+
+    Each row in *df* is a single battle with columns ``model_a``, ``model_b``,
+    and *pref_col* ∈ [0, 1] where 0 → A wins, 1 → B wins, 0.5 → tie.
+
+    The soft cross-entropy for a single battle is decomposed into two
+    weighted hard-label rows so that sklearn ``LogisticRegression`` can be
+    reused:
+
+        row 1: Y=1, weight = 1 - pref (evidence for A winning)
+        row 2: Y=0, weight = pref (evidence for B winning)
+    """
+    df = df.dropna(subset=[pref_col]).copy()
+    if df.empty:
+        return {}
+
+    all_models = sorted(set(df["model_a"].unique()) | set(df["model_b"].unique()))
+    models = pd.Series(np.arange(len(all_models)), index=all_models)
+    p = len(models)
+
+    n_battles = len(df)
+    X = np.zeros([2 * n_battles, p])
+    Y = np.zeros(2 * n_battles)
+    sample_weights = np.zeros(2 * n_battles)
+
+    for idx, (_, row) in enumerate(df.iterrows()):
+        m_a = row["model_a"]
+        m_b = row["model_b"]
+        pref = row[pref_col]
+
+        # Row for "A wins" evidence
+        X[2 * idx, models[m_a]] = +np.log(base)
+        X[2 * idx, models[m_b]] = -np.log(base)
+        Y[2 * idx] = 1.0
+        sample_weights[2 * idx] = 1.0 - pref
+
+        # Row for "B wins" evidence
+        X[2 * idx + 1, models[m_a]] = +np.log(base)
+        X[2 * idx + 1, models[m_b]] = -np.log(base)
+        Y[2 * idx + 1] = 0.0
+        sample_weights[2 * idx + 1] = pref
+
+    # Drop rows with zero weight (pure wins have one side = 0)
+    nonzero = sample_weights > 0
+    X = X[nonzero]
+    Y = Y[nonzero]
+    sample_weights = sample_weights[nonzero]
+
+    if len(X) == 0:
+        return {}
+
+    lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000)
+    lr.fit(X, Y, sample_weight=sample_weights)
+    elo_scores = scale * lr.coef_[0] + init_rating
+
+    if baseline_model is not None and baseline_model in models.index:
+        elo_scores += baseline_rating - elo_scores[models[baseline_model]]
+
+    return dict(pd.Series(elo_scores, index=models.index))
+
+
+def _winner_to_pref(winner: str) -> float | None:
+    """Convert a hard winner label to a continuous preference value."""
+    if winner == "model_a":
+        return 0.0
+    elif winner == "model_b":
+        return 1.0
+    elif winner in ("tie", "tie (bothbad)"):
+        return 0.5
+    return None
+
+
 def main(args: CliEloArgs | None = None) -> dict:
     if args is None:
         args = CliEloArgs.parse_args()
@@ -392,7 +481,8 @@ def run_judge() -> pd.DataFrame:
 
     print(f"First judge output:\n{df_judge['judge_completion'].iloc[0][:500]}\n")
 
-    # Map preferences back to model-name-level battle results
+    # Map preferences back to model-name-level battle results.
+    # Build both hard labels (winner) and continuous prefs for each battle.
     model_name = args.model
     battle_results = []
    for pref, is_pos_a, opp_model in zip(
@@ -405,13 +495,16 @@ def run_judge() -> pd.DataFrame:
         else:
             winner = "model_b"
 
+        # Continuous pref is relative to judge positions (A/B).
+        # Remap so that model_a column in the DataFrame always corresponds
+        # to pref=0 and model_b to pref=1.
         if is_pos_a:
             battle_results.append(
-                {"model_a": model_name, "model_b": opp_model, "winner": winner}
+                {"model_a": model_name, "model_b": opp_model, "winner": winner, "pref": pref}
             )
         else:
             battle_results.append(
-                {"model_a": opp_model, "model_b": model_name, "winner": winner}
+                {"model_a": opp_model, "model_b": model_name, "winner": winner, "pref": 1.0 - pref if pref is not None else None}
             )
 
     # LLM-judge battle results for our model
@@ -436,7 +529,7 @@ def run_judge() -> pd.DataFrame:
 
     # Combine LLM-judge battles with human-annotated arena battles,
     # keeping only arena models with at least 500 human battles
-    df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]]
+    df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]].copy()
     human_battle_counts = pd.concat(
         [df_arena["model_a"], df_arena["model_b"]]
     ).value_counts()
@@ -445,16 +538,26 @@ def run_judge() -> pd.DataFrame:
         df_arena["model_a"].isin(well_represented)
         & df_arena["model_b"].isin(well_represented)
     ]
+    # Add pref column to arena battles (hard labels → 0.0 / 1.0 / 0.5)
+    df_arena["pref"] = df_arena["winner"].map(_winner_to_pref)
+
     df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)
 
+    # Compute human-only BT ratings as ground-truth reference
+    human_elo = compute_bradley_terry(
+        df_arena, winner_col="winner", baseline_model=args.baseline_model
+    )
+
     # Bootstrap Bradley-Terry ELO ratings
     n_bootstraps = args.n_bootstraps
+    use_soft = args.soft_elo
     n_llm = len(df_llm_judge)
     n_human = len(df_arena)
-    print(f"\n=== ELO Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===")
+    method_label = "Soft-ELO" if use_soft else "ELO"
+    print(f"\n=== {method_label} Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===")
     print(
-        f"Estimating ELO Ratings with {n_llm} LLM-judges for model {model_name} "
+        f"Estimating {method_label} Ratings with {n_llm} LLM-judges for model {model_name} "
         f"and {n_human} human annotations for other models. Number of battles is indicated in parenthesis and "
         f"confidence intervals are reported by computing ELO on {n_bootstraps} samples of instructions."
     )
 
@@ -470,9 +573,14 @@ def run_judge() -> pd.DataFrame:
         df_sample = df_results.sample(
             n=len(df_results), replace=True, random_state=int(rng.integers(0, 2**31))
         )
-        ratings = compute_bradley_terry(
-            df_sample, winner_col="winner", baseline_model=args.baseline_model
-        )
+        if use_soft:
+            ratings = compute_soft_bradley_terry(
+                df_sample, pref_col="pref", baseline_model=args.baseline_model
+            )
+        else:
+            ratings = compute_bradley_terry(
+                df_sample, winner_col="winner", baseline_model=args.baseline_model
+            )
         bootstrap_ratings.append(ratings)
 
     if bootstrap_ratings:
@@ -488,13 +596,29 @@ def run_judge() -> pd.DataFrame:
             suffix = " <-----" if m == model_name else ""
             count = battle_counts.get(m, 0)
             print(f" {m} ({count}){suffix}: {np.mean(vals):.1f} ± {np.std(vals):.1f}")
+
+        # MAE vs human-only ELO for overlapping arena models
+        overlap = [m for m in all_model_names if m in human_elo and m != model_name]
+        if overlap:
+            abs_errors = [abs(mean_ratings[m] - human_elo[m]) for m in overlap]
+            mae = np.mean(abs_errors)
+            print(
+                f"\n MAE vs Human-ELO ({len(overlap)} arena models): {mae:.1f}"
+            )
+        else:
+            mae = np.nan
+            print("\n No overlapping arena models to compute MAE.")
     else:
         print(" Not enough data to compute ELO ratings.")
+        mae = np.nan
 
     return {
         **summary,
         "bootstrap_ratings": bootstrap_ratings,
+        "human_elo": human_elo,
+        "mae_vs_human": mae,
         "model_name": model_name,
+        "method": method_label,
     }

diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py
index de1c1c1..cd8b700 100644
--- a/judgearena/evaluate.py
+++ b/judgearena/evaluate.py
@@ -26,9 +26,9 @@ class PairScore:
-    def __init__(self):
+    def __init__(self, temperature: float = 0.3):
         super(PairScore).__init__()
-        self.temperature = 0.3
+        self.temperature = temperature
 
     def preference_from_scores(self, score_a: float, score_b: float) -> float:
         return 1 - np.exp(self.temperature * score_a) / (

diff --git a/judgearena/utils.py b/judgearena/utils.py
index b6b0b8d..57ca21f 100644
--- a/judgearena/utils.py
+++ b/judgearena/utils.py
@@ -13,11 +13,6 @@
 from langchain_openai import ChatOpenAI
 from tqdm.asyncio import tqdm
 
-from judgearena.instruction_dataset.arena_hard import (
-    download_arena_hard,
-    is_arena_hard_dataset,
-)
-
 
 def _data_root_path() -> Path:
     raw = os.environ.get("JUDGEARENA_DATA") or os.environ.get("OPENJURY_DATA")
@@ -449,6 +444,11 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):
 
 
 def download_all():
+    from judgearena.instruction_dataset.arena_hard import (
+        download_arena_hard,
+        is_arena_hard_dataset,
+    )
+
     print(f"Downloading all dataset in {data_root}")
     local_path_tables = data_root / "tables"
     for dataset in [

From 898b1e4e07484b9bdfd64dfe38a05aeaeea16be3 Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Fri, 24 Apr 2026 10:58:27 +0200
Subject: [PATCH 2/8] Add temperature calibration

---
 judgearena/estimate_elo_ratings.py | 159 ++++++++++++++++++++++++++++-
 judgearena/evaluate.py             |  55 +++++++++-
 2 files changed, 211 insertions(+), 3 deletions(-)

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 03e2224..4f9d10e 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -9,7 +9,7 @@
 
 from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
 from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs
-from judgearena.evaluate import judge_and_parse_prefs
+from judgearena.evaluate import judge_and_parse_prefs, calibrate_temperature, PairScore
 from judgearena.generate import generate_instructions
 from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model
 
@@ -32,6 +32,8 @@ class CliEloArgs(BaseCliArgs):
     seed: int = 0
     baseline_model: str | None = None
     soft_elo: bool = False
+    calibrate_temperature: bool = False
+    calibration_size: int | None = None
 
     @classmethod
     def parse_args(cls):
@@ -90,6 +92,19 @@ def parse_args(cls):
             help="Use continuous judge preferences as soft labels for BT fitting "
             "instead of discretising to hard win/loss/tie.",
         )
+        parser.add_argument(
+            "--calibrate-temperature",
+            action="store_true",
+            help="Calibrate the PairScore temperature T against available human-annotated "
+            "arena battles before running soft-ELO. Requires --soft-elo.",
+        )
+        parser.add_argument(
+            "--calibration-size",
+            type=int,
+            default=None,
+            help="Number of human arena battles to sample for temperature calibration. "
+            "Defaults to all available battles. Requires --calibrate-temperature.",
+        )
         add_common_arguments(parser)
         args = parser.parse_args()
 
@@ -102,6 +117,8 @@ def parse_args(cls):
             seed=args.seed,
             baseline_model=args.baseline_model,
             soft_elo=args.soft_elo,
+            calibrate_temperature=args.calibrate_temperature,
+            calibration_size=args.calibration_size,
             judge_model=args.judge_model,
             n_instructions=args.n_instructions,
             provide_explanation=args.provide_explanation,
@@ -548,7 +565,144 @@ def run_judge() -> pd.DataFrame:
         df_arena, winner_col="winner", baseline_model=args.baseline_model
     )
 
-    # Bootstrap Bradley-Terry ELO ratings
+    # --- Temperature calibration (optional) ---
+    # Run the judge on a random subset of human arena battles that already
+    # have ground-truth winner labels so we can fit T* via MLE.
+    calibrated_temperature: float | None = None
+    if args.calibrate_temperature:
+        if not args.soft_elo:
+            print(
+                "Warning: --calibrate-temperature has no effect without --soft-elo; skipping."
+            )
+        else:
+            print("\n=== Calibrating PairScore temperature against human annotations ===")
+            # Sample calibration battles from the already-loaded arena battles.
+            # Use the same judge to score them so scores and labels are comparable.
+            _cal_n = (
+                min(args.calibration_size, len(df_arena))
+                if args.calibration_size is not None
+                else len(df_arena)
+            )
+            cal_battles = df_arena.sample(
+                n=_cal_n, random_state=int(rng.integers(0, 2**31))
+            ).reset_index(drop=True)
+
+            cal_instructions = [
+                _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][0])
+                for i in cal_battles.index
+            ]
+            cal_completions_a = [
+                _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][1])
+                for i in cal_battles.index
+            ]
+            cal_completions_b = [
+                _extract_instruction_text(df_arena_all.loc[i, "conversation_b"][1])
+                for i in cal_battles.index
+            ]
+
+            judge_chat_model_cal = make_model(
+                model=args.judge_model,
+                max_tokens=args.max_out_tokens_judge,
+                **judge_extra_kwargs,
+            )
+            cal_annotations, _, cal_prefs = judge_and_parse_prefs(
+                judge_chat_model=judge_chat_model_cal,
+                instructions=cal_instructions,
+                completions_A=cal_completions_a,
+                completions_B=cal_completions_b,
+                swap_mode=args.swap_mode,
+                truncate_input_chars=args.truncate_all_input_chars,
+            )
+
+            # Build (delta_s, y) pairs from calibration battles.
+            # delta_s = score_A - score_B (raw, using default T=1 to extract scores)
+            raw_parser = PairScore(temperature=1.0)
+            delta_s_cal = []
+            y_cal = []
+            for ann, human_winner in zip(
+                cal_annotations, cal_battles["winner"].tolist(), strict=True
+            ):
+                sa = raw_parser.get_regexp_match(
+                    ann.judge_completion.lower(), r'score.*?a[":\s*\n]*(-?\d+)'
+                )
+                sb = raw_parser.get_regexp_match(
+                    ann.judge_completion.lower(), r'score.*?b[":\s*\n]*(-?\d+)'
+                )
+                if sa is None or sb is None:
+                    continue
+                human_pref = _winner_to_pref(human_winner)
+                if human_pref is None or human_pref == 0.5:
+                    continue  # skip ties and missing
+                delta_s_cal.append(sa - sb)
+                y_cal.append(1.0 - human_pref)  # pref=0 → A wins → y=1
+
+            if len(delta_s_cal) < 10:
+                print(
+                    f" Only {len(delta_s_cal)} valid calibration pairs (need ≥10); "
+                    "keeping default temperature."
+                )
+            else:
+                calibrated_temperature = calibrate_temperature(
+                    np.array(delta_s_cal), np.array(y_cal)
+                )
+                print(
+                    f" Calibration pairs: {len(delta_s_cal)}"
+                    f" T* = {calibrated_temperature:.4f} (default was 0.3)"
+                )
+
+    # Build the score parser used for the main evaluation run.
+    score_parser = PairScore(
+        temperature=calibrated_temperature if calibrated_temperature is not None else 0.3
+    )
+
+    # If we calibrated the temperature, the prefs stored in df_judge were
+    # computed with the default T=0.3. Re-parse them with the new parser so
+    # the soft-ELO bootstrap uses calibrated preferences.
+    if calibrated_temperature is not None:
+        new_prefs_ab = pd.Series(
+            [score_parser.parse_model_raw(c) for c in df_judge["judge_completion"]]
+        )
+        prefs = new_prefs_ab.tolist()
+
+        def _none_to_nan(x):
+            return float("nan") if x is None else x
+
+        if args.swap_mode == "both":
+            # df_judge contains AB and BA annotations interleaved; the original
+            # run_judge() already combined them — we just need to re-parse the
+            # stored completions in the same order.
+            n_half = len(df_judge) // 2
+            prefs_ab = new_prefs_ab[:n_half].apply(_none_to_nan)
+            prefs_ba = new_prefs_ab[n_half:].apply(_none_to_nan).reset_index(drop=True)
+            prefs = pd.concat([prefs_ab, 1 - prefs_ba]).reset_index(drop=True).tolist()
+
+        # Rebuild battle_results with calibrated prefs
+        battle_results = []
+        for pref, is_pos_a, opp_model in zip(
+            prefs, our_model_is_position_a, opponent_models, strict=True
+        ):
+            if pref is None or (isinstance(pref, float) and np.isnan(pref)) or pref == 0.5:
+                winner = "tie"
+            elif pref < 0.5:
+                winner = "model_a"
+            else:
+                winner = "model_b"
+            if is_pos_a:
+                battle_results.append(
+                    {"model_a": model_name, "model_b": opp_model, "winner": winner, "pref": pref}
+                )
+            else:
+                battle_results.append(
+                    {
+                        "model_a": opp_model,
+                        "model_b": model_name,
+                        "winner": winner,
+                        "pref": 1.0 - pref if (pref is not None and not (isinstance(pref, float) and np.isnan(pref))) else None,
+                    }
+                )
+        df_llm_judge = pd.DataFrame(battle_results)
+        df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)
+
     n_bootstraps = args.n_bootstraps
     use_soft = args.soft_elo
@@ -619,6 +773,7 @@ def _none_to_nan(x):
         "mae_vs_human": mae,
         "model_name": model_name,
         "method": method_label,
+        "calibrated_temperature": calibrated_temperature,
     }

diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py
index cd8b700..1bd9440 100644
--- a/judgearena/evaluate.py
+++ b/judgearena/evaluate.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+from scipy.optimize import minimize_scalar
 from langchain_core.language_models.llms import LLM
 from langchain_core.prompts import ChatPromptTemplate
 
@@ -56,6 +57,56 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
         return float(m.group(group_index).strip(" "))
 
 
+def calibrate_temperature(
+    delta_s: np.ndarray,
+    y: np.ndarray,
+    bounds: tuple[float, float] = (-10.0, 10.0),
+) -> float:
+    """Find the MLE temperature T* for the model P(A>B) = σ(T·Δs).
+
+    The log-likelihood is:
+
+        L(T) = Σ_i [ y_i·log σ(T·Δs_i) + (1−y_i)·log σ(−T·Δs_i) ]
+             = Σ_i log σ(T · (2y_i − 1) · Δs_i)
+
+    This is concave in T (single global maximum) so ``minimize_scalar`` with
+    the 'bounded' method is guaranteed to converge.
+
+    Args:
+        delta_s: Score differences ``s_A − s_B`` for each battle, shape (N,).
+        y: Observed hard labels (1 = A was preferred, 0 = B was preferred,
+            0.5 = tie). Ties contribute zero gradient and are skipped.
+        bounds: Search interval for T (default −10 to +10).
+
+    Returns:
+        The calibrated temperature T*.
+    """
+    delta_s = np.asarray(delta_s, dtype=float)
+    y = np.asarray(y, dtype=float)
+
+    # Skip ties (y == 0.5) — they carry no directional information.
+    non_tie = y != 0.5
+    delta_s = delta_s[non_tie]
+    y = y[non_tie]
+
+    if len(delta_s) == 0:
+        raise ValueError("No non-tie observations available for temperature calibration.")
+
+    # z_i = (2y_i − 1) · Δs_i (positive when the score difference agrees with the outcome)
+    z = (2 * y - 1) * delta_s
+
+    def neg_log_likelihood(T: float) -> float:
+        # log σ(T·z) = −log(1 + exp(−T·z)) = −logaddexp(0, −T·z)
+        return float(np.sum(np.logaddexp(0.0, -T * z)))
+
+    result = minimize_scalar(
+        neg_log_likelihood,
+        bounds=bounds,
+        method="bounded",
+    )
+    return float(result.x)
+
+
 _COMPLETION_LABEL_SINGLE = "Answer"
 _COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
 _EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
@@ -363,6 +414,7 @@ judge_and_parse_prefs(
     user_prompt_template: str | None = None,
     truncate_input_chars: int = 8192,
     use_tqdm: bool = False,
+    score_parser: "PairScore | None" = None,
 ) -> tuple[list[JudgeAnnotation], list[JudgeAnnotation] | None, pd.Series]:
     """Run judge annotation and parse preferences, handling swap_mode='both'.
 
@@ -407,7 +459,8 @@ def judge_and_parse_prefs(
     def _none_to_nan(x):
         return float("nan") if x is None else x
 
-    score_parser = PairScore()
+    if score_parser is None:
+        score_parser = PairScore()
     prefs = pd.Series(
         [score_parser.parse_model_raw(a.judge_completion) for a in annotations]
     )

From e4498b6a979dbbf6b64cb99dd8bfd32d92288492 Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Fri, 24 Apr 2026 11:01:00 +0200
Subject: [PATCH 3/8] Update README for soft-elo support

---
 README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/README.md b/README.md
index 508ac9f..3d7d59b 100644
--- a/README.md
+++ b/README.md
@@ -245,6 +245,29 @@ uv run python judgearena/estimate_elo_ratings.py \
 | `--n_bootstraps` | `20` | Bootstrap samples for ELO confidence intervals |
 | `--swap_mode` | `fixed` | `fixed`: single judge pass; `both`: correct for position bias |
 | `--result_folder` | `results` | Directory where annotations and results are saved |
+| `--soft-elo` | off | Use continuous judge preferences (soft Bradley-Terry) instead of hard win/loss/tie labels |
+| `--calibrate-temperature` | off | MLE-calibrate the score-to-preference temperature against human arena annotations (requires `--soft-elo`) |
+| `--calibration-size` | all | Number of human battles to sample for calibration (requires `--calibrate-temperature`) |
+
+### Soft-ELO & temperature calibration
+
+By default, judge scores are discretised to hard win/loss/tie labels. Passing `--soft-elo` instead converts the raw score
+difference into a continuous preference via a softmax, which is then fed into a soft Bradley-Terry model.
+
+To let the data choose the best temperature automatically, add `--calibrate-temperature`.
+JudgeArena will run the judge on a sample of human-annotated arena battles, fit the temperature $T^*$ by MLE, and
+use it for the full evaluation:
+
+```bash
+judgearena-elo \
+  --arena LMArena-100k \
+  --model Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \
+  --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
+  --n_instructions 200 \
+  --soft-elo \
+  --calibrate-temperature \
+  --calibration-size 300
+```
 
 ### Output

From 6b401e8e8e4fd34308eb9ec266bdb1409affd40c Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Wed, 29 Apr 2026 14:57:50 +0200
Subject: [PATCH 4/8] Update temperature

---
 README.md                          |  1 +
 judgearena/estimate_elo_ratings.py | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 3d7d59b..f8d409d 100644
--- a/README.md
+++ b/README.md
@@ -246,6 +246,7 @@ uv run python judgearena/estimate_elo_ratings.py \
 | `--swap_mode` | `fixed` | `fixed`: single judge pass; `both`: correct for position bias |
 | `--result_folder` | `results` | Directory where annotations and results are saved |
 | `--soft-elo` | off | Use continuous judge preferences (soft Bradley-Terry) instead of hard win/loss/tie labels |
+| `--soft-elo-temperature` | `0.3` | Initial softmax temperature for `--soft-elo`; overridden if `--calibrate-temperature` succeeds |
 | `--calibrate-temperature` | off | MLE-calibrate the score-to-preference temperature against human arena annotations (requires `--soft-elo`) |
 | `--calibration-size` | all | Number of human battles to sample for calibration (requires `--calibrate-temperature`) |

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 4f9d10e..44fb709 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -32,6 +32,7 @@ class CliEloArgs(BaseCliArgs):
     seed: int = 0
     baseline_model: str | None = None
     soft_elo: bool = False
+    soft_elo_temperature: float = 0.3
     calibrate_temperature: bool = False
     calibration_size: int | None = None
 
@@ -92,6 +93,13 @@ def parse_args(cls):
             help="Use continuous judge preferences as soft labels for BT fitting "
             "instead of discretising to hard win/loss/tie.",
         )
+        parser.add_argument(
+            "--soft-elo-temperature",
+            type=float,
+            default=0.3,
+            help="Initial PairScore temperature used by --soft-elo (default: 0.3). "
+            "Overridden by --calibrate-temperature if calibration succeeds.",
+        )
         parser.add_argument(
             "--calibrate-temperature",
             action="store_true",
@@ -117,6 +125,7 @@ def parse_args(cls):
             seed=args.seed,
             baseline_model=args.baseline_model,
             soft_elo=args.soft_elo,
+            soft_elo_temperature=args.soft_elo_temperature,
             calibrate_temperature=args.calibrate_temperature,
             calibration_size=args.calibration_size,
             judge_model=args.judge_model,
@@ -647,12 +656,12 @@ def run_judge() -> pd.DataFrame:
                 )
                 print(
                     f" Calibration pairs: {len(delta_s_cal)}"
-                    f" T* = {calibrated_temperature:.4f} (default was 0.3)"
+                    f" T* = {calibrated_temperature:.4f} (default was {args.soft_elo_temperature})"
                 )
 
     # Build the score parser used for the main evaluation run.
     score_parser = PairScore(
-        temperature=calibrated_temperature if calibrated_temperature is not None else 0.3
+        temperature=calibrated_temperature if calibrated_temperature is not None else args.soft_elo_temperature
     )

From 995db21f8b0ee5901b341ccecf7cc01eb8fe7056 Mon Sep 17 00:00:00 2001
From: Bora Kargi
Date: Tue, 12 May 2026 13:50:03 +0200
Subject: [PATCH 5/8] Update CLI to unify elo computation

---
 judgearena/cli.py                  |  30 +++
 judgearena/estimate_elo_ratings.py | 350 +++++++++++------------------
 tests/test_estimate_elo_ratings.py |  38 +++-
 3 files changed, 186 insertions(+), 232 deletions(-)

diff --git a/judgearena/cli.py b/judgearena/cli.py
index eb94c83..42840e9 100644
--- a/judgearena/cli.py
+++ b/judgearena/cli.py
@@ -113,6 +113,32 @@ def _build_parser() -> argparse.ArgumentParser:
         default=None,
         help="[elo] Model anchored at 1000 ELO (ratings are reported relative to it).",
     )
+    parser.add_argument(
+        "--soft-elo",
+        action="store_true",
+        help="[elo] Use continuous judge preferences as soft BT targets instead of "
+        "discretising to hard win/loss/tie.",
+    )
+    parser.add_argument(
+        "--soft-elo-temperature",
+        type=float,
+        default=0.3,
+        help="[elo] Initial PairScore temperature for --soft-elo. "
+        "Overridden by --calibrate-temperature if calibration succeeds.",
+    )
+    parser.add_argument(
+        "--calibrate-temperature",
+        action="store_true",
+        help="[elo] MLE-fit the PairScore temperature against human-labeled arena "
+        "battles before the main run. Requires --soft-elo.",
+    )
+    parser.add_argument(
+        "--calibration-size",
+        type=int,
+        default=None,
+        help="[elo] Number of human arena battles to sample for temperature "
+        "calibration. Defaults to all. Requires --calibrate-temperature.",
+    )
     add_common_arguments(parser)
     return parser
 
@@ -191,6 +217,10 @@ def _build_elo_args(
         n_bootstraps=args.n_bootstraps,
         seed=args.seed,
         baseline_model=args.baseline_model,
+        soft_elo=args.soft_elo,
+        soft_elo_temperature=args.soft_elo_temperature,
+        calibrate_temperature=args.calibrate_temperature,
+        calibration_size=args.calibration_size,
         judge_model=args.judge_model,
         n_instructions=args.n_instructions,
         provide_explanation=args.provide_explanation,
diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index ad703e2..3d80c7d 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -1,4 +1,5 @@
 import hashlib
+import argparse
 from dataclasses import dataclass
 from functools import partial
@@ -145,119 +146,22 @@ def parse_args(cls):
         )
 
 
-def compute_bradley_terry(
-    df: pd.DataFrame,
-    winner_col: str,
-    scale: float = 400,
-    base: float = 10,
-    init_rating: float = 1000,
-    baseline_model: str | None = None,
-    baseline_rating: float = 1000,
-) -> dict[str, float]:
-    """
-    Compute Bradley-Terry ratings using MLE (logistic regression).
-
-    This method fits a Bradley-Terry model to pairwise comparison data using
-    maximum likelihood estimation via logistic regression.
-
-    Args:
-        df: DataFrame with columns 'model_a', 'model_b', and the winner column
-        winner_col: Name of the column containing the winner
-        scale: Scale factor for ELO conversion (default 400)
-        base: Base for logarithm in ELO formula (default 10)
-        init_rating: Initial rating offset (default 1000)
-        baseline_model: Model to anchor at baseline_rating
-        baseline_rating: Rating to assign to the baseline model
-
-    Returns:
-        Dictionary mapping model names to their Bradley-Terry ratings
-    """
-    # Get all unique models
-    all_models = sorted(set(df["model_a"].unique()) | set(df["model_b"].unique()))
-
-    # Create pivot tables for wins
-    ptbl_a_win = pd.pivot_table(
-        df[df[winner_col] == "model_a"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-
-    ptbl_b_win = pd.pivot_table(
-        df[df[winner_col] == "model_b"],
-        index="model_a",
-        columns="model_b",
-        aggfunc="size",
-        fill_value=0,
-    )
-
-    # Handle ties
-    if sum(df[winner_col].isin(["tie", "tie (bothbad)"])) == 0:
-        ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models)
-    else:
-        ptbl_tie = pd.pivot_table(
-            df[df[winner_col].isin(["tie", "tie (bothbad)"])],
-            index="model_a",
-            columns="model_b",
-            aggfunc="size",
-            fill_value=0,
-        )
-        ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0)
-        ptbl_tie = ptbl_tie + ptbl_tie.T
-
-    # Reindex all pivot tables to have consistent dimensions
-    ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0)
-    ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0)
-
-    # Combined win matrix (ties count as 0.5 for each)
-    ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie
-
-    models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index)
-
-    p = len(models)
-    X = np.zeros([p * (p - 1) * 2, p])
-    Y = np.zeros(p * (p - 1) * 2)
-
-    cur_row = 0
-    sample_weights = []
-    for m_a in ptbl_win.index:
-        for m_b in ptbl_win.columns:
-            if m_a == m_b:
-                continue
-            # Skip if nan or no battles between this pair
-            w_ab = ptbl_win.loc[m_a, m_b]
-            w_ba = ptbl_win.loc[m_b, m_a]
-            if np.isnan(w_ab) or np.isnan(w_ba):
-                continue
-            if w_ab == 0 and w_ba == 0:
-                continue
-            X[cur_row, models[m_a]] = +np.log(base)
-            X[cur_row, models[m_b]] = -np.log(base)
-            Y[cur_row] = 1.0
-            sample_weights.append(w_ab)
-
-            X[cur_row + 1, models[m_a]] = np.log(base)
-            X[cur_row + 1, models[m_b]] = -np.log(base)
-            Y[cur_row + 1] = 0.0
-            sample_weights.append(w_ba)
-            cur_row += 2
-
-    X = X[:cur_row]
-    Y = Y[:cur_row]
-
-    lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000)
-    lr.fit(X, Y, sample_weight=sample_weights)
-    elo_scores = scale * lr.coef_[0] + init_rating
-
-    # Normalize to baseline model if specified
-    if baseline_model is not None and baseline_model in models.index:
-        elo_scores += baseline_rating - elo_scores[models[baseline_model]]
-
-    return dict(pd.Series(elo_scores, index=models.index))
+def _winner_to_pref(winner: str) -> float | None:
+    """Convert a hard winner label to a continuous preference value."""
+    if winner == "model_a":
+        return 0.0
+    elif winner == "model_b":
+        return 1.0
+    elif winner in ("tie", "tie (bothbad)"):
+        return 0.5
+    return None
 
 
-def compute_soft_bradley_terry(
+def _is_nan_pref(p) -> bool:
+    return p is None or (isinstance(p, float) and np.isnan(p))
+
+
+def fit_bradley_terry(
     df: pd.DataFrame,
     pref_col: str = "pref",
     scale: float = 400,
     base: float = 10,
     init_rating: float = 1000,
     baseline_model: str | None = None,
     baseline_rating: float = 1000,
 ) -> dict[str, float]:
-    """Compute Bradley-Terry ratings from continuous (soft) preferences.
+    """Fit Bradley-Terry ratings via weighted logistic regression.
 
-    Each row in *df* is a single battle with columns ``model_a``, ``model_b``,
-    and *pref_col* ∈ [0, 1] where 0 → A wins, 1 → B wins, 0.5 → tie.
+    Each row in *df* is a battle with columns ``model_a``, ``model_b`` and
+    ``pref_col`` ∈ [0, 1] where 0 means A wins, 1 means B wins, 0.5 is a tie.
+    Hard win/loss/tie labels are the special case ``pref ∈ {0, 0.5, 1}``.
 
-    The soft cross-entropy for a single battle is decomposed into two
-    weighted hard-label rows so that sklearn ``LogisticRegression`` can be
-    reused:
+    The soft cross-entropy for a battle is decomposed into two weighted
+    hard-label rows so sklearn's ``LogisticRegression`` can be reused:
 
-        row 1: Y=1, weight = 1 - pref (evidence for A winning)
-        row 2: Y=0, weight = pref (evidence for B winning)
+        Y=1, weight = (1 − pref) · count (evidence A wins)
+        Y=0, weight = pref · count (evidence B wins)
+
+    Identical ``(model_a, model_b, pref)`` triples are aggregated first so
+    the design matrix stays small when prefs are quantised (e.g. human
+    arena labels) and untouched when prefs are continuous floats.
     """
-    df = df.dropna(subset=[pref_col]).copy()
+    df = df.dropna(subset=[pref_col])
     if df.empty:
         return {}
 
-    all_models = sorted(set(df["model_a"].unique()) | set(df["model_b"].unique()))
+    grouped = (
+        df.groupby(["model_a", "model_b", pref_col])
+        .size()
+        .reset_index(name="count")
+    )
+
+    all_models = sorted(set(grouped["model_a"]) | set(grouped["model_b"]))
     models = pd.Series(np.arange(len(all_models)), index=all_models)
     p = len(models)
 
-    n_battles = len(df)
-    X = np.zeros([2 * n_battles, p])
-    Y = np.zeros(2 * n_battles)
-    sample_weights = np.zeros(2 * n_battles)
-
-    for idx, (_, row) in enumerate(df.iterrows()):
-        m_a = row["model_a"]
-        m_b = row["model_b"]
-        pref = row[pref_col]
-
-        # Row for "A wins" evidence
-        X[2 * idx, models[m_a]] = +np.log(base)
-        X[2 * idx, models[m_b]] = -np.log(base)
-        Y[2 * idx] = 1.0
-        sample_weights[2 * idx] = 1.0 - pref
-
-        # Row for "B wins" evidence
-        X[2 * idx + 1, models[m_a]] = +np.log(base)
-        X[2 * idx + 1, models[m_b]] = -np.log(base)
-        Y[2 * idx + 1] = 0.0
-        sample_weights[2 * idx + 1] = pref
-
-    # Drop rows with zero weight (pure wins have one side = 0)
+    m_a_idx = grouped["model_a"].map(models).to_numpy()
+    m_b_idx = grouped["model_b"].map(models).to_numpy()
+    prefs = grouped[pref_col].to_numpy(dtype=float)
+    counts = grouped["count"].to_numpy(dtype=float)
+    n = len(grouped)
+
+    log_base = np.log(base)
+    X = np.zeros((2 * n, p))
+    top = np.arange(n)
+    bot = n + top
+    X[top, m_a_idx] = +log_base
+    X[top, m_b_idx] = -log_base
+    X[bot, m_a_idx] = +log_base
+    X[bot, m_b_idx] = -log_base
+
+    Y = np.concatenate([np.ones(n), np.zeros(n)])
+    sample_weights = np.concatenate([(1.0 - prefs) * counts, prefs * counts])
+
     nonzero = sample_weights > 0
+    if not nonzero.any():
+        return {}
     X = X[nonzero]
     Y = Y[nonzero]
     sample_weights = sample_weights[nonzero]
 
-    if len(X) == 0:
-        return {}
-
     lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000)
     lr.fit(X, Y, sample_weight=sample_weights)
     elo_scores = scale * lr.coef_[0] + init_rating
@@ -327,15 +235,48 @@ def compute_soft_bradley_terry(
     return dict(pd.Series(elo_scores, index=models.index))
 
 
-def _winner_to_pref(winner: str) -> float | None:
-    """Convert a hard winner label to a continuous preference value."""
-    if winner == "model_a":
-        return 0.0
-    elif winner == "model_b":
-        return 1.0
-    elif winner in ("tie", "tie (bothbad)"):
-        return 0.5
-    return None
+def _prefs_to_battle_results(
+    prefs,
+    our_model_is_position_a,
+    opponent_models,
+    model_name: str,
+) -> pd.DataFrame:
+    """Map per-battle judge prefs into model-name-level battle rows.
+
+    The judge prompt placed our model at position A or B independently per
+    battle. Here we re-orient each row so ``model_a``/``model_b`` carry
+    the actual model names and ``pref`` is consistent with that ordering
+    (``pref=0`` ⇒ ``model_a`` wins). ``pref_hard`` is the quantised
+    {0, 0.5, 1} version used by the non-soft Bradley-Terry fit.
+    """
+    records = []
+    for pref, is_pos_a, opp in zip(
+        prefs, our_model_is_position_a, opponent_models, strict=True
+    ):
+        if _is_nan_pref(pref) or pref == 0.5:
+            winner = "tie"
+        elif pref < 0.5:
+            winner = "model_a"
+        else:
+            winner = "model_b"
+
+        if is_pos_a:
+            rec = {
+                "model_a": model_name,
+                "model_b": opp,
+                "winner": winner,
+                "pref": pref,
+            }
+        else:
+            rec = {
+                "model_a": opp,
+                "model_b": model_name,
+                "winner": winner,
+                "pref": None if _is_nan_pref(pref) else 1.0 - pref,
+            }
+        rec["pref_hard"] = _winner_to_pref(winner)
+        records.append(rec)
+    return pd.DataFrame(records)
 
 
 def main(args: CliEloArgs | None = None) -> dict:
@@ -513,33 +454,10 @@ def run_judge() -> pd.DataFrame:
 
     logger.debug("First judge output:\n%s", df_judge["judge_completion"].iloc[0][:500])
 
     # Map preferences back to model-name-level battle results.
-    # Build both hard labels (winner) and continuous prefs for each battle.
     model_name = args.model
-    battle_results = []
-    for pref, is_pos_a, opp_model in zip(
-        prefs, our_model_is_position_a, opponent_models, strict=True
-    ):
-        if pref is None or pref == 0.5:
-            winner = "tie"
-        elif pref < 0.5:
-            winner = "model_a"
-        else:
-            winner = "model_b"
-
-        # Continuous pref is relative to judge positions (A/B).
-        # Remap so that model_a column in the DataFrame always corresponds
-        # to pref=0 and model_b to pref=1.
-        if is_pos_a:
-            battle_results.append(
-                {"model_a": model_name, "model_b": opp_model, "winner": winner, "pref": pref}
-            )
-        else:
-            battle_results.append(
-                {"model_a": opp_model, "model_b": model_name, "winner": winner, "pref": 1.0 - pref if pref is not None else None}
-            )
-
-    # LLM-judge battle results for our model
-    df_llm_judge = pd.DataFrame(battle_results)
+    df_llm_judge = _prefs_to_battle_results(
+        prefs, our_model_is_position_a, opponent_models, model_name
+    )
 
     # Normalize prefs so pref < 0.5 always means our model wins, then summarise
     prefs_normalized = pd.Series(
@@ -569,14 +487,16 @@ def run_judge() -> pd.DataFrame:
         df_arena["model_a"].isin(well_represented)
         & df_arena["model_b"].isin(well_represented)
     ]
-    # Add pref column to arena battles (hard labels → 0.0 / 1.0 / 0.5)
+    # Add pref column to arena battles (hard labels → 0.0 / 1.0 / 0.5).
+    # Human labels are already hard, so pref_hard == pref.
     df_arena["pref"] = df_arena["winner"].map(_winner_to_pref)
+    df_arena["pref_hard"] = df_arena["pref"]
 
     df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)
 
     # Compute human-only BT ratings as ground-truth reference
-    human_elo = compute_bradley_terry(
-        df_arena, winner_col="winner", baseline_model=args.baseline_model
+    human_elo = fit_bradley_terry(
+        df_arena, pref_col="pref_hard", baseline_model=args.baseline_model
     )
 
     # --- Temperature calibration (optional) ---
@@ -585,11 +505,11 @@ def run_judge() -> pd.DataFrame:
     calibrated_temperature: float | None = None
     if args.calibrate_temperature:
         if not args.soft_elo:
-            print(
-                "Warning: --calibrate-temperature has no effect without --soft-elo; skipping."
+            logger.warning(
+                "--calibrate-temperature has no effect without --soft-elo; skipping."
             )
         else:
-            print("\n=== Calibrating PairScore temperature against human annotations ===")
+            logger.info("Calibrating PairScore temperature against human annotations.")
             # Sample calibration battles from the already-loaded arena battles.
             # Use the same judge to score them so scores and labels are comparable.
             _cal_n = (
@@ -597,9 +517,12 @@ def run_judge() -> pd.DataFrame:
                 if args.calibration_size is not None
                 else len(df_arena)
             )
+            # Keep the original df_arena_all index so we can look up the full
+            # conversation rows below; reset_index would re-label the sample
+            # 0..N, and those labels no longer line up with df_arena_all.
             cal_battles = df_arena.sample(
                 n=_cal_n, random_state=int(rng.integers(0, 2**31))
-            ).reset_index(drop=True)
+            )
 
             cal_instructions = [
                 _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][0])
@@ -651,17 +574,19 @@ def run_judge() -> pd.DataFrame:
                 y_cal.append(1.0 - human_pref)  # pref=0 → A wins → y=1
 
             if len(delta_s_cal) < 10:
-                print(
-                    f" Only {len(delta_s_cal)} valid calibration pairs (need ≥10); "
-                    "keeping default temperature."
+                logger.warning(
+                    "Only %d valid calibration pairs (need ≥10); keeping default temperature.",
+                    len(delta_s_cal),
                 )
             else:
                 calibrated_temperature = calibrate_temperature(
                     np.array(delta_s_cal), np.array(y_cal)
                 )
-                print(
-                    f" Calibration pairs: {len(delta_s_cal)}"
-                    f" T* = {calibrated_temperature:.4f} (default was {args.soft_elo_temperature})"
+                logger.info(
+                    "Calibration pairs: %d T* = %.4f (default was %s)",
+                    len(delta_s_cal),
+                    calibrated_temperature,
+                    args.soft_elo_temperature,
                 )
 
     # Build the score parser used for the main evaluation run.
@@ -691,30 +616,9 @@ def _none_to_nan(x):
             prefs = pd.concat([prefs_ab, 1 - prefs_ba]).reset_index(drop=True).tolist()
 
         # Rebuild battle_results with calibrated prefs
-        battle_results = []
-        for pref, is_pos_a, opp_model in zip(
-            prefs, our_model_is_position_a, opponent_models, strict=True
-        ):
-            if pref is None or (isinstance(pref, float) and np.isnan(pref)) or pref == 0.5:
-                winner = "tie"
-            elif pref < 0.5:
-                winner = "model_a"
-            else:
-                winner = "model_b"
-            if is_pos_a:
-                battle_results.append(
-                    {"model_a": model_name, "model_b": opp_model, "winner": winner, "pref": pref}
-                )
-            else:
-                battle_results.append(
-                    {
-                        "model_a": opp_model,
-                        "model_b": model_name,
-                        "winner": winner,
-                        "pref": 1.0 - pref if (pref is not None and not (isinstance(pref, float) and np.isnan(pref))) else None,
-                    }
-                )
-        df_llm_judge = pd.DataFrame(battle_results)
+        df_llm_judge = _prefs_to_battle_results(
+            prefs, our_model_is_position_a, opponent_models, model_name
+        )
         df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True)
 
     n_bootstraps = args.n_bootstraps
     use_soft = args.soft_elo
@@ -736,19 +640,15 @@ def _none_to_nan(x):
         battle_counts[row["model_a"]] = battle_counts.get(row["model_a"], 0) + 1
         battle_counts[row["model_b"]] = battle_counts.get(row["model_b"], 0) + 1
 
+    pref_col = "pref" if use_soft else "pref_hard"
     bootstrap_ratings: list[dict[str, float]] = []
     for _ in range(n_bootstraps):
         df_sample = df_results.sample(
             n=len(df_results), replace=True, random_state=int(rng.integers(0, 2**31))
         )
-        if use_soft:
-            ratings = compute_soft_bradley_terry(
-                df_sample, pref_col="pref", baseline_model=args.baseline_model
-            )
-        else:
-            ratings = compute_bradley_terry(
-                df_sample, winner_col="winner", baseline_model=args.baseline_model
-            )
+        ratings = fit_bradley_terry(
+            df_sample, pref_col=pref_col, baseline_model=args.baseline_model
+        )
         bootstrap_ratings.append(ratings)
 
     if bootstrap_ratings:
diff --git a/tests/test_estimate_elo_ratings.py b/tests/test_estimate_elo_ratings.py
index 83f9c8a..c2be142 100644
--- a/tests/test_estimate_elo_ratings.py
+++ b/tests/test_estimate_elo_ratings.py
@@ -5,7 +5,12 @@
 import pytest
 
 import judgearena.estimate_elo_ratings as estimate_elo_ratings
-from judgearena.estimate_elo_ratings import CliEloArgs, compute_bradley_terry, main
+from judgearena.estimate_elo_ratings import (
+    CliEloArgs,
+    _winner_to_pref,
+    fit_bradley_terry,
+    main,
+)
 from judgearena.evaluate import JudgeAnnotation, judge_and_parse_prefs
 from judgearena.utils import make_model
 
@@ -89,7 +94,13 @@ def _default_args(**kwargs) -> CliEloArgs:
     return CliEloArgs(**defaults)
 
 
-# --- compute_bradley_terry unit tests ---
+# --- fit_bradley_terry unit tests ---
+
+
+def _records_with_pref(records: list[dict]) -> pd.DataFrame:
+    df = pd.DataFrame(records)
+    df["pref"] = df["winner"].map(_winner_to_pref)
+    return df
 
 
 def test_bradley_terry_clear_winner():
@@ -97,23 +108,22 @@ def test_bradley_terry_clear_winner():
     """Model A wins all battles → A should have higher rating."""
     records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 10 + [
         {"model_a": "B", "model_b": "A", "winner": "model_b"}
     ] * 10
-    ratings = compute_bradley_terry(pd.DataFrame(records), winner_col="winner")
+    ratings = fit_bradley_terry(_records_with_pref(records))
     assert ratings["A"] > ratings["B"]
 
 
 def test_bradley_terry_all_ties():
     """All ties → ratings should be equal."""
     records = [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 20
-    ratings = compute_bradley_terry(pd.DataFrame(records), winner_col="winner")
+    ratings = fit_bradley_terry(_records_with_pref(records))
     assert abs(ratings["A"] - ratings["B"]) < 1.0
 
 
 def test_bradley_terry_baseline():
     """Baseline model is anchored at baseline_rating."""
     records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 10
-    ratings = compute_bradley_terry(
-        pd.DataFrame(records),
-        winner_col="winner",
+    ratings = fit_bradley_terry(
+        _records_with_pref(records),
         baseline_model="B",
         baseline_rating=1000,
     )
     assert ratings["B"] == 1000.0
     assert ratings["A"] > 1000.0
 
 
+def test_bradley_terry_soft_matches_hard():
+    """Soft prefs ∈ {0, 0.5, 1} must give the same fit as hard winner labels."""
+    records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 7 + [
+        {"model_a": "A", "model_b": "B", "winner": "model_b"}
+    ] * 3 + [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 2
+    df = _records_with_pref(records)
+    hard = fit_bradley_terry(df, pref_col="pref")
+    # Passing the same column twice (continuous == quantised here) must match.
+    df["pref_soft"] = df["pref"].astype(float)
+    soft = fit_bradley_terry(df, pref_col="pref_soft")
+    assert hard["A"] == pytest.approx(soft["A"], abs=1e-3)
+    assert hard["B"] == pytest.approx(soft["B"], abs=1e-3)
+
+
 # --- main() integration tests ---

From b3571167b2ec823f82bef22a5c0b2a0d4e2869f8 Mon Sep 17 00:00:00 2001
From: bora kargi
Date: Tue, 12 May 2026 14:26:14 +0200
Subject: [PATCH 6/8] Remove duplication

---
 README.md                          |   6 +-
 judgearena/estimate_elo_ratings.py | 114 +----------------------------
 2 files changed, 5 insertions(+), 115 deletions(-)

diff --git a/README.md b/README.md
index 1d8146b..5e62300 100644
--- a/README.md
+++ b/README.md
@@ -266,9 +266,9 @@ JudgeArena will run the judge on a sample of human-annotated arena battles, fit
 use it for the full evaluation:
 
 ```bash
-judgearena-elo \
-  --arena LMArena-100k \
-  --model Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \
+judgearena \
+  --task elo-lmarena-100k \
+  --model_A Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \
   --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
   --n_instructions 200 \
   --soft-elo \
diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 3d80c7d..503277d 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -1,5 +1,4 @@
 import hashlib
-import argparse
 from dataclasses import dataclass
 from functools import partial
@@ -8,7 +7,7 @@
 from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
-from judgearena.cli_common import BaseCliArgs, add_common_arguments, parse_engine_kwargs
+from judgearena.cli_common import BaseCliArgs
 from judgearena.evaluate import judge_and_parse_prefs, calibrate_temperature, PairScore
 from judgearena.generate import generate_instructions
@@ -39,112 +38,6 @@ class CliEloArgs(BaseCliArgs):
     calibrate_temperature: bool = False
     calibration_size: int | None = None
 
-    @classmethod
-    def parse_args(cls):
-        parser = argparse.ArgumentParser(
-            prog="Estimate ELO rating for a model on an Arena (LMArena-100k, LMArena-140k, or ComparIA) with LLM judges",
-        )
-        parser.add_argument(
-            "--arena",
-            help="The arena to use. Battles are sampled from this Arena. If not passed use concatenation from all Arena. "
" - "Passing LMArena leads to loading the union of `LMArena-100k` and `LMArena-140k`", - choices=["LMArena-100k", "LMArena-140k", "ComparIA", "LMArena"], - required=False, - ) - parser.add_argument( - "--model", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--languages", - nargs="+", - default=None, - help='List of language codes to evaluate, e.g. "en fr de" (default: all languages)', - ) - parser.add_argument( - "--n_instructions_per_language", - type=int, - required=False, - help="Maximum number of instructions to keep per language.", - ) - parser.add_argument( - "--n_bootstraps", - type=int, - required=False, - default=20, - help="Number of bootstrap samples for ELO confidence intervals. Default is 20.", - ) - parser.add_argument( - "--seed", - type=int, - required=False, - default=0, - help="Random seed for reproducibility. Default is 0.", - ) - parser.add_argument( - "--baseline_model", - type=str, - required=False, - default=None, - help="Model name to anchor at 1000 ELO. All other ratings are expressed relative to this model. " - "Must be one of the models present in the arena battles. If not set, ratings are not anchored.", - ) - parser.add_argument( - "--soft-elo", - action="store_true", - help="Use continuous judge preferences as soft labels for BT fitting " - "instead of discretising to hard win/loss/tie.", - ) - parser.add_argument( - "--soft-elo-temperature", - type=float, - default=0.3, - help="Initial PairScore temperature used by --soft-elo (default: 0.3). " - "Overridden by --calibrate-temperature if calibration succeeds.", - ) - parser.add_argument( - "--calibrate-temperature", - action="store_true", - help="Calibrate the PairScore temperature T against available human-annotated " - "arena battles before running soft-ELO. Requires --soft-elo.", - ) - parser.add_argument( - "--calibration-size", - type=int, - default=None, - help="Number of human arena battles to sample for temperature calibration. " - "Defaults to all available battles. 
-        )
-        add_common_arguments(parser)
-        args = parser.parse_args()
-
-        return cls(
-            arena=args.arena,
-            model=args.model,
-            n_instructions_per_language=args.n_instructions_per_language,
-            languages=args.languages,
-            n_bootstraps=args.n_bootstraps,
-            seed=args.seed,
-            baseline_model=args.baseline_model,
-            soft_elo=args.soft_elo,
-            soft_elo_temperature=args.soft_elo_temperature,
-            calibrate_temperature=args.calibrate_temperature,
-            calibration_size=args.calibration_size,
-            judge_model=args.judge_model,
-            n_instructions=args.n_instructions,
-            provide_explanation=args.provide_explanation,
-            swap_mode=args.swap_mode,
-            ignore_cache=args.ignore_cache,
-            truncate_all_input_chars=args.truncate_all_input_chars,
-            max_out_tokens_models=args.max_out_tokens_models,
-            max_out_tokens_judge=args.max_out_tokens_judge,
-            max_model_len=args.max_model_len,
-            chat_template=args.chat_template,
-            result_folder=args.result_folder,
-            engine_kwargs=parse_engine_kwargs(args.engine_kwargs),
-        )
-
 
 def _winner_to_pref(winner: str) -> float | None:
     """Convert a hard winner label to a continuous preference value."""
@@ -279,10 +172,7 @@ def _prefs_to_battle_results(
     return pd.DataFrame(records)
 
 
-def main(args: CliEloArgs | None = None) -> dict:
-    if args is None:
-        args = CliEloArgs.parse_args()
-
+def main(args: CliEloArgs) -> dict:
     rng = np.random.default_rng(args.seed)
 
     # Step 1: Load arena battles

From be53e8c24cc4a45e0a2454ef3f3f775cbbdd5cfd Mon Sep 17 00:00:00 2001
From: bora kargi
Date: Tue, 12 May 2026 15:09:01 +0200
Subject: [PATCH 7/8] Fix an edge case when all labels are the same

---
 judgearena/estimate_elo_ratings.py | 47 +++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 503277d..9e6b0c7 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -111,12 +111,11 @@ def fit_bradley_terry(
     Y = np.concatenate([np.ones(n), np.zeros(n)])
     sample_weights = np.concatenate([(1.0 - prefs) * counts, prefs * counts])
 
-    nonzero = sample_weights > 0
-    if not nonzero.any():
+    # Keep zero-weight rows so sklearn LR always sees both Y classes — when
+    # every pref collapses to 0 or 1 the missing-class rows contribute nothing
+    # to the loss but stop the solver from raising on n_classes < 2.
+    if sample_weights.sum() == 0:
         return {}
-    X = X[nonzero]
-    Y = Y[nonzero]
-    sample_weights = sample_weights[nonzero]
 
     lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000)
     lr.fit(X, Y, sample_weight=sample_weights)
@@ -305,7 +304,7 @@ def run_judge() -> pd.DataFrame:
         max_tokens=args.max_out_tokens_judge,
         **judge_extra_kwargs,
     )
-    annotations, _, prefs = judge_and_parse_prefs(
+    annotations, annotations_reversed, prefs = judge_and_parse_prefs(
         judge_chat_model=judge_chat_model,
         instructions=instructions.tolist(),
         completions_A=completions_A,
@@ -315,16 +314,33 @@ def run_judge() -> pd.DataFrame:
         truncate_input_chars=args.truncate_all_input_chars,
         use_tqdm=use_tqdm,
     )
+    if annotations_reversed is None:
+        row_annotations = list(annotations)
+        row_use_model_a = use_model_a_as_opponent
+        row_our_pos_a = our_model_is_position_a
+        row_opponents = list(opponent_models)
+    else:
+        # swap_mode="both": dataframe carries 2n rows (AB then BA).
+        # Position metadata is duplicated; prefs are already oriented
+        # consistently by judge_and_parse_prefs as [pref_AB, 1 - pref_BA].
+        row_annotations = list(annotations) + list(annotations_reversed)
+        row_use_model_a = np.concatenate(
+            [use_model_a_as_opponent, use_model_a_as_opponent]
+        )
+        row_our_pos_a = np.concatenate(
+            [our_model_is_position_a, our_model_is_position_a]
+        )
+        row_opponents = list(opponent_models) + list(opponent_models)
 
     return pd.DataFrame(
         {
-            "judge_completion": [a.judge_completion for a in annotations],
-            "instruction": [a.instruction for a in annotations],
-            "completion_A": [a.completion_A for a in annotations],
-            "completion_B": [a.completion_B for a in annotations],
+            "judge_completion": [a.judge_completion for a in row_annotations],
+            "instruction": [a.instruction for a in row_annotations],
+            "completion_A": [a.completion_A for a in row_annotations],
+            "completion_B": [a.completion_B for a in row_annotations],
             "pref": prefs,
-            "use_model_a_as_opponent": use_model_a_as_opponent,
-            "our_model_is_position_a": our_model_is_position_a,
-            "opponent_model": opponent_models,
+            "use_model_a_as_opponent": row_use_model_a,
+            "our_model_is_position_a": row_our_pos_a,
+            "opponent_model": row_opponents,
         }
     )
 
@@ -363,7 +379,10 @@ def run_judge() -> pd.DataFrame:
     winrate = summary["winrate"]
 
     print(f"\n=== Results for {model_name} ===")
-    print(f"Battles: {n} | Wins: {our_wins} | Losses: {our_losses} | Ties: {our_ties}")
+    print(
+        f"Battles: {len(df_llm_judge)} | Wins: {our_wins} | "
+        f"Losses: {our_losses} | Ties: {our_ties}"
+    )
     print(f"Win rate: {winrate:.2%}")
 
     # Combine LLM-judge battles with human-annotated arena battles,

From 61f1f84ae604ef3d7a6fda8aa6c161a13d749fea Mon Sep 17 00:00:00 2001
From: bora kargi
Date: Tue, 12 May 2026 15:14:07 +0200
Subject: [PATCH 8/8] ruff fix

---
 judgearena/estimate_elo_ratings.py | 18 +++++++++---------
 judgearena/evaluate.py             |  6 ++++--
 tests/test_estimate_elo_ratings.py |  8 +++++---
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py
index 9e6b0c7..1b17637 100644
--- a/judgearena/estimate_elo_ratings.py
+++ b/judgearena/estimate_elo_ratings.py
@@ -8,7 +8,7 @@
 from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe
 from judgearena.cli_common import BaseCliArgs
-from judgearena.evaluate import judge_and_parse_prefs, calibrate_temperature, PairScore
+from judgearena.evaluate import PairScore, calibrate_temperature, judge_and_parse_prefs
 from judgearena.generate import generate_instructions
 from judgearena.log import get_logger
 from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model
@@ -84,9 +84,7 @@ def fit_bradley_terry(
         return {}
 
     grouped = (
-        df.groupby(["model_a", "model_b", pref_col])
-        .size()
-        .reset_index(name="count")
+        df.groupby(["model_a", "model_b", pref_col]).size().reset_index(name="count")
     )
 
     all_models = sorted(set(grouped["model_a"]) | set(grouped["model_b"]))
@@ -498,7 +498,9 @@ def run_judge() -> pd.DataFrame:
 
     # Build the score parser used for the main evaluation run.
     score_parser = PairScore(
-        temperature=calibrated_temperature if calibrated_temperature is not None else args.soft_elo_temperature
+        temperature=calibrated_temperature
+        if calibrated_temperature is not None
+        else args.soft_elo_temperature
     )
 
     # If we calibrated the temperature, the prefs stored in df_judge were
@@ -536,7 +536,9 @@ def _none_to_nan(x):
     n_llm = len(df_llm_judge)
     n_human = len(df_arena)
     method_label = "Soft-ELO" if use_soft else "ELO"
-    print(f"\n=== {method_label} Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===")
+    print(
+        f"\n=== {method_label} Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ==="
+    )
     print(
         f"Estimating {method_label} Ratings with {n_llm} LLM-judges for model {model_name} "
         f"and {n_human} human annotations for other models. Number of battles is indicated in parenthesis and "
@@ -579,9 +581,7 @@ def _none_to_nan(x):
         if overlap:
             abs_errors = [abs(mean_ratings[m] - human_elo[m]) for m in overlap]
             mae = np.mean(abs_errors)
-            print(
-                f"\n MAE vs Human-ELO ({len(overlap)} arena models): {mae:.1f}"
-            )
+            print(f"\n MAE vs Human-ELO ({len(overlap)} arena models): {mae:.1f}")
         else:
             mae = np.nan
             print("\n No overlapping arena models to compute MAE.")
diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py
index 2695abc..863e74a 100644
--- a/judgearena/evaluate.py
+++ b/judgearena/evaluate.py
@@ -6,9 +6,9 @@
 
 import numpy as np
 import pandas as pd
-from scipy.optimize import minimize_scalar
 from langchain_core.language_models.llms import LLM
 from langchain_core.prompts import ChatPromptTemplate
+from scipy.optimize import minimize_scalar
 
 from judgearena.instruction_dataset import load_instructions
 from judgearena.instruction_dataset.arena_hard import (
@@ -93,7 +93,9 @@ def calibrate_temperature(
     y = y[non_tie]
 
     if len(delta_s) == 0:
-        raise ValueError("No non-tie observations available for temperature calibration.")
+        raise ValueError(
+            "No non-tie observations available for temperature calibration."
+        )
 
     # z_i = (2y_i − 1) · Δs_i (positive when the score difference agrees with the outcome)
     z = (2 * y - 1) * delta_s
diff --git a/tests/test_estimate_elo_ratings.py b/tests/test_estimate_elo_ratings.py
index c2be142..e055fdc 100644
--- a/tests/test_estimate_elo_ratings.py
+++ b/tests/test_estimate_elo_ratings.py
@@ -133,9 +133,11 @@ def test_bradley_terry_baseline():
 
 def test_bradley_terry_soft_matches_hard():
     """Soft prefs ∈ {0, 0.5, 1} must give the same fit as hard winner labels."""
-    records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 7 + [
-        {"model_a": "A", "model_b": "B", "winner": "model_b"}
-    ] * 3 + [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 2
+    records = (
+        [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 7
+        + [{"model_a": "A", "model_b": "B", "winner": "model_b"}] * 3
+        + [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 2
+    )
     df = _records_with_pref(records)
     hard = fit_bradley_terry(df, pref_col="pref")
     # Passing the same column twice (continuous == quantised here) must match.