diff --git a/README.md b/README.md index 306443d..5e62300 100644 --- a/README.md +++ b/README.md @@ -251,6 +251,30 @@ judgearena \ | `--n_bootstraps` | `20` | Bootstrap samples for ELO confidence intervals | | `--swap_mode` | `fixed` | `fixed`: single judge pass; `both`: correct for position bias | | `--result_folder` | `results` | Directory where annotations and results are saved | +| `--soft-elo` | off | Use continuous judge preferences (soft Bradley-Terry) instead of hard win/loss/tie labels | +| `--soft-elo-temperature` | `0.3` | Initial softmax temperature for `--soft-elo`; overridden if `--calibrate-temperature` succeeds | +| `--calibrate-temperature` | off | MLE-calibrate the score-to-preference temperature against human arena annotations (requires `--soft-elo`) | +| `--calibration-size` | all | Number of human battles to sample for calibration (requires `--calibrate-temperature`) | + +### Soft-ELO & temperature calibration + +By default, judge scores are discretised to hard win/loss/tie labels. Passing `--soft-elo` instead converts the raw score +difference into a continuous preference via a softmax, which is then fed into a soft Bradley-Terry model. + +To let the data choose the best temperature automatically, add `--calibrate-temperature`. +JudgeArena will run the judge on a sample of human-annotated arena battles, fit the temperature $T^*$ by MLE, and +use it for the full evaluation: + +```bash +judgearena \ + --task elo-lmarena-100k \ + --model_A Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \ + --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ + --n_instructions 200 \ + --soft-elo \ + --calibrate-temperature \ + --calibration-size 300 +``` ### Output diff --git a/judgearena/cli.py b/judgearena/cli.py index eb94c83..42840e9 100644 --- a/judgearena/cli.py +++ b/judgearena/cli.py @@ -113,6 +113,32 @@ def _build_parser() -> argparse.ArgumentParser: default=None, help="[elo] Model anchored at 1000 ELO (ratings are reported relative to it).", ) + parser.add_argument( + "--soft-elo", + action="store_true", + help="[elo] Use continuous judge preferences as soft BT targets instead of " + "discretising to hard win/loss/tie.", + ) + parser.add_argument( + "--soft-elo-temperature", + type=float, + default=0.3, + help="[elo] Initial PairScore temperature for --soft-elo. " + "Overridden by --calibrate-temperature if calibration succeeds.", + ) + parser.add_argument( + "--calibrate-temperature", + action="store_true", + help="[elo] MLE-fit the PairScore temperature against human-labeled arena " + "battles before the main run. Requires --soft-elo.", + ) + parser.add_argument( + "--calibration-size", + type=int, + default=None, + help="[elo] Number of human arena battles to sample for temperature " + "calibration. Defaults to all. 
Requires --calibrate-temperature.", + ) add_common_arguments(parser) return parser @@ -191,6 +217,10 @@ def _build_elo_args( n_bootstraps=args.n_bootstraps, seed=args.seed, baseline_model=args.baseline_model, + soft_elo=args.soft_elo, + soft_elo_temperature=args.soft_elo_temperature, + calibrate_temperature=args.calibrate_temperature, + calibration_size=args.calibration_size, judge_model=args.judge_model, n_instructions=args.n_instructions, provide_explanation=args.provide_explanation, diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py index 51ba6e2..1b17637 100644 --- a/judgearena/estimate_elo_ratings.py +++ b/judgearena/estimate_elo_ratings.py @@ -8,7 +8,7 @@ from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe from judgearena.cli_common import BaseCliArgs -from judgearena.evaluate import judge_and_parse_prefs +from judgearena.evaluate import PairScore, calibrate_temperature, judge_and_parse_prefs from judgearena.generate import generate_instructions from judgearena.log import get_logger from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model @@ -33,120 +33,142 @@ class CliEloArgs(BaseCliArgs): n_bootstraps: int = 20 seed: int = 0 baseline_model: str | None = None + soft_elo: bool = False + soft_elo_temperature: float = 0.3 + calibrate_temperature: bool = False + calibration_size: int | None = None -def compute_bradley_terry( +def _winner_to_pref(winner: str) -> float | None: + """Convert a hard winner label to a continuous preference value.""" + if winner == "model_a": + return 0.0 + elif winner == "model_b": + return 1.0 + elif winner in ("tie", "tie (bothbad)"): + return 0.5 + return None + + +def _is_nan_pref(p) -> bool: + return p is None or (isinstance(p, float) and np.isnan(p)) + + +def fit_bradley_terry( df: pd.DataFrame, - winner_col: str, + pref_col: str = "pref", scale: float = 400, base: float = 10, init_rating: float = 1000, baseline_model: str | None = None, baseline_rating: float = 1000, ) -> dict[str, float]: - """ - Compute Bradley-Terry ratings using MLE (logistic regression). - - This method fits a Bradley-Terry model to pairwise comparison data using - maximum likelihood estimation via logistic regression. - - Args: - df: DataFrame with columns 'model_a', 'model_b', and the winner column - winner_col: Name of the column containing the winner - scale: Scale factor for ELO conversion (default 400) - base: Base for logarithm in ELO formula (default 10) - init_rating: Initial rating offset (default 1000) - baseline_model: Model to anchor at baseline_rating - baseline_rating: Rating to assign to the baseline model - - Returns: - Dictionary mapping model names to their Bradley-Terry ratings - """ - # Get all unique models - all_models = sorted(set(df["model_a"].unique()) | set(df["model_b"].unique())) - - # Create pivot tables for wins - ptbl_a_win = pd.pivot_table( - df[df[winner_col] == "model_a"], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) + """Fit Bradley-Terry ratings via weighted logistic regression. - ptbl_b_win = pd.pivot_table( - df[df[winner_col] == "model_b"], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) + Each row in *df* is a battle with columns ``model_a``, ``model_b`` and + ``pref_col`` ∈ [0, 1] where 0 means A wins, 1 means B wins, 0.5 is a tie. + Hard win/loss/tie labels are the special case ``pref ∈ {0, 0.5, 1}``. 
- # Handle ties - if sum(df[winner_col].isin(["tie", "tie (bothbad)"])) == 0: - ptbl_tie = pd.DataFrame(0, index=all_models, columns=all_models) - else: - ptbl_tie = pd.pivot_table( - df[df[winner_col].isin(["tie", "tie (bothbad)"])], - index="model_a", - columns="model_b", - aggfunc="size", - fill_value=0, - ) - ptbl_tie = ptbl_tie.reindex(index=all_models, columns=all_models, fill_value=0) - ptbl_tie = ptbl_tie + ptbl_tie.T + The soft cross-entropy for a battle is decomposed into two weighted + hard-label rows so sklearn's ``LogisticRegression`` can be reused: - # Reindex all pivot tables to have consistent dimensions - ptbl_a_win = ptbl_a_win.reindex(index=all_models, columns=all_models, fill_value=0) - ptbl_b_win = ptbl_b_win.reindex(index=all_models, columns=all_models, fill_value=0) + Y=1, weight = (1 − pref) · count (evidence A wins) + Y=0, weight = pref · count (evidence B wins) - # Combined win matrix (ties count as 0.5 for each) - ptbl_win = ptbl_a_win * 2 + ptbl_b_win.T * 2 + ptbl_tie + Identical ``(model_a, model_b, pref)`` triples are aggregated first so + the design matrix stays small when prefs are quantised (e.g. human + arena labels) and untouched when prefs are continuous floats. + """ + df = df.dropna(subset=[pref_col]) + if df.empty: + return {} - models = pd.Series(np.arange(len(ptbl_win.index)), index=ptbl_win.index) + grouped = ( + df.groupby(["model_a", "model_b", pref_col]).size().reset_index(name="count") + ) + all_models = sorted(set(grouped["model_a"]) | set(grouped["model_b"])) + models = pd.Series(np.arange(len(all_models)), index=all_models) p = len(models) - X = np.zeros([p * (p - 1) * 2, p]) - Y = np.zeros(p * (p - 1) * 2) - - cur_row = 0 - sample_weights = [] - for m_a in ptbl_win.index: - for m_b in ptbl_win.columns: - if m_a == m_b: - continue - # Skip if nan or no battles between this pair - w_ab = ptbl_win.loc[m_a, m_b] - w_ba = ptbl_win.loc[m_b, m_a] - if np.isnan(w_ab) or np.isnan(w_ba): - continue - if w_ab == 0 and w_ba == 0: - continue - X[cur_row, models[m_a]] = +np.log(base) - X[cur_row, models[m_b]] = -np.log(base) - Y[cur_row] = 1.0 - sample_weights.append(w_ab) - - X[cur_row + 1, models[m_a]] = np.log(base) - X[cur_row + 1, models[m_b]] = -np.log(base) - Y[cur_row + 1] = 0.0 - sample_weights.append(w_ba) - cur_row += 2 - - X = X[:cur_row] - Y = Y[:cur_row] + + m_a_idx = grouped["model_a"].map(models).to_numpy() + m_b_idx = grouped["model_b"].map(models).to_numpy() + prefs = grouped[pref_col].to_numpy(dtype=float) + counts = grouped["count"].to_numpy(dtype=float) + n = len(grouped) + + log_base = np.log(base) + X = np.zeros((2 * n, p)) + top = np.arange(n) + bot = n + top + X[top, m_a_idx] = +log_base + X[top, m_b_idx] = -log_base + X[bot, m_a_idx] = +log_base + X[bot, m_b_idx] = -log_base + + Y = np.concatenate([np.ones(n), np.zeros(n)]) + sample_weights = np.concatenate([(1.0 - prefs) * counts, prefs * counts]) + + # Keep zero-weight rows so sklearn LR always sees both Y classes — when + # every pref collapses to 0 or 1 the missing-class rows contribute nothing + # to the loss but stop the solver from raising on n_classes < 2. 
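+    #
+    # Worked example (hypothetical numbers): a grouped row with pref = 0.3 and
+    # count = 4 becomes a Y=1 row with weight (1 - 0.3) * 4 = 2.8 and a Y=0 row
+    # with weight 0.3 * 4 = 1.2, reproducing 4 times the soft cross-entropy
+    # -(0.7 * log(p) + 0.3 * log(1 - p)) where p is the fitted P(A beats B).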
+ if sample_weights.sum() == 0: + return {} lr = LogisticRegression(fit_intercept=False, C=1e10, tol=1e-6, max_iter=1000) lr.fit(X, Y, sample_weight=sample_weights) elo_scores = scale * lr.coef_[0] + init_rating - # Normalize to baseline model if specified if baseline_model is not None and baseline_model in models.index: elo_scores += baseline_rating - elo_scores[models[baseline_model]] return dict(pd.Series(elo_scores, index=models.index)) +def _prefs_to_battle_results( + prefs, + our_model_is_position_a, + opponent_models, + model_name: str, +) -> pd.DataFrame: + """Map per-battle judge prefs into model-name-level battle rows. + + The judge prompt placed our model at position A or B independently per + battle. Here we re-orient each row so ``model_a``/``model_b`` carry + the actual model names and ``pref`` is consistent with that ordering + (``pref=0`` ⇒ ``model_a`` wins). ``pref_hard`` is the quantised + {0, 0.5, 1} version used by the non-soft Bradley-Terry fit. + """ + records = [] + for pref, is_pos_a, opp in zip( + prefs, our_model_is_position_a, opponent_models, strict=True + ): + if _is_nan_pref(pref) or pref == 0.5: + winner = "tie" + elif pref < 0.5: + winner = "model_a" + else: + winner = "model_b" + + if is_pos_a: + rec = { + "model_a": model_name, + "model_b": opp, + "winner": winner, + "pref": pref, + } + else: + rec = { + "model_a": opp, + "model_b": model_name, + "winner": winner, + "pref": None if _is_nan_pref(pref) else 1.0 - pref, + } + rec["pref_hard"] = _winner_to_pref(winner) + records.append(rec) + return pd.DataFrame(records) + + def main(args: CliEloArgs) -> dict: rng = np.random.default_rng(args.seed) @@ -280,7 +302,7 @@ def run_judge() -> pd.DataFrame: max_tokens=args.max_out_tokens_judge, **judge_extra_kwargs, ) - annotations, _, prefs = judge_and_parse_prefs( + annotations, annotations_reversed, prefs = judge_and_parse_prefs( judge_chat_model=judge_chat_model, instructions=instructions.tolist(), completions_A=completions_A, @@ -290,16 +312,33 @@ def run_judge() -> pd.DataFrame: truncate_input_chars=args.truncate_all_input_chars, use_tqdm=use_tqdm, ) + if annotations_reversed is None: + row_annotations = list(annotations) + row_use_model_a = use_model_a_as_opponent + row_our_pos_a = our_model_is_position_a + row_opponents = list(opponent_models) + else: + # swap_mode="both": dataframe carries 2n rows (AB then BA). + # Position metadata is duplicated; prefs are already oriented + # consistently by judge_and_parse_prefs as [pref_AB, 1 - pref_BA]. 
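+            # For example, with n = 2 battles the frame rows are
+            # [b0_AB, b1_AB, b0_BA, b1_BA]; prefs[n + i] already equals
+            # 1 - pref_BA for battle i, so its position metadata is reused as-is.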
+ row_annotations = list(annotations) + list(annotations_reversed) + row_use_model_a = np.concatenate( + [use_model_a_as_opponent, use_model_a_as_opponent] + ) + row_our_pos_a = np.concatenate( + [our_model_is_position_a, our_model_is_position_a] + ) + row_opponents = list(opponent_models) + list(opponent_models) return pd.DataFrame( { - "judge_completion": [a.judge_completion for a in annotations], - "instruction": [a.instruction for a in annotations], - "completion_A": [a.completion_A for a in annotations], - "completion_B": [a.completion_B for a in annotations], + "judge_completion": [a.judge_completion for a in row_annotations], + "instruction": [a.instruction for a in row_annotations], + "completion_A": [a.completion_A for a in row_annotations], + "completion_B": [a.completion_B for a in row_annotations], "pref": prefs, - "use_model_a_as_opponent": use_model_a_as_opponent, - "our_model_is_position_a": our_model_is_position_a, - "opponent_model": opponent_models, + "use_model_a_as_opponent": row_use_model_a, + "our_model_is_position_a": row_our_pos_a, + "opponent_model": row_opponents, } ) @@ -318,30 +357,11 @@ def run_judge() -> pd.DataFrame: logger.debug("First judge output:\n%s", df_judge["judge_completion"].iloc[0][:500]) - # Map preferences back to model-name-level battle results + # Map preferences back to model-name-level battle results. model_name = args.model - battle_results = [] - for pref, is_pos_a, opp_model in zip( - prefs, our_model_is_position_a, opponent_models, strict=True - ): - if pref is None or pref == 0.5: - winner = "tie" - elif pref < 0.5: - winner = "model_a" - else: - winner = "model_b" - - if is_pos_a: - battle_results.append( - {"model_a": model_name, "model_b": opp_model, "winner": winner} - ) - else: - battle_results.append( - {"model_a": opp_model, "model_b": model_name, "winner": winner} - ) - - # LLM-judge battle results for our model - df_llm_judge = pd.DataFrame(battle_results) + df_llm_judge = _prefs_to_battle_results( + prefs, our_model_is_position_a, opponent_models, model_name + ) # Normalize prefs so pref < 0.5 always means our model wins, then summarise prefs_normalized = pd.Series( @@ -357,12 +377,15 @@ def run_judge() -> pd.DataFrame: winrate = summary["winrate"] print(f"\n=== Results for {model_name} ===") - print(f"Battles: {n} | Wins: {our_wins} | Losses: {our_losses} | Ties: {our_ties}") + print( + f"Battles: {len(df_llm_judge)} | Wins: {our_wins} | " + f"Losses: {our_losses} | Ties: {our_ties}" + ) print(f"Win rate: {winrate:.2%}") # Combine LLM-judge battles with human-annotated arena battles, # keeping only arena models with at least 500 human battles - df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]] + df_arena = df_arena_all.loc[:, ["model_a", "model_b", "winner"]].copy() human_battle_counts = pd.concat( [df_arena["model_a"], df_arena["model_b"]] ).value_counts() @@ -371,16 +394,153 @@ def run_judge() -> pd.DataFrame: df_arena["model_a"].isin(well_represented) & df_arena["model_b"].isin(well_represented) ] + # Add pref column to arena battles (hard labels → 0.0 / 1.0 / 0.5). + # Human labels are already hard, so pref_hard == pref. 
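+    # (_winner_to_pref maps "model_a" -> 0.0, "model_b" -> 1.0, "tie"/"tie (bothbad)"
+    # -> 0.5; any other label becomes None and is dropped by fit_bradley_terry.)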
+ df_arena["pref"] = df_arena["winner"].map(_winner_to_pref) + df_arena["pref_hard"] = df_arena["pref"] + df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True) - # Bootstrap Bradley-Terry ELO ratings + # Compute human-only BT ratings as ground-truth reference + human_elo = fit_bradley_terry( + df_arena, pref_col="pref_hard", baseline_model=args.baseline_model + ) + + # --- Temperature calibration (optional) --- + # Run the judge on a random subset of human arena battles that already + # have ground-truth winner labels so we can fit T* via MLE. + calibrated_temperature: float | None = None + if args.calibrate_temperature: + if not args.soft_elo: + logger.warning( + "--calibrate-temperature has no effect without --soft-elo; skipping." + ) + else: + logger.info("Calibrating PairScore temperature against human annotations.") + # Sample calibration battles from the already-loaded arena battles. + # Use the same judge to score them so scores and labels are comparable. + _cal_n = ( + min(args.calibration_size, len(df_arena)) + if args.calibration_size is not None + else len(df_arena) + ) + # Keep the original df_arena_all index so we can look up the full + # conversation rows below; reset_index would point at non-existent + # 0..N labels in df_arena_all. + cal_battles = df_arena.sample( + n=_cal_n, random_state=int(rng.integers(0, 2**31)) + ) + + cal_instructions = [ + _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][0]) + for i in cal_battles.index + ] + cal_completions_a = [ + _extract_instruction_text(df_arena_all.loc[i, "conversation_a"][1]) + for i in cal_battles.index + ] + cal_completions_b = [ + _extract_instruction_text(df_arena_all.loc[i, "conversation_b"][1]) + for i in cal_battles.index + ] + + judge_chat_model_cal = make_model( + model=args.judge_model, + max_tokens=args.max_out_tokens_judge, + **judge_extra_kwargs, + ) + cal_annotations, _, cal_prefs = judge_and_parse_prefs( + judge_chat_model=judge_chat_model_cal, + instructions=cal_instructions, + completions_A=cal_completions_a, + completions_B=cal_completions_b, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + ) + + # Build (delta_s, y) pairs from calibration battles. + # delta_s = score_A - score_B (raw, using default T=1 to extract scores) + raw_parser = PairScore(temperature=1.0) + delta_s_cal = [] + y_cal = [] + for ann, human_winner in zip( + cal_annotations, cal_battles["winner"].tolist(), strict=True + ): + sa = raw_parser.get_regexp_match( + ann.judge_completion.lower(), r'score.*?a[":\s*\n]*(-?\d+)' + ) + sb = raw_parser.get_regexp_match( + ann.judge_completion.lower(), r'score.*?b[":\s*\n]*(-?\d+)' + ) + if sa is None or sb is None: + continue + human_pref = _winner_to_pref(human_winner) + if human_pref is None or human_pref == 0.5: + continue # skip ties and missing + delta_s_cal.append(sa - sb) + y_cal.append(1.0 - human_pref) # pref=0 → A wins → y=1 + + if len(delta_s_cal) < 10: + logger.warning( + "Only %d valid calibration pairs (need ≥10); keeping default temperature.", + len(delta_s_cal), + ) + else: + calibrated_temperature = calibrate_temperature( + np.array(delta_s_cal), np.array(y_cal) + ) + logger.info( + "Calibration pairs: %d T* = %.4f (default was %s)", + len(delta_s_cal), + calibrated_temperature, + args.soft_elo_temperature, + ) + + # Build the score parser used for the main evaluation run. 
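+    # PairScore turns the two judge scores into a softmax preference at temperature T
+    # (effectively σ(T · (s_B − s_A))): pref < 0.5 favours the position-A completion,
+    # and a larger T pushes prefs towards hard 0/1 labels.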
+ score_parser = PairScore( + temperature=calibrated_temperature + if calibrated_temperature is not None + else args.soft_elo_temperature + ) + + # If we calibrated the temperature, the prefs stored in df_judge were + # computed with the default T=0.3. Re-parse them with the new parser so + # the soft-ELO bootstrap uses calibrated preferences. + if calibrated_temperature is not None: + new_prefs_ab = pd.Series( + [score_parser.parse_model_raw(c) for c in df_judge["judge_completion"]] + ) + prefs = new_prefs_ab.tolist() + + def _none_to_nan(x): + return float("nan") if x is None else x + + if args.swap_mode == "both": + # df_judge contains AB and BA annotations interleaved; the original + # run_judge() already combined them — we just need to re-parse the + # stored completions in the same order. + n_half = len(df_judge) // 2 + prefs_ab = new_prefs_ab[:n_half].apply(_none_to_nan) + prefs_ba = new_prefs_ab[n_half:].apply(_none_to_nan).reset_index(drop=True) + prefs = pd.concat([prefs_ab, 1 - prefs_ba]).reset_index(drop=True).tolist() + + # Rebuild battle_results with calibrated prefs + df_llm_judge = _prefs_to_battle_results( + prefs, our_model_is_position_a, opponent_models, model_name + ) + df_results = pd.concat([df_llm_judge, df_arena], ignore_index=True) + n_bootstraps = args.n_bootstraps + use_soft = args.soft_elo n_llm = len(df_llm_judge) n_human = len(df_arena) - print(f"\n=== ELO Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===") + method_label = "Soft-ELO" if use_soft else "ELO" print( - f"Estimating ELO Ratings with {n_llm} LLM-judges for model {model_name} " + f"\n=== {method_label} Ratings (Bradley-Terry, {n_bootstraps} bootstraps) ===" + ) + print( + f"Estimating {method_label} Ratings with {n_llm} LLM-judges for model {model_name} " f"and {n_human} human annotations for other models. Number of battles is indicated in parenthesis and " f"confidence intervals are reported by computing ELO on {n_bootstraps} samples of instructions." 
) @@ -391,13 +551,14 @@ def run_judge() -> pd.DataFrame: battle_counts[row["model_a"]] = battle_counts.get(row["model_a"], 0) + 1 battle_counts[row["model_b"]] = battle_counts.get(row["model_b"], 0) + 1 + pref_col = "pref" if use_soft else "pref_hard" bootstrap_ratings: list[dict[str, float]] = [] for _ in range(n_bootstraps): df_sample = df_results.sample( n=len(df_results), replace=True, random_state=int(rng.integers(0, 2**31)) ) - ratings = compute_bradley_terry( - df_sample, winner_col="winner", baseline_model=args.baseline_model + ratings = fit_bradley_terry( + df_sample, pref_col=pref_col, baseline_model=args.baseline_model ) bootstrap_ratings.append(ratings) @@ -414,11 +575,26 @@ def run_judge() -> pd.DataFrame: suffix = " <-----" if m == model_name else "" count = battle_counts.get(m, 0) print(f" {m} ({count}){suffix}: {np.mean(vals):.1f} ± {np.std(vals):.1f}") + + # MAE vs human-only ELO for overlapping arena models + overlap = [m for m in all_model_names if m in human_elo and m != model_name] + if overlap: + abs_errors = [abs(mean_ratings[m] - human_elo[m]) for m in overlap] + mae = np.mean(abs_errors) + print(f"\n MAE vs Human-ELO ({len(overlap)} arena models): {mae:.1f}") + else: + mae = np.nan + print("\n No overlapping arena models to compute MAE.") else: print(" Not enough data to compute ELO ratings.") + mae = np.nan return { **summary, "bootstrap_ratings": bootstrap_ratings, + "human_elo": human_elo, + "mae_vs_human": mae, "model_name": model_name, + "method": method_label, + "calibrated_temperature": calibrated_temperature, } diff --git a/judgearena/evaluate.py b/judgearena/evaluate.py index 7eb8599..863e74a 100644 --- a/judgearena/evaluate.py +++ b/judgearena/evaluate.py @@ -8,6 +8,7 @@ import pandas as pd from langchain_core.language_models.llms import LLM from langchain_core.prompts import ChatPromptTemplate +from scipy.optimize import minimize_scalar from judgearena.instruction_dataset import load_instructions from judgearena.instruction_dataset.arena_hard import ( @@ -29,9 +30,9 @@ class PairScore: - def __init__(self): + def __init__(self, temperature: float = 0.3): super(PairScore).__init__() - self.temperature = 0.3 + self.temperature = temperature def preference_from_scores(self, score_a: float, score_b: float) -> float: return 1 - np.exp(self.temperature * score_a) / ( @@ -59,6 +60,58 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1): return float(m.group(group_index).strip(" ")) +def calibrate_temperature( + delta_s: np.ndarray, + y: np.ndarray, + bounds: tuple[float, float] = (-10.0, 10.0), +) -> float: + """Find the MLE temperature T* for the model P(A>B) = σ(T·Δs). + + The log-likelihood is: + + L(T) = Σ_i [ y_i·log σ(T·Δs_i) + (1−y_i)·log σ(−T·Δs_i) ] + = Σ_i log σ(T · (2y_i − 1) · Δs_i) + + This is concave in T (single global maximum) so ``minimize_scalar`` with + the 'bounded' method is guaranteed to converge. + + Args: + delta_s: Score differences ``s_A − s_B`` for each battle, shape (N,). + y: Observed hard labels (1 = A was preferred, 0 = B was preferred, + 0.5 = tie). Ties contribute zero gradient and are skipped. + bounds: Search interval for T (default −10 to +10). + + Returns: + The calibrated temperature T*. + """ + delta_s = np.asarray(delta_s, dtype=float) + y = np.asarray(y, dtype=float) + + # Skip ties (y == 0.5) — they carry no directional information. 
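+    # Example with made-up numbers: delta_s = [2, -1, 3] and y = [1, 0, 1] agree in
+    # sign on every battle, so z > 0 throughout, the likelihood increases with T and
+    # the bounded search returns (close to) the upper bound; mixed-sign z values
+    # yield a finite interior optimum instead.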
+ non_tie = y != 0.5 + delta_s = delta_s[non_tie] + y = y[non_tie] + + if len(delta_s) == 0: + raise ValueError( + "No non-tie observations available for temperature calibration." + ) + + # z_i = (2y_i − 1) · Δs_i (positive when the score difference agrees with the outcome) + z = (2 * y - 1) * delta_s + + def neg_log_likelihood(T: float) -> float: + # log σ(T·z) = −log(1 + exp(−T·z)) = −logaddexp(0, −T·z) + return float(np.sum(np.logaddexp(0.0, -T * z))) + + result = minimize_scalar( + neg_log_likelihood, + bounds=bounds, + method="bounded", + ) + return float(result.x) + + _COMPLETION_LABEL_SINGLE = "Answer" _COMPLETION_LABEL_MULTI_TURN = "Conversation with User" _EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement" @@ -366,6 +419,7 @@ def judge_and_parse_prefs( user_prompt_template: str | None = None, truncate_input_chars: int = 8192, use_tqdm: bool = False, + score_parser: "PairScore | None" = None, ) -> tuple[list[JudgeAnnotation], list[JudgeAnnotation] | None, pd.Series]: """Run judge annotation and parse preferences, handling swap_mode='both'. @@ -413,7 +467,8 @@ def judge_and_parse_prefs( def _none_to_nan(x): return float("nan") if x is None else x - score_parser = PairScore() + if score_parser is None: + score_parser = PairScore() prefs = pd.Series( [score_parser.parse_model_raw(a.judge_completion) for a in annotations] ) diff --git a/tests/test_estimate_elo_ratings.py b/tests/test_estimate_elo_ratings.py index 83f9c8a..e055fdc 100644 --- a/tests/test_estimate_elo_ratings.py +++ b/tests/test_estimate_elo_ratings.py @@ -5,7 +5,12 @@ import pytest import judgearena.estimate_elo_ratings as estimate_elo_ratings -from judgearena.estimate_elo_ratings import CliEloArgs, compute_bradley_terry, main +from judgearena.estimate_elo_ratings import ( + CliEloArgs, + _winner_to_pref, + fit_bradley_terry, + main, +) from judgearena.evaluate import JudgeAnnotation, judge_and_parse_prefs from judgearena.utils import make_model @@ -89,7 +94,13 @@ def _default_args(**kwargs) -> CliEloArgs: return CliEloArgs(**defaults) -# --- compute_bradley_terry unit tests --- +# --- fit_bradley_terry unit tests --- + + +def _records_with_pref(records: list[dict]) -> pd.DataFrame: + df = pd.DataFrame(records) + df["pref"] = df["winner"].map(_winner_to_pref) + return df def test_bradley_terry_clear_winner(): @@ -97,23 +108,22 @@ def test_bradley_terry_clear_winner(): records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 10 + [ {"model_a": "B", "model_b": "A", "winner": "model_b"} ] * 10 - ratings = compute_bradley_terry(pd.DataFrame(records), winner_col="winner") + ratings = fit_bradley_terry(_records_with_pref(records)) assert ratings["A"] > ratings["B"] def test_bradley_terry_all_ties(): """All ties → ratings should be equal.""" records = [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 20 - ratings = compute_bradley_terry(pd.DataFrame(records), winner_col="winner") + ratings = fit_bradley_terry(_records_with_pref(records)) assert abs(ratings["A"] - ratings["B"]) < 1.0 def test_bradley_terry_baseline(): """Baseline model is anchored at baseline_rating.""" records = [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 10 - ratings = compute_bradley_terry( - pd.DataFrame(records), - winner_col="winner", + ratings = fit_bradley_terry( + _records_with_pref(records), baseline_model="B", baseline_rating=1000, ) @@ -121,6 +131,22 @@ def test_bradley_terry_baseline(): assert ratings["A"] > 1000.0 +def test_bradley_terry_soft_matches_hard(): + """Soft prefs ∈ {0, 0.5, 1} must 
give the same fit as hard winner labels."""
+    records = (
+        [{"model_a": "A", "model_b": "B", "winner": "model_a"}] * 7
+        + [{"model_a": "A", "model_b": "B", "winner": "model_b"}] * 3
+        + [{"model_a": "A", "model_b": "B", "winner": "tie"}] * 2
+    )
+    df = _records_with_pref(records)
+    hard = fit_bradley_terry(df, pref_col="pref")
+    # Refitting on a float copy of the same prefs (quantised values, continuous
+    # dtype) must reproduce the hard-label fit.
+    df["pref_soft"] = df["pref"].astype(float)
+    soft = fit_bradley_terry(df, pref_col="pref_soft")
+    assert hard["A"] == pytest.approx(soft["A"], abs=1e-3)
+    assert hard["B"] == pytest.approx(soft["B"], abs=1e-3)
+
+
 # --- main() integration tests ---
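+
+
+# Sketch of an extra unit check (assumes synthetic data is acceptable here): when
+# labels are sampled from the calibration model itself, P(A wins) = σ(T_true · Δs),
+# calibrate_temperature should recover a value close to T_true. numpy is imported
+# locally in case the module header does not already import it.
+def test_calibrate_temperature_recovers_known_value():
+    import numpy as np
+
+    from judgearena.evaluate import calibrate_temperature
+
+    rng = np.random.default_rng(0)
+    t_true = 0.5
+    delta_s = rng.normal(0.0, 3.0, size=2000)
+    p_a_wins = 1.0 / (1.0 + np.exp(-t_true * delta_s))
+    y = (rng.random(len(delta_s)) < p_a_wins).astype(float)  # 1 = A preferred, 0 = B
+    t_star = calibrate_temperature(delta_s, y)
+    assert t_star == pytest.approx(t_true, abs=0.1)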