diff --git a/README.md b/README.md index 508ac9f..ffa7683 100644 --- a/README.md +++ b/README.md @@ -44,8 +44,8 @@ uv sync --extra llamacpp # Optional: install LlamaCpp support Compare two models head-to-head: ```bash -python judgearena/generate_and_evaluate.py \ - --dataset alpaca-eval \ +judgearena \ + --task alpaca-eval \ --model_A gpt4_1106_preview \ --model_B VLLM/utter-project/EuroLLM-9B \ --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ @@ -53,7 +53,7 @@ python judgearena/generate_and_evaluate.py \ ``` **What happens here?** -- Use completions available for `gpt4_1106_preview` in Alpaca-Eval dataset +- Use completions available for `gpt4_1106_preview` in Alpaca-Eval task - Generates completions for `model_B` if not already cached on `vLLM` - Compares two models using `deepseek-chat-v3.1` which the cheapest option available on `OpenRouter` @@ -62,7 +62,7 @@ It will then display the results of the battles: ```bash ============================================================ 🏆 MODEL BATTLE RESULTS 🏆 -📊 Dataset: alpaca-eval +📊 Task: alpaca-eval 🤖 Competitors: Model A: gpt4_1106_preview vs Model B: VLLM/utter-project/EuroLLM-9B ⚖️ Judge: OpenRouter/deepseek/deepseek-chat-v3.1 📈 Results Summary: @@ -90,8 +90,8 @@ JudgeArena lets you forward these options directly to the underlying engine via For instance, to run vLLM with tensor parallelism across multiple GPUs: ```bash -python judgearena/generate_and_evaluate.py \ - --dataset alpaca-eval \ +judgearena \ + --task alpaca-eval \ --model_A VLLM/Qwen/Qwen2.5-0.5B-Instruct \ --model_B VLLM/Qwen/Qwen2.5-1.5B-Instruct \ --judge_model VLLM/Qwen/Qwen3.5-27B-FP8 \ @@ -118,8 +118,8 @@ OpenRouter/deepseek/deepseek-chat-v3.1 For instance, to run everything locally with vLLM: ```bash -python judgearena/generate_and_evaluate.py \ - --dataset alpaca-eval \ +judgearena \ + --task alpaca-eval \ --model_A VLLM/Qwen/Qwen2.5-0.5B-Instruct \ --model_B VLLM/Qwen/Qwen2.5-1.5B-Instruct \ --judge_model 
VLLM/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8 \ @@ -149,8 +149,8 @@ For absolute paths, this results in a double slash (e.g., `LlamaCpp//home/user/m **Mixed example** — local LlamaCpp model with a remote judge: ```bash -uv run python judgearena/generate_and_evaluate.py \ - --dataset alpaca-eval \ +uv run judgearena \ + --task alpaca-eval \ --model_A LlamaCpp/./models/qwen2.5-0.5b-instruct-q8_0.gguf \ --model_B OpenRouter/qwen/qwen-2.5-7b-instruct \ --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ @@ -160,8 +160,8 @@ uv run python judgearena/generate_and_evaluate.py \ **Fully local example** — no API keys required (useful for verifying your setup): ```bash -uv run python judgearena/generate_and_evaluate.py \ - --dataset alpaca-eval \ +uv run judgearena \ + --task alpaca-eval \ --model_A LlamaCpp/./models/qwen2.5-0.5b-instruct-q8_0.gguf \ --model_B LlamaCpp/./models/qwen2.5-1.5b-instruct-q8_0.gguf \ --judge_model LlamaCpp/./models/qwen2.5-1.5b-instruct-q8_0.gguf \ @@ -181,8 +181,8 @@ When using vLLM, JudgeArena automatically picks the right inference method based If you need to force a specific chat template (for example, a base model that you know works with ChatML), pass it via `--chat_template`: ```bash -python judgearena/generate_and_evaluate.py \ - --dataset alpaca-eval \ +judgearena \ + --task alpaca-eval \ --model_A VLLM/swiss-ai/Apertus-8B-2509 \ --model_B VLLM/swiss-ai/Apertus-8B-Instruct-2509 \ --judge_model VLLM/Qwen/Qwen2.5-32B-Instruct-GPTQ-Int8 \ @@ -191,9 +191,13 @@ python judgearena/generate_and_evaluate.py \ This override applies to all vLLM models in the run. For remote providers (OpenAI, Together, OpenRouter), the flag is ignored since they handle templates server-side. -## 📊 Supported Datasets +## 📊 Supported Tasks -| Dataset | Description | +Task names follow [LMHarness](https://github.com/EleutherAI/lm-evaluation-harness) conventions. 
Generate+judge tasks produce pairwise preferences between two models; ELO tasks (`elo-*`) estimate a single model's ELO rating against human-annotated arena opponents. + +### Generate + judge (pairwise) + +| Task | Description | |-----------------------|------------------------------------------------------------------------------------------------| | `alpaca-eval` | General instruction-following benchmark | | `arena-hard-v2.0` | Arena-Hard v2.0 from official `lmarena-ai/arena-hard-auto` source | @@ -201,33 +205,35 @@ This override applies to all vLLM models in the run. For remote providers (OpenA | `m-arena-hard` | Translated version of Arena-Hard in 23 languages | | `m-arena-hard-{lang}` | Language-specific variants (e.g., `ar`, `cs`, `de`) | | `m-arena-hard-EU` | All EU languages combined | +| `mt-bench` | Multi-turn benchmark with FastChat-compatible pairwise judging | | `fluency-{lang}` | Fluency evaluation for pretrained models (`finnish`, `french`, `german`, `spanish`, `swedish`) | -For Arena-Hard, JudgeArena resolves baseline metadata by dataset version: +For Arena-Hard, JudgeArena resolves baseline metadata by task version: - `arena-hard-v0.1`: `gpt-4-0314` - `arena-hard-v2.0`: `o3-mini-2025-01-31` (standard prompts) +### ELO rating + +| Task | Description | +|---------------------|--------------------------------------------------------------------| +| `elo-lmarena-100k` | Battles sampled from `lmarena-ai/arena-human-preference-100k` | +| `elo-lmarena-140k` | Battles sampled from `lmarena-ai/arena-human-preference-140k` | +| `elo-lmarena` | Union of all `LMArena-*` variants | +| `elo-comparia` | Battles sampled from the ComparIA arena | + ## 📈 Estimating ELO Ratings JudgeArena can estimate the ELO rating of a model by running it against opponents sampled from a human preference arena (`LMArena-100k`, `LMArena-140k`, or `ComparIA`). 
The LLM judge scores each battle, and the resulting ratings are computed using the Bradley-Terry model anchored against the human-annotated arena leaderboard. -### Quick start +Pass an `elo-*` value to `--task` to trigger the ELO flow. ELO tasks take a single `--model_A` whose opponents are sampled from the arena (matching the pairwise CLI shape; `--model_B` is reserved for a future extension). -```bash -judgearena-elo \ - --arena ComparIA \ - --model Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \ - --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ - --n_instructions 200 -``` - -Alternatively, if running directly from the repository without installing: +### Quick start ```bash -uv run python judgearena/estimate_elo_ratings.py \ - --arena ComparIA \ - --model Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \ +judgearena \ + --task elo-comparia \ + --model_A Together/meta-llama/Llama-3.3-70B-Instruct-Turbo \ --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ --n_instructions 200 ``` @@ -236,9 +242,9 @@ uv run python judgearena/estimate_elo_ratings.py \ | Flag | Default | Description | |---|---|---| -| `--arena` | `ComparIA` | Arena to sample opponents from: `LMArena-100k`, `LMArena-140k`, or `ComparIA` | -| `--model` | *(required)* | Model under evaluation (same format as `judgearena`) | -| `--judge_model` | *(required)* | LLM judge (same format as `judgearena`) | +| `--task elo-*` | *(required)* | Arena to sample opponents from: `elo-lmarena-100k`, `elo-lmarena-140k`, `elo-lmarena`, or `elo-comparia` | +| `--model_A` | *(required)* | Model under evaluation (same format as pairwise tasks) | +| `--judge_model` | *(required)* | LLM judge (same format as pairwise tasks) | | `--n_instructions` | all | Number of arena battles to use for evaluation | | `--n_instructions_per_language` | all | Cap battles per language (useful for balanced multilingual eval) | | `--languages` | all | Restrict to specific language codes, e.g. 
`en fr de` | diff --git a/judgearena/cli.py b/judgearena/cli.py new file mode 100644 index 0000000..eb94c83 --- /dev/null +++ b/judgearena/cli.py @@ -0,0 +1,261 @@ +"""Unified CLI entrypoint for judgearena. + +Dispatches to ``generate_and_evaluate.main`` or ``estimate_elo_ratings.main`` +based on the value of ``--task``. Task names prefixed with ``elo-`` run the +ELO rating flow; anything else runs the generate-and-judge flow. +""" + +from __future__ import annotations + +import argparse +import warnings + +from judgearena.cli_common import ( + add_common_arguments, + parse_engine_kwargs, + resolve_verbosity, +) +from judgearena.estimate_elo_ratings import CliEloArgs +from judgearena.estimate_elo_ratings import main as main_elo +from judgearena.generate_and_evaluate import CliArgs +from judgearena.generate_and_evaluate import main as main_generate_and_evaluate +from judgearena.log import configure_logging, get_logger + +logger = get_logger(__name__) + +ELO_TASK_PREFIX = "elo-" + +# Lowercase CLI task name -> canonical arena identifier used inside +# ``judgearena.arenas_utils.KNOWN_ARENAS`` and the ``benchmark`` column of +# saved battle dataframes. The CLI stays lowercase (matching ``alpaca-eval`` +# conventions) while internal identifiers keep their original casing. +ELO_TASK_TO_ARENA: dict[str, str] = { + "elo-lmarena-100k": "LMArena-100k", + "elo-lmarena-140k": "LMArena-140k", + "elo-lmarena": "LMArena", + "elo-comparia": "ComparIA", +} + + +def _build_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + prog="judgearena", + description=( + "Run a judge-based evaluation. Use `--task ` for generate+judge " + "benchmarks (e.g. alpaca-eval, arena-hard-v2.0, mt-bench) or " + "`--task elo-` for ELO rating (e.g. elo-lmarena-140k, elo-comparia)." + ), + ) + parser.add_argument( + "--task", + help=( + "Task to run. 
Generate+judge tasks: `alpaca-eval`, `arena-hard-v0.1`, " + "`arena-hard-v2.0`, `m-arena-hard`, `m-arena-hard-{lang}`, `m-arena-hard-EU`, " + "`mt-bench`, `fluency-{lang}`. ELO tasks: `elo-lmarena-100k`, `elo-lmarena-140k`, " + "`elo-lmarena`, `elo-comparia`." + ), + ) + parser.add_argument( + "--dataset", + help="[DEPRECATED] Use `--task` instead.", + ) + parser.add_argument( + "--arena", + help="[DEPRECATED] Use `--task elo-` instead.", + ) + parser.add_argument( + "--model_A", + help=( + "Model under evaluation. For pairwise tasks, this is Model A (paired with " + "--model_B). For elo tasks, this is the single model rated against arena opponents." + ), + ) + parser.add_argument( + "--model_B", + help="Model B for generate+judge tasks (not yet supported for elo tasks).", + ) + parser.add_argument( + "--model", + help="[DEPRECATED] Use `--model_A` instead.", + ) + parser.add_argument( + "--use_tqdm", + action="store_true", + help="[generate+judge] Use tqdm (not compatible with vLLM).", + ) + parser.add_argument( + "--languages", + nargs="+", + default=None, + help="[elo] Language codes to evaluate, e.g. 
`en fr de`.", + ) + parser.add_argument( + "--n_instructions_per_language", + type=int, + default=None, + help="[elo] Cap battles per language.", + ) + parser.add_argument( + "--n_bootstraps", + type=int, + default=20, + help="[elo] Bootstrap samples for ELO confidence intervals.", + ) + parser.add_argument( + "--seed", + type=int, + default=0, + help="[elo] Random seed for reproducibility.", + ) + parser.add_argument( + "--baseline_model", + type=str, + default=None, + help="[elo] Model anchored at 1000 ELO (ratings are reported relative to it).", + ) + add_common_arguments(parser) + return parser + + +def _resolve_task(args: argparse.Namespace) -> str: + """Return the task value from --task, or a deprecated --dataset / --arena.""" + set_flags = [ + name + for name, value in ( + ("--task", args.task), + ("--dataset", args.dataset), + ("--arena", args.arena), + ) + if value is not None + ] + if len(set_flags) > 1: + raise SystemExit( + f"Specify exactly one of --task/--dataset/--arena, got {set_flags}." + ) + if not set_flags: + raise SystemExit("One of --task/--dataset/--arena is required.") + + if args.task is not None: + return args.task + if args.dataset is not None: + warnings.warn( + "--dataset is deprecated; use --task instead.", + DeprecationWarning, + stacklevel=2, + ) + return args.dataset + # --arena was historically case-sensitive (e.g. "LMArena-140k"). Lowercase it + # here so the deprecated path lands on a valid ELO_TASK_TO_ARENA key without + # asking users to relearn the arena names. 
+ lower_arena = args.arena.lower() + warnings.warn( + f"--arena is deprecated; use --task {ELO_TASK_PREFIX}{lower_arena} instead.", + DeprecationWarning, + stacklevel=2, + ) + return f"{ELO_TASK_PREFIX}{lower_arena}" + + +def _resolve_model_a(args: argparse.Namespace) -> str | None: + """Collapse the deprecated --model flag into --model_A.""" + if args.model is not None and args.model_A is not None: + raise SystemExit( + "Specify exactly one of --model_A/--model; --model is a deprecated alias." + ) + if args.model is not None: + warnings.warn( + "--model is deprecated; use --model_A instead.", + DeprecationWarning, + stacklevel=2, + ) + return args.model + return args.model_A + + +def _build_elo_args( + args: argparse.Namespace, arena: str, model_a: str | None +) -> CliEloArgs: + if model_a is None: + raise SystemExit( + "--model_A is required for elo tasks (use `--task elo- --model_A `)." + ) + if args.model_B is not None: + raise SystemExit( + "--model_B is not yet supported for elo tasks; only --model_A is used." 
+ ) + return CliEloArgs( + arena=arena, + model=model_a, + n_instructions_per_language=args.n_instructions_per_language, + languages=args.languages, + n_bootstraps=args.n_bootstraps, + seed=args.seed, + baseline_model=args.baseline_model, + judge_model=args.judge_model, + n_instructions=args.n_instructions, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + ignore_cache=args.ignore_cache, + truncate_all_input_chars=args.truncate_all_input_chars, + max_out_tokens_models=args.max_out_tokens_models, + max_out_tokens_judge=args.max_out_tokens_judge, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + result_folder=args.result_folder, + engine_kwargs=parse_engine_kwargs(args.engine_kwargs), + verbosity=resolve_verbosity(args), + log_file=args.log_file, + no_log_file=args.no_log_file, + ) + + +def _build_generate_and_evaluate_args( + args: argparse.Namespace, task: str, model_a: str | None +) -> CliArgs: + if model_a is None or args.model_B is None: + raise SystemExit(f"--model_A and --model_B are required for task {task!r}.") + return CliArgs( + task=task, + model_A=model_a, + model_B=args.model_B, + use_tqdm=args.use_tqdm, + judge_model=args.judge_model, + n_instructions=args.n_instructions, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + ignore_cache=args.ignore_cache, + truncate_all_input_chars=args.truncate_all_input_chars, + max_out_tokens_models=args.max_out_tokens_models, + max_out_tokens_judge=args.max_out_tokens_judge, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + result_folder=args.result_folder, + engine_kwargs=parse_engine_kwargs(args.engine_kwargs), + verbosity=resolve_verbosity(args), + log_file=args.log_file, + no_log_file=args.no_log_file, + ) + + +def cli(argv: list[str] | None = None) -> None: + parser = _build_parser() + args = parser.parse_args(argv) + configure_logging(resolve_verbosity(args), log_file=args.log_file) + task = 
_resolve_task(args) + model_a = _resolve_model_a(args) + if task.startswith(ELO_TASK_PREFIX): + if task not in ELO_TASK_TO_ARENA: + raise SystemExit( + f"Unknown elo task {task!r}; expected one of {list(ELO_TASK_TO_ARENA)}." + ) + elo_args = _build_elo_args(args, arena=ELO_TASK_TO_ARENA[task], model_a=model_a) + logger.debug("Running with CLI args: %s", elo_args.__dict__) + main_elo(elo_args) + else: + ge_args = _build_generate_and_evaluate_args(args, task=task, model_a=model_a) + logger.debug("Running with CLI args: %s", ge_args.__dict__) + main_generate_and_evaluate(ge_args) + + +if __name__ == "__main__": + cli() diff --git a/judgearena/estimate_elo_ratings.py b/judgearena/estimate_elo_ratings.py index 1ff726e..51ba6e2 100644 --- a/judgearena/estimate_elo_ratings.py +++ b/judgearena/estimate_elo_ratings.py @@ -1,4 +1,3 @@ -import argparse import hashlib from dataclasses import dataclass from functools import partial @@ -8,15 +7,10 @@ from sklearn.linear_model import LogisticRegression from judgearena.arenas_utils import _extract_instruction_text, load_arena_dataframe -from judgearena.cli_common import ( - BaseCliArgs, - add_common_arguments, - parse_engine_kwargs, - resolve_verbosity, -) +from judgearena.cli_common import BaseCliArgs from judgearena.evaluate import judge_and_parse_prefs from judgearena.generate import generate_instructions -from judgearena.log import configure_logging, get_logger +from judgearena.log import get_logger from judgearena.utils import cache_function_dataframe, compute_pref_summary, make_model logger = get_logger(__name__) @@ -40,85 +34,6 @@ class CliEloArgs(BaseCliArgs): seed: int = 0 baseline_model: str | None = None - @classmethod - def parse_args(cls): - parser = argparse.ArgumentParser( - prog="Estimate ELO rating for a model on an Arena (LMArena-100k, LMArena-140k, or ComparIA) with LLM judges", - ) - parser.add_argument( - "--arena", - help="The arena to use. Battles are sampled from this Arena. 
If not passed use concatenation from all Arena. " - "Passing LMArena leads to loading the union of `LMArena-100k` and `LMArena-140k`", - choices=["LMArena-100k", "LMArena-140k", "ComparIA", "LMArena"], - required=False, - ) - parser.add_argument( - "--model", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--languages", - nargs="+", - default=None, - help='List of language codes to evaluate, e.g. "en fr de" (default: all languages)', - ) - parser.add_argument( - "--n_instructions_per_language", - type=int, - required=False, - help="Maximum number of instructions to keep per language.", - ) - parser.add_argument( - "--n_bootstraps", - type=int, - required=False, - default=20, - help="Number of bootstrap samples for ELO confidence intervals. Default is 20.", - ) - parser.add_argument( - "--seed", - type=int, - required=False, - default=0, - help="Random seed for reproducibility. Default is 0.", - ) - parser.add_argument( - "--baseline_model", - type=str, - required=False, - default=None, - help="Model name to anchor at 1000 ELO. All other ratings are expressed relative to this model. " - "Must be one of the models present in the arena battles. 
If not set, ratings are not anchored.", - ) - add_common_arguments(parser) - args = parser.parse_args() - - return cls( - arena=args.arena, - model=args.model, - n_instructions_per_language=args.n_instructions_per_language, - languages=args.languages, - n_bootstraps=args.n_bootstraps, - seed=args.seed, - baseline_model=args.baseline_model, - judge_model=args.judge_model, - n_instructions=args.n_instructions, - provide_explanation=args.provide_explanation, - swap_mode=args.swap_mode, - ignore_cache=args.ignore_cache, - truncate_all_input_chars=args.truncate_all_input_chars, - max_out_tokens_models=args.max_out_tokens_models, - max_out_tokens_judge=args.max_out_tokens_judge, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - result_folder=args.result_folder, - engine_kwargs=parse_engine_kwargs(args.engine_kwargs), - verbosity=resolve_verbosity(args), - log_file=args.log_file, - no_log_file=args.no_log_file, - ) - def compute_bradley_terry( df: pd.DataFrame, @@ -232,10 +147,7 @@ def compute_bradley_terry( return dict(pd.Series(elo_scores, index=models.index)) -def main(args: CliEloArgs | None = None) -> dict: - if args is None: - args = CliEloArgs.parse_args() - +def main(args: CliEloArgs) -> dict: rng = np.random.default_rng(args.seed) # Step 1: Load arena battles @@ -510,13 +422,3 @@ def run_judge() -> pd.DataFrame: "bootstrap_ratings": bootstrap_ratings, "model_name": model_name, } - - -def cli(): - args = CliEloArgs.parse_args() - configure_logging(args.verbosity, log_file=args.log_file) - main(args) - - -if __name__ == "__main__": - cli() diff --git a/judgearena/eval_utils.py b/judgearena/eval_utils.py index a083569..93a422c 100644 --- a/judgearena/eval_utils.py +++ b/judgearena/eval_utils.py @@ -17,7 +17,7 @@ def print_results(results): """Print battle results in a readable format.""" print("\n" + "=" * 60) print("🏆 MODEL BATTLE RESULTS 🏆".center(60)) - print(f"📊 Dataset: {results['dataset']}") + print(f"📊 Task: {results['task']}") print( 
f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}" ) diff --git a/judgearena/generate_and_evaluate.py b/judgearena/generate_and_evaluate.py index 7c70253..2919280 100644 --- a/judgearena/generate_and_evaluate.py +++ b/judgearena/generate_and_evaluate.py @@ -1,9 +1,8 @@ """ -This script generates completions for a given dataset and model, +This script generates completions for a given task (dataset) and model, and then evaluates them using a judge model. """ -import argparse import json from dataclasses import asdict, dataclass from datetime import UTC, datetime @@ -12,12 +11,7 @@ import pandas as pd -from judgearena.cli_common import ( - BaseCliArgs, - add_common_arguments, - parse_engine_kwargs, - resolve_verbosity, -) +from judgearena.cli_common import BaseCliArgs from judgearena.evaluate import judge_and_parse_prefs, resolve_judge_prompts from judgearena.generate import generate_base, generate_instructions from judgearena.instruction_dataset import load_instructions @@ -27,7 +21,6 @@ ) from judgearena.log import ( attach_file_handler, - configure_logging, get_logger, make_run_log_path, ) @@ -91,62 +84,11 @@ def try_load_dataset_completions( class CliArgs(BaseCliArgs): """CLI arguments for the generate-and-evaluate entrypoint.""" - dataset: str | None = None + task: str | None = None model_A: str | None = None model_B: str | None = None use_tqdm: bool = False - @classmethod - def parse_args(cls): - parser = argparse.ArgumentParser( - prog="Generate completion and evaluate with a judge", - ) - parser.add_argument( - "--dataset", - help="The dataset to use. 
For instance `alpaca-eval`, `arena-hard-v2.0`, " - "`arena-hard-v0.1`, `m-arena-hard-EU` for instruction " - "tuning cases or `french-contexts`, `spanish-contexts` for base models.", - ) - parser.add_argument( - "--model_A", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--model_B", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--use_tqdm", - action="store_true", - help="If specified, use tqdm, does not work with all model providers, vLLM in particular.", - ) - add_common_arguments(parser) - args = parser.parse_args() - - return cls( - dataset=args.dataset, - model_A=args.model_A, - model_B=args.model_B, - use_tqdm=args.use_tqdm, - judge_model=args.judge_model, - n_instructions=args.n_instructions, - provide_explanation=args.provide_explanation, - swap_mode=args.swap_mode, - ignore_cache=args.ignore_cache, - truncate_all_input_chars=args.truncate_all_input_chars, - max_out_tokens_models=args.max_out_tokens_models, - max_out_tokens_judge=args.max_out_tokens_judge, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - result_folder=args.result_folder, - engine_kwargs=parse_engine_kwargs(args.engine_kwargs), - verbosity=resolve_verbosity(args), - log_file=args.log_file, - no_log_file=args.no_log_file, - ) - def load_contexts(dataset: str) -> pd.Series: path = data_root / "contexts" / dataset @@ -158,7 +100,7 @@ def print_results(results): print("\n" + "=" * 60) print("🏆 MODEL BATTLE RESULTS 🏆".center(60)) - print(f"📊 Dataset: {results['dataset']}") + print(f"📊 Task: {results['task']}") print( f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}" ) @@ -175,7 +117,7 @@ def print_results(results): def main(args: CliArgs): """ 1) take as input: - * dataset, make sure instruct-completion works + * task (dataset), make sure 
instruct-completion works * model to generate output from * llm used for judge * number of annotations @@ -188,7 +130,7 @@ def main(args: CliArgs): # Build the result folder early so the file handler captures the entire run. # Include a timestamp so each run gets its own unique directory. - name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name = f"{args.task}-{args.model_A}-{args.model_B}-{args.judge_model}" name += f"-{args.swap_mode}" name = name.replace("/", "_") run_ts = run_started_at.strftime("%Y%m%d_%H%M%S") @@ -198,8 +140,8 @@ def main(args: CliArgs): attach_file_handler(make_run_log_path(res_folder)) logger.info( - "Using dataset %s and evaluating models %s and %s.", - args.dataset, + "Using task %s and evaluating models %s and %s.", + args.task, args.model_A, args.model_B, ) @@ -209,7 +151,7 @@ def main(args: CliArgs): # set_langchain_cache() ignore_cache = args.ignore_cache - if args.dataset == "mt-bench": + if args.task == "mt-bench": return run_mt_bench( args, ignore_cache, @@ -218,15 +160,15 @@ def main(args: CliArgs): ) # Currrently, we run context evaluation - is_fluency_task = "fluency" in args.dataset + is_fluency_task = "fluency" in args.task if is_fluency_task: - # if args.dataset = "fluency-french", we map to "french-contexts.csv" + # if args.task = "fluency-french", we map to "french-contexts.csv" # to match files in https://huggingface.co/datasets/geoalgo/multilingual-contexts-to-be-completed - lang = args.dataset.split("-")[-1] + lang = args.task.split("-")[-1] instructions = load_contexts(f"{lang}-contexts.csv") else: instructions = load_instructions( - dataset=args.dataset, n_instructions=args.n_instructions + dataset=args.task, n_instructions=args.n_instructions ).loc[:, "instruction"] n_instructions = args.n_instructions if args.n_instructions else len(instructions) @@ -234,9 +176,9 @@ def main(args: CliArgs): instructions = instructions[:n_instructions] logger.info( - "Generating completions for dataset %s 
with model %s and %s " + "Generating completions for task %s with model %s and %s " "(or loading them directly if present)", - args.dataset, + args.task, args.model_A, args.model_B, ) @@ -264,7 +206,7 @@ def main(args: CliArgs): ) ) dataset_completions_A = try_load_dataset_completions( - args.dataset, args.model_A, n_instructions + args.task, args.model_A, n_instructions ) if dataset_completions_A is not None: completions_A = dataset_completions_A.set_index("instruction_index").loc[ @@ -278,12 +220,12 @@ def main(args: CliArgs): use_tqdm=args.use_tqdm, ), ignore_cache=ignore_cache, - cache_name=f"{args.dataset}_{args.model_A}_{args.n_instructions}", + cache_name=f"{args.task}_{args.model_A}_{args.n_instructions}", ).set_index("instruction_index") completions_A = completions_A.loc[:, "completion"] dataset_completions_B = try_load_dataset_completions( - args.dataset, args.model_B, n_instructions + args.task, args.model_B, n_instructions ) if dataset_completions_B is not None: completions_B = dataset_completions_B.set_index("instruction_index").loc[ @@ -297,7 +239,7 @@ def main(args: CliArgs): use_tqdm=args.use_tqdm, ), ignore_cache=ignore_cache, - cache_name=f"{args.dataset}_{args.model_B}_{args.n_instructions}", + cache_name=f"{args.task}_{args.model_B}_{args.n_instructions}", ).set_index("instruction_index") completions_B = completions_B.loc[:, "completion"] logger.debug("First instruction/context: %s", instructions.values[0]) @@ -370,7 +312,7 @@ def main(args: CliArgs): summary = compute_pref_summary(prefs) results = { - "dataset": args.dataset, + "task": args.task, "model_A": args.model_A, "model_B": args.model_B, "judge_model": args.judge_model, @@ -408,14 +350,3 @@ def main(args: CliArgs): logger.warning("Failed to write run metadata: %s", e) return prefs - - -def cli(): - args = CliArgs.parse_args() - configure_logging(args.verbosity, log_file=args.log_file) - logger.debug("Running with CLI args: %s", args.__dict__) - main(args) - - -if __name__ == "__main__": 
- cli() diff --git a/judgearena/mt_bench/mt_bench_utils.py b/judgearena/mt_bench/mt_bench_utils.py index b3149f0..b28f859 100644 --- a/judgearena/mt_bench/mt_bench_utils.py +++ b/judgearena/mt_bench/mt_bench_utils.py @@ -66,7 +66,7 @@ def _run_generation(model_name: str) -> pd.DataFrame: def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str: - name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name = f"{args.task}-{args.model_A}-{args.model_B}-{args.judge_model}" name += f"-{args.swap_mode}" if suffix: name += f"-{suffix}" @@ -118,7 +118,7 @@ def _run_mt_bench_fastchat( stats = compute_pref_summary(prefs) results = { - "dataset": args.dataset, + "task": args.task, "model_A": args.model_A, "model_B": args.model_B, "judge_model": args.judge_model, diff --git a/judgearena/utils.py b/judgearena/utils.py index 573fd76..993ef01 100644 --- a/judgearena/utils.py +++ b/judgearena/utils.py @@ -408,6 +408,13 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs): model_provider = model.split("/")[0] + # vLLM-engine-only kwargs must not leak to remote-API providers + # (OpenRouter, OpenAI, Together): langchain-openai forwards unknown + # kwargs via model_kwargs into chat.completions.create, which rejects them. 
+ if model_provider != "VLLM": + engine_kwargs.pop("max_model_len", None) + engine_kwargs.pop("chat_template", None) + if model_provider == "Dummy": return DummyModel(model) diff --git a/pyproject.toml b/pyproject.toml index 7f90892..2d421a9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,8 +3,7 @@ requires = ["setuptools>=61.0", "wheel"] build-backend = "setuptools.build_meta" [project.scripts] -judgearena = "judgearena.generate_and_evaluate:cli" -judgearena-elo = "judgearena.estimate_elo_ratings:cli" +judgearena = "judgearena.cli:cli" [project] name = "judgearena" diff --git a/scripts/multilingual_arena_hard/README_translated_AH.md b/scripts/multilingual_arena_hard/README_translated_AH.md index 78c44f9..8e7dd7f 100644 --- a/scripts/multilingual_arena_hard/README_translated_AH.md +++ b/scripts/multilingual_arena_hard/README_translated_AH.md @@ -45,8 +45,8 @@ for example in dataset["train"]: Evaluate and compare two models using the JudgeArena framework: ```bash -python judgearena/generate_and_evaluate.py \ - --dataset arena-hard-EU \ +judgearena \ + --task arena-hard-EU \ --model_A gpt4_1106_preview \ --model_B VLLM/utter-project/EuroLLM-9B \ --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..30be4fa --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,331 @@ +"""Tests for the unified `judgearena` CLI dispatcher.""" + +from __future__ import annotations + +import pytest + +from judgearena import cli as cli_module +from judgearena.estimate_elo_ratings import CliEloArgs +from judgearena.generate_and_evaluate import CliArgs + + +@pytest.fixture +def capture_mains(monkeypatch): + """Replace both main functions with spies that record the dispatched args.""" + captured: dict[str, object] = {} + + def fake_main_ge(args: CliArgs) -> None: + captured["module"] = "generate_and_evaluate" + captured["args"] = args + + def fake_main_elo(args: CliEloArgs) -> None: + captured["module"] 
= "elo" + captured["args"] = args + + monkeypatch.setattr(cli_module, "main_generate_and_evaluate", fake_main_ge) + monkeypatch.setattr(cli_module, "main_elo", fake_main_elo) + return captured + + +@pytest.mark.parametrize( + "task", + [ + "alpaca-eval", + "arena-hard-v2.0", + "m-arena-hard-EU", + "fluency-french", + "mt-bench", + ], +) +def test_task_dispatches_to_generate_and_evaluate(capture_mains, task: str): + cli_module.cli( + [ + "--task", + task, + "--model_A", + "Dummy/A", + "--model_B", + "Dummy/B", + "--judge", + "Dummy/J", + ] + ) + assert capture_mains["module"] == "generate_and_evaluate" + ge_args: CliArgs = capture_mains["args"] + assert isinstance(ge_args, CliArgs) + assert ge_args.task == task + assert ge_args.model_A == "Dummy/A" + assert ge_args.model_B == "Dummy/B" + assert ge_args.judge_model == "Dummy/J" + + +@pytest.mark.parametrize( + "task, expected_arena", + [ + ("elo-comparia", "ComparIA"), + ("elo-lmarena-140k", "LMArena-140k"), + ("elo-lmarena-100k", "LMArena-100k"), + ("elo-lmarena", "LMArena"), + ], +) +def test_elo_task_dispatches_to_estimate_elo_ratings( + capture_mains, task: str, expected_arena: str +): + cli_module.cli( + [ + "--task", + task, + "--model_A", + "Dummy/X", + "--judge", + "Dummy/J", + ] + ) + assert capture_mains["module"] == "elo" + elo_args: CliEloArgs = capture_mains["args"] + assert isinstance(elo_args, CliEloArgs) + assert elo_args.arena == expected_arena + assert elo_args.model == "Dummy/X" + assert elo_args.judge_model == "Dummy/J" + + +def test_dataset_flag_is_deprecated_alias(capture_mains): + with pytest.warns(DeprecationWarning, match="--dataset is deprecated"): + cli_module.cli( + [ + "--dataset", + "alpaca-eval", + "--model_A", + "Dummy/A", + "--model_B", + "Dummy/B", + "--judge", + "Dummy/J", + ] + ) + assert capture_mains["module"] == "generate_and_evaluate" + assert capture_mains["args"].task == "alpaca-eval" + + +def test_arena_flag_is_deprecated_alias_lowercases(capture_mains): + """`--arena 
ComparIA` must still route to the ComparIA ELO path. + + The deprecated path is allowed to be case-insensitive because that was the + historical contract for ``judgearena-elo --arena LMArena-140k``. + """ + with pytest.warns(DeprecationWarning, match="--arena is deprecated"): + cli_module.cli( + [ + "--arena", + "ComparIA", + "--model_A", + "Dummy/X", + "--judge", + "Dummy/J", + ] + ) + assert capture_mains["module"] == "elo" + assert capture_mains["args"].arena == "ComparIA" + + +def test_model_flag_is_deprecated_alias(capture_mains): + with pytest.warns(DeprecationWarning, match="--model is deprecated"): + cli_module.cli( + [ + "--task", + "elo-comparia", + "--model", + "Dummy/X", + "--judge", + "Dummy/J", + ] + ) + assert capture_mains["module"] == "elo" + assert capture_mains["args"].model == "Dummy/X" + + +def test_model_and_model_a_collide_raises(capture_mains): + with pytest.raises(SystemExit, match="exactly one of --model_A/--model"): + cli_module.cli( + [ + "--task", + "elo-comparia", + "--model", + "Dummy/X", + "--model_A", + "Dummy/Y", + "--judge", + "Dummy/J", + ] + ) + + +def test_missing_task_raises(capture_mains): + with pytest.raises(SystemExit, match="One of --task"): + cli_module.cli( + [ + "--model_A", + "Dummy/A", + "--model_B", + "Dummy/B", + "--judge", + "Dummy/J", + ] + ) + + +def test_multiple_task_flags_raise(capture_mains): + with pytest.raises(SystemExit, match="exactly one of"): + cli_module.cli( + [ + "--task", + "alpaca-eval", + "--dataset", + "alpaca-eval", + "--model_A", + "Dummy/A", + "--model_B", + "Dummy/B", + "--judge", + "Dummy/J", + ] + ) + + +def test_elo_task_requires_model_a(capture_mains): + with pytest.raises(SystemExit, match="--model_A is required for elo"): + cli_module.cli( + [ + "--task", + "elo-comparia", + "--judge", + "Dummy/J", + ] + ) + + +def test_elo_task_rejects_model_b_for_now(capture_mains): + with pytest.raises(SystemExit, match="not yet supported"): + cli_module.cli( + [ + "--task", + "elo-comparia", + 
"--model_A", + "Dummy/A", + "--model_B", + "Dummy/B", + "--judge", + "Dummy/J", + ] + ) + + +def test_uppercase_elo_task_is_rejected(capture_mains): + with pytest.raises(SystemExit, match="Unknown elo task"): + cli_module.cli( + [ + "--task", + "elo-LMArena-140k", + "--model_A", + "Dummy/X", + "--judge", + "Dummy/J", + ] + ) + + +def test_unknown_elo_task_raises(capture_mains): + with pytest.raises(SystemExit, match="Unknown elo task.*elo-lmarena-140k"): + cli_module.cli( + [ + "--task", + "elo-foo", + "--model_A", + "Dummy/X", + "--judge", + "Dummy/J", + ] + ) + + +def test_generate_and_evaluate_requires_model_a_and_b(capture_mains): + with pytest.raises(SystemExit, match="--model_A and --model_B are required"): + cli_module.cli( + [ + "--task", + "alpaca-eval", + "--model_A", + "Dummy/A", + "--judge", + "Dummy/J", + ] + ) + + +def test_deprecated_model_flag_routes_into_pairwise_task(capture_mains): + """`--model` is a deprecated alias for `--model_A` even on pairwise tasks.""" + with pytest.warns(DeprecationWarning, match="--model is deprecated"): + cli_module.cli( + [ + "--task", + "alpaca-eval", + "--model", + "Dummy/A", + "--model_B", + "Dummy/B", + "--judge", + "Dummy/J", + ] + ) + assert capture_mains["module"] == "generate_and_evaluate" + assert capture_mains["args"].model_A == "Dummy/A" + assert capture_mains["args"].model_B == "Dummy/B" + + +def test_elo_forwards_optional_flags(capture_mains): + cli_module.cli( + [ + "--task", + "elo-lmarena-140k", + "--model_A", + "Dummy/X", + "--judge", + "Dummy/J", + "--languages", + "en", + "fr", + "--n_instructions_per_language", + "50", + "--n_bootstraps", + "5", + "--seed", + "7", + "--baseline_model", + "gpt-4o", + ] + ) + elo_args: CliEloArgs = capture_mains["args"] + assert elo_args.languages == ["en", "fr"] + assert elo_args.n_instructions_per_language == 50 + assert elo_args.n_bootstraps == 5 + assert elo_args.seed == 7 + assert elo_args.baseline_model == "gpt-4o" + + +def 
test_engine_kwargs_parsed_as_json(capture_mains): + cli_module.cli( + [ + "--task", + "alpaca-eval", + "--model_A", + "Dummy/A", + "--model_B", + "Dummy/B", + "--judge", + "Dummy/J", + "--engine_kwargs", + '{"tensor_parallel_size": 4}', + ] + ) + ge_args: CliArgs = capture_mains["args"] + assert ge_args.engine_kwargs == {"tensor_parallel_size": 4} diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index f01e916..03a278f 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -49,7 +49,7 @@ def _run_without_cache(fun, **_kwargs): @pytest.mark.parametrize( - "dataset", + "task", [ "alpaca-eval", "arena-hard-v2.0", @@ -58,10 +58,10 @@ def _run_without_cache(fun, **_kwargs): "m-arena-hard-EU", ], ) -def test_generate_and_evaluate_context_completion(dataset: str, tmp_path): +def test_generate_and_evaluate_context_completion(task: str, tmp_path): prefs = main_generate_and_eval( CliArgs( - dataset=dataset, + task=task, model_A="Dummy/no answer", model_B="Dummy/open is better than close isnt'it", judge_model="Dummy/score A: 0 score B: 10", @@ -84,7 +84,7 @@ def test_generate_and_evaluate_correct_order_bias(tmp_path): """ prefs = main_generate_and_eval( CliArgs( - dataset="alpaca-eval", + task="alpaca-eval", model_A="Dummy/no answer", model_B="Dummy/open is better than close isnt'it", judge_model="Dummy/score A: 0 score B: 10", diff --git a/tests/test_utils.py b/tests/test_utils.py index d2a7411..9ca02f4 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,4 +1,5 @@ import judgearena.utils as utils +from judgearena.utils import make_model def test_download_all_dispatches_arena_hard_versions(monkeypatch, tmp_path): @@ -39,3 +40,27 @@ def test_download_all_dispatches_arena_hard_versions(monkeypatch, tmp_path): "geoalgo/multilingual-contexts-to-be-completed", tmp_path / "contexts", ) + + +def test_make_model_openrouter_strips_vllm_only_kwargs(monkeypatch): + """vLLM-engine-only kwargs must 
not leak into ChatOpenAI.model_kwargs. + + Regression guard for #20: unknown kwargs forwarded to ``ChatOpenAI`` land + in ``model_kwargs`` and are then sent to ``chat.completions.create``, + which rejects them with ``TypeError: unexpected keyword argument + 'max_model_len'``. + """ + monkeypatch.setenv("OPENROUTER_API_KEY", "dummy") + + model = make_model( + "OpenRouter/google/gemma-3-4b-it", + max_tokens=16, + max_model_len=4096, + chat_template="", + temperature=0.5, + ) + + assert "max_model_len" not in model.model_kwargs + assert "chat_template" not in model.model_kwargs + assert model.max_tokens == 16 + assert model.temperature == 0.5