From ba4220dc94edbb0a1eaf02dbdd13bdd25565114b Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sat, 14 Feb 2026 20:45:18 +0100 Subject: [PATCH 01/35] Add llamacpp dependency and update gitignore with generated directories --- .gitignore | 5 +++++ pyproject.toml | 1 + 2 files changed, 6 insertions(+) diff --git a/.gitignore b/.gitignore index 505a3b1..092e713 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,8 @@ wheels/ # Virtual environments .venv + +# Evaluation data and results +models/ +openjury-eval-data/ +results/ \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c0a7422..77f5ad0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,4 +34,5 @@ exclude = ["slurmpilot_scripts*"] [project.optional-dependencies] vllm = ["vllm==0.10.2"] +llamacpp = ["llama-cpp-python>=0.3.0"] dev = ["black>=25.1.0", "pytest>=8.4.2"] From d2a5a42d8ccb2378c88c5d83e873867c7ad43dc7 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sat, 14 Feb 2026 20:45:30 +0100 Subject: [PATCH 02/35] Add documentation for llamacpp in Readme --- README.md | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 490d95d..eb15eb0 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,8 @@ or send a PR, we will be happy to update the information. git clone https://github.com/OpenEuroLLM/OpenJury cd OpenJury uv sync -uv sync --extra vllm # Optional: install vLLM support +uv sync --extra vllm # Optional: install vLLM support +uv sync --extra llamacpp # Optional: install LlamaCpp support ``` ### Basic Evaluation @@ -98,6 +99,48 @@ python openjury/generate_and_evaluate.py \ --n_instructions 10 ``` +### Running locally with LlamaCpp + +LlamaCpp lets you run GGUF models locally on CPU, which is useful for testing your setup without needing a GPU or API keys. 
+ +**Install the LlamaCpp extra:** + +```bash +uv sync --extra llamacpp +``` + +**Download GGUF models** using `huggingface-cli` (included via `huggingface-hub`): + +```bash +huggingface-cli download Qwen/Qwen2.5-0.5B-Instruct-GGUF qwen2.5-0.5b-instruct-q8_0.gguf --local-dir ./models +huggingface-cli download Qwen/Qwen2.5-1.5B-Instruct-GGUF qwen2.5-1.5b-instruct-q8_0.gguf --local-dir ./models +``` + +The `LlamaCpp` provider expects a **file path** to a `.gguf` model after the `LlamaCpp/` prefix. +For absolute paths, this results in a double slash (e.g., `LlamaCpp//home/user/models/model.gguf`). + +**Mixed example** — local LlamaCpp model with a remote judge: + +```bash +uv run python openjury/generate_and_evaluate.py \ + --dataset alpaca-eval \ + --model_A LlamaCpp/./models/qwen2.5-0.5b-instruct-q8_0.gguf \ + --model_B OpenRouter/qwen/qwen-2.5-7b-instruct \ + --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ + --n_instructions 10 --max_out_tokens_models 16384 +``` + +**Fully local example** — no API keys required (useful for verifying your setup): + +```bash +uv run python openjury/generate_and_evaluate.py \ + --dataset alpaca-eval \ + --model_A LlamaCpp/./models/qwen2.5-0.5b-instruct-q8_0.gguf \ + --model_B LlamaCpp/./models/qwen2.5-1.5b-instruct-q8_0.gguf \ + --judge_model LlamaCpp/./models/qwen2.5-1.5b-instruct-q8_0.gguf \ + --n_instructions 5 --max_out_tokens_models 16384 +``` + **Note:** Ensure you have the required LangChain dependencies installed for your chosen provider. If you use remote endpoint, you would have to set your credentials. 
From a828adbc9d2a573ed74c46223277202a37feec9f Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 15 Feb 2026 10:34:24 +0100 Subject: [PATCH 03/35] Document direnv usage for environment variables management --- README.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/README.md b/README.md index eb15eb0..ad866cd 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,29 @@ uv sync --extra vllm # Optional: install vLLM support uv sync --extra llamacpp # Optional: install LlamaCpp support ``` +### Automatic Environment (direnv) + +We use `direnv` to automatically manage the virtual environment and environment variables. + +1. **Install direnv:** +```bash +curl -sfL https://direnv.net/install.sh | bash +``` + +2. **Setup Hook:** +```bash +echo 'eval "$(direnv hook zsh)"' >> ~/.zshrc # or ~/.bashrc depending on which shell you use +``` + +3. **Configure:** Create a `.envrc` file in the project root: +```bash +dotenv_if_exists +``` + +4. **Allow:** Run `direnv allow` to authorize the setup. + +5. **Env variables** Add env variables like `OPEN_JURY_EVAL_DATA` to `.env` file. 
+ ### Basic Evaluation Compare two models head-to-head: From 0dcebf974387e3590a7843ff55e9530533078771 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 15 Feb 2026 11:08:07 +0100 Subject: [PATCH 04/35] narrow down transformers dependency to fix version mismatch --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 77f5ad0..b3f37b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,6 @@ include = ["openjury*"] exclude = ["slurmpilot_scripts*"] [project.optional-dependencies] -vllm = ["vllm==0.10.2"] +vllm = ["vllm==0.10.2", "transformers>=4.55.2,<5.0.0"] llamacpp = ["llama-cpp-python>=0.3.0"] dev = ["black>=25.1.0", "pytest>=8.4.2"] From d60073bfe5b6ca433de7b722304985da9ec3b735 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 15 Feb 2026 15:47:12 +0100 Subject: [PATCH 05/35] Add max_model_len param for VLLM in order to prevent OOM errors --- openjury/generate.py | 14 ++++++++++++-- openjury/generate_and_evaluate.py | 27 +++++++++++++++++++++++++-- openjury/utils.py | 24 +++++++++++++++++++++--- tests/test_generate_and_evaluate.py | 4 ++-- 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/openjury/generate.py b/openjury/generate.py index 2cb4ee0..17d7f7f 100644 --- a/openjury/generate.py +++ b/openjury/generate.py @@ -19,10 +19,15 @@ def generate_instructions( model: str, truncate_input_chars: int | None = 8192, max_tokens: int | None = 32768, + max_model_len: int | None = None, use_tqdm: bool = True, system_prompt: str | None = None, ) -> pd.DataFrame: - chat_model = make_model(model, max_tokens=max_tokens) + chat_model = make_model( + model, + max_tokens=max_tokens, + max_model_len=max_model_len, + ) # TODO improve prompt to generate instructions if system_prompt is None: @@ -61,9 +66,14 @@ def generate_base( model: str, truncate_input_chars: int | None = 8192, max_tokens: int | None = 32768, + max_model_len: int | None = None, use_tqdm: bool = False, ) -> 
pd.DataFrame: - model = make_model(model, max_tokens=max_tokens) + model = make_model( + model, + max_tokens=max_tokens, + max_model_len=max_model_len, + ) inputs = [ truncate(instruction, max_len=truncate_input_chars) diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 49fdb36..da70fb8 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -36,6 +36,7 @@ class CliArgs: truncate_all_input_chars: int = 8192 max_out_tokens_models: int = 32768 max_out_tokens_judge: int = 32768 + max_model_len: int | None = None result_folder: str = "results" @@ -131,6 +132,16 @@ def parse_args(cls): default=32768, help="Max tokens the judge can generate (reasoning + scores).", ) + parser.add_argument( + "--max_model_len", + type=int, + required=False, + default=None, + help=( + "Optional max context length for VLLM models. If omitted, VLLM uses " + "its default model max length. This is useful on smaller GPUs to avoid OOM." + ), + ) args = parser.parse_args() return cls( @@ -146,6 +157,7 @@ def parse_args(cls): truncate_all_input_chars=args.truncate_all_input_chars, max_out_tokens_models=args.max_out_tokens_models, max_out_tokens_judge=args.max_out_tokens_judge, + max_model_len=args.max_model_len, result_folder=args.result_folder, ) @@ -218,9 +230,19 @@ def main(args: CliArgs): # TODO currently we just support base models for fluency, we could also support instruction-tuned models gen_fun = ( - partial(generate_base, truncate_input_chars=args.truncate_all_input_chars, max_tokens=args.max_out_tokens_models) + partial( + generate_base, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + max_model_len=args.max_model_len, + ) if is_fluency_task - else partial(generate_instructions, truncate_input_chars=args.truncate_all_input_chars, max_tokens=args.max_out_tokens_models) + else partial( + generate_instructions, + truncate_input_chars=args.truncate_all_input_chars, + 
max_tokens=args.max_out_tokens_models, + max_model_len=args.max_model_len, + ) ) completions_A = cache_function_dataframe( lambda: gen_fun( @@ -254,6 +276,7 @@ def main(args: CliArgs): judge_chat_model = make_model( model=args.judge_model, max_tokens=args.max_out_tokens_judge, + max_model_len=args.max_model_len, ) if is_fluency_task: system_prompt = """You are a highly efficient assistant, who evaluates and selects the best large language \ diff --git a/openjury/utils.py b/openjury/utils.py index 85f0f94..31a8afa 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -13,7 +13,7 @@ from langchain_core.globals import set_llm_cache data_root = Path( - os.environ.get("OPENJURY_DATA", Path("~/openjury-eval-data/").expanduser()) + os.environ.get("OPENJURY_EVAL_DATA", Path("~/openjury-eval-data/").expanduser()) ).expanduser() @@ -143,11 +143,26 @@ class ChatVLLM: correctly formats prompts with <|im_start|>, <|im_end|>, tags, etc. """ - def __init__(self, model: str, max_tokens: int = 8192, **vllm_kwargs): + def __init__( + self, + model: str, + max_tokens: int = 8192, + max_model_len: int | None = None, + **vllm_kwargs, + ): from vllm import LLM, SamplingParams self.model_path = model self.max_tokens = max_tokens + + if max_model_len is not None: + assert max_tokens <= max_model_len, ( + f"max_tokens ({max_tokens}) must be <= max_model_len ({max_model_len}). " + f"Either increase --max_model_len or decrease --max_out_tokens_models / " + f"--max_out_tokens_judge." 
+ ) + vllm_kwargs["max_model_len"] = max_model_len + self.llm = LLM(model=model, trust_remote_code=True, **vllm_kwargs) self.sampling_params = SamplingParams( max_tokens=max_tokens, @@ -215,7 +230,9 @@ async def ainvoke(self, input_item, **invoke_kwargs): ) -def make_model(model: str, max_tokens: int | None = 8192): +def make_model( + model: str, max_tokens: int | None = 8192, max_model_len: int | None = None +): model_provider = model.split("/")[0] if model_provider == "Dummy": @@ -229,6 +246,7 @@ def make_model(model: str, max_tokens: int | None = 8192): return ChatVLLM( model=model_name, max_tokens=max_tokens if max_tokens else 8192, + max_model_len=max_model_len, ) model_kwargs = {} diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 2875a81..08f2873 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -26,7 +26,7 @@ def test_generate_and_evaluate_context_completion(dataset: str): def test_generate_and_evaluate_correct_order_bias(): """Test the correction for model order bias. - + In this test, a judge that is totally biased towards model B should be corrected to be neutral. Since the judge favors model B regardless of the order and the completions, the average preference should be 0.5. @@ -43,4 +43,4 @@ def test_generate_and_evaluate_correct_order_bias(): ) avg_pref = sum(prefs) / len(prefs) - assert avg_pref == 0.5 \ No newline at end of file + assert avg_pref == 0.5 From 38f63ee48f35bf45af7ea7a7b3dacc3e21e0c6e1 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 15 Feb 2026 20:30:32 +0100 Subject: [PATCH 06/35] Fix completion loading and EuroLLM-9B example - Updated README to use EuroLLM-Instruct because the base (EuroLLM-9B) doesn't have a chat template and throws error. - Added functionality to load pre-existing dataset completions for models. Was throwing error previously, because it was considering the model as a provider. 
--- README.md | 6 +- openjury/generate_and_evaluate.py | 97 ++++++++++++++++++++++++------- 2 files changed, 78 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index ad866cd..565d953 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Compare two models head-to-head: python openjury/generate_and_evaluate.py \ --dataset alpaca-eval \ --model_A gpt4_1106_preview \ - --model_B VLLM/utter-project/EuroLLM-9B \ + --model_B VLLM/utter-project/EuroLLM-9B-Instruct \ --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ --n_instructions 10 ``` @@ -86,7 +86,7 @@ It will then display the results of the battles: ============================================================ 🏆 MODEL BATTLE RESULTS 🏆 📊 Dataset: alpaca-eval -🤖 Competitors: Model A: gpt4_1106_preview vs Model B: VLLM/utter-project/EuroLLM-9B +🤖 Competitors: Model A: gpt4_1106_preview vs Model B: VLLM/utter-project/EuroLLM-9B-Instruct ⚖️ Judge: OpenRouter/deepseek/deepseek-chat-v3.1 📈 Results Summary: Total Battles: 10 @@ -107,7 +107,7 @@ Models are specified using the format: `{LangChain Backend}/{Model Path}` Together/meta-llama/Llama-3.3-70B-Instruct-Turbo ChatOpenAI/gpt-4o LlamaCpp/jwiggerthale_Llama-3.2-3B-Q8_0-GGUF_llama-3.2-3b-q8_0.gguf -VLLM/utter-project/EuroLLM-9B +VLLM/utter-project/EuroLLM-9B-Instruct OpenRouter/deepseek/deepseek-chat-v3.1 ``` diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index da70fb8..3476ccc 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -17,10 +17,47 @@ from openjury.evaluate import annotate_battles, PairScore from openjury.generate import generate_instructions, generate_base from openjury.instruction_dataset import load_instructions -from openjury.utils import data_root +from openjury.utils import data_root, read_df, download_hf from openjury.utils import make_model, cache_function_dataframe +def try_load_dataset_completions( + dataset: str, model: str, n_instructions: int | None +) 
-> pd.DataFrame | None: + """Try loading pre-existing completions from the dataset. + + Some datasets (e.g. alpaca-eval) ship with completions for well-known + models such as ``gpt4_1106_preview``. When ``model`` matches a column in + ``model_outputs/{dataset}.csv.zip``, those completions are returned + directly so that no model instantiation / generation is needed. + + Returns a DataFrame with columns ``completion`` and ``instruction_index``, + or ``None`` when no pre-existing completions are found. + """ + local_path_tables = data_root / "tables" + download_hf(name=dataset, local_path=local_path_tables) + output_path = local_path_tables / "model_outputs" / f"{dataset}.csv.zip" + if not output_path.exists(): + return None + df_outputs = read_df(output_path) + df_outputs.loc[:, "output"] = df_outputs.loc[:, "output"].fillna("") + df_outputs = df_outputs.pivot_table( + index="instruction_index", columns="model", values="output", aggfunc="last" + ).sort_index() + if model not in df_outputs.columns: + return None + print(f"Found pre-existing completions for '{model}' in {dataset} dataset.") + completions = df_outputs.loc[:, model] + if n_instructions is not None: + completions = completions.head(n_instructions) + return pd.DataFrame( + { + "completion": completions.values, + "instruction_index": completions.index.tolist(), + } + ) + + @dataclass class CliArgs: dataset: str @@ -244,27 +281,43 @@ def main(args: CliArgs): max_model_len=args.max_model_len, ) ) - completions_A = cache_function_dataframe( - lambda: gen_fun( - instructions=instructions, - model=args.model_A, - use_tqdm=args.use_tqdm, - ), - ignore_cache=ignore_cache, - cache_name=f"{args.dataset}_{args.model_A}_{args.n_instructions}", - ).set_index("instruction_index") - completions_A = completions_A.loc[:, "completion"] - - completions_B = cache_function_dataframe( - lambda: gen_fun( - instructions=instructions, - model=args.model_B, - use_tqdm=args.use_tqdm, - ), - ignore_cache=ignore_cache, - 
cache_name=f"{args.dataset}_{args.model_B}_{args.n_instructions}", - ).set_index("instruction_index") - completions_B = completions_B.loc[:, "completion"] + dataset_completions_A = try_load_dataset_completions( + args.dataset, args.model_A, n_instructions + ) + if dataset_completions_A is not None: + completions_A = dataset_completions_A.set_index("instruction_index").loc[ + :, "completion" + ] + else: + completions_A = cache_function_dataframe( + lambda: gen_fun( + instructions=instructions, + model=args.model_A, + use_tqdm=args.use_tqdm, + ), + ignore_cache=ignore_cache, + cache_name=f"{args.dataset}_{args.model_A}_{args.n_instructions}", + ).set_index("instruction_index") + completions_A = completions_A.loc[:, "completion"] + + dataset_completions_B = try_load_dataset_completions( + args.dataset, args.model_B, n_instructions + ) + if dataset_completions_B is not None: + completions_B = dataset_completions_B.set_index("instruction_index").loc[ + :, "completion" + ] + else: + completions_B = cache_function_dataframe( + lambda: gen_fun( + instructions=instructions, + model=args.model_B, + use_tqdm=args.use_tqdm, + ), + ignore_cache=ignore_cache, + cache_name=f"{args.dataset}_{args.model_B}_{args.n_instructions}", + ).set_index("instruction_index") + completions_B = completions_B.loc[:, "completion"] print(f"\nFirst instruction/context: {instructions.values[0]}") print(f"\nFirst completion of {args.model_A}") From 6f5e0fcebdf68d8387a156bb4b15347e3ff219d2 Mon Sep 17 00:00:00 2001 From: Erlis Lushtaku <59629249+ErlisLushtaku@users.noreply.github.com> Date: Tue, 17 Feb 2026 16:02:47 +0100 Subject: [PATCH 07/35] Remove `direnv` documentation --- README.md | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/README.md b/README.md index 565d953..b9ed1ac 100644 --- a/README.md +++ b/README.md @@ -39,29 +39,6 @@ uv sync --extra vllm # Optional: install vLLM support uv sync --extra llamacpp # Optional: install LlamaCpp support ``` -### Automatic 
Environment (direnv) - -We use `direnv` to automatically manage the virtual environment and environment variables. - -1. **Install direnv:** -```bash -curl -sfL https://direnv.net/install.sh | bash -``` - -2. **Setup Hook:** -```bash -echo 'eval "$(direnv hook zsh)"' >> ~/.zshrc # or ~/.bashrc depending on which shell you use -``` - -3. **Configure:** Create a `.envrc` file in the project root: -```bash -dotenv_if_exists -``` - -4. **Allow:** Run `direnv allow` to authorize the setup. - -5. **Env variables** Add env variables like `OPEN_JURY_EVAL_DATA` to `.env` file. - ### Basic Evaluation Compare two models head-to-head: From 42ff2aea9535aa1bf1154f5b4a8ad811062f4f9f Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Tue, 17 Feb 2026 16:41:00 +0100 Subject: [PATCH 08/35] Revert stylistic (formatting) changes and add more documentation for the new `max_model_len` and related parameters --- README.md | 8 ++++++ openjury/generate_and_evaluate.py | 41 ++++++++++++----------------- tests/test_generate_and_evaluate.py | 4 +-- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index b9ed1ac..8714442 100644 --- a/README.md +++ b/README.md @@ -74,6 +74,14 @@ It will then display the results of the battles: ============================================================ ``` +### Length and Token Parameters + +The evaluation scripts expose four different length controls with different roles: +- `--truncate_all_input_chars`: character-level truncation applied to prompts before model generation and before judge evaluation. +- `--max_out_tokens_models`: generation token budget for each answer from `model_A` and `model_B`. +- `--max_out_tokens_judge`: generation token budget for the judge completion (reasoning + score output). +- `--max_model_len`: optional vLLM context-window limit (prompt + generated tokens), applied to vLLM models; this should be greater than or equal to the two `max_out_tokens_*` values. 
+ ## 🎨 Model Specification Models are specified using the format: `{LangChain Backend}/{Model Path}` diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 3476ccc..da274d8 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -152,22 +152,28 @@ def parse_args(cls): type=int, required=False, default=8192, - help="Max characters to truncate all input text (instructions before models A/B, " - "completions before judge).", + help="Character-level truncation applied before tokenization: truncates each instruction " + "before model A/B generation and truncates each completion before judge evaluation.", ) parser.add_argument( "--max_out_tokens_models", type=int, required=False, default=32768, - help="Max tokens models A/B can generate in their responses.", + help=( + "Generation token budget for each model A/B response. For VLLM, keep this <= " + "--max_model_len (if provided)." + ), ) parser.add_argument( "--max_out_tokens_judge", type=int, required=False, default=32768, - help="Max tokens the judge can generate (reasoning + scores).", + help=( + "Generation token budget for the judge response (reasoning + scores). For " + "VLLM, keep this <= --max_model_len (if provided)." + ), ) parser.add_argument( "--max_model_len", @@ -175,8 +181,9 @@ def parse_args(cls): required=False, default=None, help=( - "Optional max context length for VLLM models. If omitted, VLLM uses " - "its default model max length. This is useful on smaller GPUs to avoid OOM." + "Optional total context window for VLLM models (prompt + generation). This is " + "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap " + "generated tokens. This is useful on smaller GPUs to avoid OOM." 
), ) args = parser.parse_args() @@ -267,27 +274,15 @@ def main(args: CliArgs): # TODO currently we just support base models for fluency, we could also support instruction-tuned models gen_fun = ( - partial( - generate_base, - truncate_input_chars=args.truncate_all_input_chars, - max_tokens=args.max_out_tokens_models, - max_model_len=args.max_model_len, - ) + partial(generate_base, truncate_input_chars=args.truncate_all_input_chars, max_tokens=args.max_out_tokens_models, max_model_len=args.max_model_len) if is_fluency_task - else partial( - generate_instructions, - truncate_input_chars=args.truncate_all_input_chars, - max_tokens=args.max_out_tokens_models, - max_model_len=args.max_model_len, - ) + else partial(generate_instructions, truncate_input_chars=args.truncate_all_input_chars, max_tokens=args.max_out_tokens_models, max_model_len=args.max_model_len) ) dataset_completions_A = try_load_dataset_completions( args.dataset, args.model_A, n_instructions ) if dataset_completions_A is not None: - completions_A = dataset_completions_A.set_index("instruction_index").loc[ - :, "completion" - ] + completions_A = dataset_completions_A.set_index("instruction_index").loc[:, "completion"] else: completions_A = cache_function_dataframe( lambda: gen_fun( @@ -304,9 +299,7 @@ def main(args: CliArgs): args.dataset, args.model_B, n_instructions ) if dataset_completions_B is not None: - completions_B = dataset_completions_B.set_index("instruction_index").loc[ - :, "completion" - ] + completions_B = dataset_completions_B.set_index("instruction_index").loc[:, "completion"] else: completions_B = cache_function_dataframe( lambda: gen_fun( diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 08f2873..2875a81 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -26,7 +26,7 @@ def test_generate_and_evaluate_context_completion(dataset: str): def test_generate_and_evaluate_correct_order_bias(): """Test the 
correction for model order bias. - + In this test, a judge that is totally biased towards model B should be corrected to be neutral. Since the judge favors model B regardless of the order and the completions, the average preference should be 0.5. @@ -43,4 +43,4 @@ def test_generate_and_evaluate_correct_order_bias(): ) avg_pref = sum(prefs) / len(prefs) - assert avg_pref == 0.5 + assert avg_pref == 0.5 \ No newline at end of file From 8fcb03206f610e950acec607da5f17784d1353b6 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Tue, 17 Feb 2026 21:38:59 +0100 Subject: [PATCH 09/35] Rename OPENJURY_EVAL_DATA to OPENJURY_DATA --- README.md | 4 ++-- openjury/utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 8714442..ff157e8 100644 --- a/README.md +++ b/README.md @@ -172,8 +172,8 @@ python -c "from openjury.utils import download_all; download_all()" # Download ``` Datasets are stored in: -- `$OPENJURY_EVAL_DATA` (if set) -- `~/openjury-eval-data/` (default) +- `$OPENJURY_DATA` (if set) +- `~/openjury-data/` (default) ## 🤝 Contributing diff --git a/openjury/utils.py b/openjury/utils.py index 31a8afa..7c7affa 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -13,7 +13,7 @@ from langchain_core.globals import set_llm_cache data_root = Path( - os.environ.get("OPENJURY_EVAL_DATA", Path("~/openjury-eval-data/").expanduser()) + os.environ.get("OPENJURY_DATA", Path("~/openjury-data/").expanduser()) ).expanduser() From 35856f2a67b661ee78ea5425b57648448a4029f5 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sat, 21 Feb 2026 12:00:14 +0100 Subject: [PATCH 10/35] Revert changes in gitignore --- .gitignore | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.gitignore b/.gitignore index 092e713..505a3b1 100644 --- a/.gitignore +++ b/.gitignore @@ -8,8 +8,3 @@ wheels/ # Virtual environments .venv - -# Evaluation data and results -models/ -openjury-eval-data/ -results/ \ No newline at end of file From 
6a1118256ea10bf74f0dafc433f59676e87a669d Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sat, 21 Feb 2026 22:18:35 +0100 Subject: [PATCH 11/35] Handle models with max_position_embeddings when we pass max_model_len - Moved max_model_len and chat_template to **model_kwargs for readability. - Adjusted ChatVLLM initialization to cap max_model_len based on model's max_position_embeddings. - Added warnings for potential max_model_len issues. --- openjury/generate.py | 18 ++++-------------- openjury/utils.py | 39 ++++++++++++++++++++++++++++----------- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/openjury/generate.py b/openjury/generate.py index 5703c68..87b3fc5 100644 --- a/openjury/generate.py +++ b/openjury/generate.py @@ -19,16 +19,11 @@ def generate_instructions( model: str, truncate_input_chars: int | None = 8192, max_tokens: int | None = 32768, - max_model_len: int | None = None, use_tqdm: bool = True, system_prompt: str | None = None, - chat_template: str | None = None, + **model_kwargs, ) -> pd.DataFrame: - chat_model = make_model( - model, - max_tokens=max_tokens, - max_model_len=max_model_len, - chat_template=chat_template) + chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs) # TODO improve prompt to generate instructions if system_prompt is None: @@ -67,15 +62,10 @@ def generate_base( model: str, truncate_input_chars: int | None = 8192, max_tokens: int | None = 32768, - max_model_len: int | None = None, use_tqdm: bool = False, - chat_template: str | None = None, + **model_kwargs, ) -> pd.DataFrame: - model = make_model( - model, - max_tokens=max_tokens, - max_model_len=max_model_len, - chat_template=chat_template) + model = make_model(model, max_tokens=max_tokens, **model_kwargs) inputs = [ truncate(instruction, max_len=truncate_input_chars) diff --git a/openjury/utils.py b/openjury/utils.py index 4a52eea..19a073a 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -151,19 +151,33 @@ class ChatVLLM: default chat 
template. """ - def __init__(self, model: str, max_tokens: int = 8192, max_model_len: int | None = None, chat_template: str | None = None, **vllm_kwargs): + def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None = None, **vllm_kwargs): from vllm import LLM, SamplingParams self.model_path = model self.max_tokens = max_tokens + # Cap max_model_len to the model's max_position_embeddings so that + # vLLM doesn't reject an overly large context window. + max_model_len = vllm_kwargs.get("max_model_len") if max_model_len is not None: - assert max_tokens <= max_model_len, ( - f"max_tokens ({max_tokens}) must be <= max_model_len ({max_model_len}). " - f"Either increase --max_model_len or decrease --max_out_tokens_models / " - f"--max_out_tokens_judge." - ) - vllm_kwargs["max_model_len"] = max_model_len + try: + from transformers import AutoConfig + config = AutoConfig.from_pretrained(model, trust_remote_code=True) + model_max_pos = getattr(config, "max_position_embeddings", None) + if model_max_pos is not None and max_model_len > model_max_pos: + warnings.warn( + f"Capping max_model_len from {max_model_len} to " + f"{model_max_pos} (max_position_embeddings) for '{model}'." + ) + vllm_kwargs["max_model_len"] = model_max_pos + except Exception as e: + warnings.warn( + "Could not validate max_model_len against " + f"max_position_embeddings for '{model}': {e}. " + "Proceeding without clamping; vLLM may raise if the value is too large.", + RuntimeWarning, + ) self.llm = LLM(model=model, trust_remote_code=True, **vllm_kwargs) self.sampling_params = SamplingParams( @@ -276,15 +290,16 @@ async def ainvoke(self, input_item, **invoke_kwargs): ) -def make_model(model: str, max_tokens: int | None = 8192, max_model_len: int | None = None, chat_template: str | None = None): +def make_model(model: str, max_tokens: int | None = 8192, **kwargs): """Instantiate a model wrapper from a provider/model-name string. Args: model: Format ``{Provider}/{model_path}``, e.g. 
``VLLM/meta-llama/Llama-3.3-70B-Instruct``. max_tokens: Maximum tokens the model may generate. - chat_template: Optional Jinja2 chat template override. Only used by - the VLLM provider; silently ignored for other providers. + **kwargs: Provider-specific options forwarded to the model wrapper. + For VLLM these include ``max_model_len``, ``chat_template``, and + any other ``vllm.LLM`` constructor arguments. """ model_provider = model.split("/")[0] @@ -296,11 +311,13 @@ def make_model(model: str, max_tokens: int | None = 8192, max_model_len: int | N # Use our custom ChatVLLM wrapper which properly applies chat templates if model_provider == "VLLM": + chat_template = kwargs.pop("chat_template", None) + vllm_kwargs = {k: v for k, v in kwargs.items() if v is not None} return ChatVLLM( model=model_name, max_tokens=max_tokens if max_tokens else 8192, - max_model_len=max_model_len, chat_template=chat_template, + **vllm_kwargs, ) model_kwargs = {} From fecd3edb9ab20e8e0b95c959f28bca6ec99fcedc Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sat, 21 Feb 2026 22:21:08 +0100 Subject: [PATCH 12/35] Revert EuroLLM-9B-Instruct to EuroLLM-9B since there is a default chat template now --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 639ce09..5473fdf 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ Compare two models head-to-head: python openjury/generate_and_evaluate.py \ --dataset alpaca-eval \ --model_A gpt4_1106_preview \ - --model_B VLLM/utter-project/EuroLLM-9B-Instruct \ + --model_B VLLM/utter-project/EuroLLM-9B \ --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ --n_instructions 10 ``` @@ -63,7 +63,7 @@ It will then display the results of the battles: ============================================================ 🏆 MODEL BATTLE RESULTS 🏆 📊 Dataset: alpaca-eval -🤖 Competitors: Model A: gpt4_1106_preview vs Model B: VLLM/utter-project/EuroLLM-9B-Instruct +🤖 Competitors: Model A: gpt4_1106_preview vs 
Model B: VLLM/utter-project/EuroLLM-9B ⚖️ Judge: OpenRouter/deepseek/deepseek-chat-v3.1 📈 Results Summary: Total Battles: 10 @@ -92,7 +92,7 @@ Models are specified using the format: `{LangChain Backend}/{Model Path}` Together/meta-llama/Llama-3.3-70B-Instruct-Turbo ChatOpenAI/gpt-4o LlamaCpp/jwiggerthale_Llama-3.2-3B-Q8_0-GGUF_llama-3.2-3b-q8_0.gguf -VLLM/utter-project/EuroLLM-9B-Instruct +VLLM/utter-project/EuroLLM-9B OpenRouter/deepseek/deepseek-chat-v3.1 ``` From 0b4eaec96434894ef5e840177484249f83acd1c5 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 22 Feb 2026 11:16:24 +0100 Subject: [PATCH 13/35] fix tests - mock external api calls - add safety check for content in completions --- openjury/generate.py | 2 +- tests/test_generate_and_evaluate.py | 47 +++++++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/openjury/generate.py b/openjury/generate.py index 87b3fc5..3fa749c 100644 --- a/openjury/generate.py +++ b/openjury/generate.py @@ -76,7 +76,7 @@ def generate_base( inputs=inputs, max_tokens=max_tokens, ) - completions = [x.content for x in completions] + completions = [x.content if hasattr(x, "content") else x for x in completions] df_outputs = pd.DataFrame( data={ diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 2875a81..040b424 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -1,14 +1,55 @@ +import pandas as pd import pytest + +import openjury.generate_and_evaluate as generate_and_evaluate from openjury.generate_and_evaluate import ( main as main_generate_and_eval, CliArgs, ) +@pytest.fixture(autouse=True) +def mock_external_data_and_cache(monkeypatch): + instructions = pd.DataFrame( + { + "instruction": [f"Synthetic instruction {i}" for i in range(20)], + }, + index=pd.Index(range(20), name="instruction_index"), + ) + + monkeypatch.setattr( + generate_and_evaluate, + "load_instructions", + lambda dataset, 
n_instructions=None: ( + instructions.head(n_instructions) + if n_instructions is not None + else instructions + ), + ) + monkeypatch.setattr( + generate_and_evaluate, + "load_contexts", + lambda dataset: instructions.loc[:, "instruction"], + ) + + monkeypatch.setattr( + generate_and_evaluate, + "try_load_dataset_completions", + lambda dataset, model, n_instructions: None, + ) + + def _run_without_cache(fun, **_kwargs): + return fun() + + monkeypatch.setattr( + generate_and_evaluate, "cache_function_dataframe", _run_without_cache + ) + + @pytest.mark.parametrize( "dataset", ["alpaca-eval", "fluency-french", "m-arena-hard-EU"] ) -def test_generate_and_evaluate_context_completion(dataset: str): +def test_generate_and_evaluate_context_completion(dataset: str, tmp_path): prefs = main_generate_and_eval( CliArgs( dataset=dataset, @@ -16,6 +57,7 @@ def test_generate_and_evaluate_context_completion(dataset: str): model_B="Dummy/open is better than close isnt'it", judge_model="Dummy/score A: 0 score B: 10", n_instructions=5, + result_folder=str(tmp_path), # default for swap_mode is "fixed" ) ) @@ -24,7 +66,7 @@ def test_generate_and_evaluate_context_completion(dataset: str): assert avg_pref >= 0.9 -def test_generate_and_evaluate_correct_order_bias(): +def test_generate_and_evaluate_correct_order_bias(tmp_path): """Test the correction for model order bias. In this test, a judge that is totally biased towards model B should be corrected to be neutral. 
@@ -39,6 +81,7 @@ def test_generate_and_evaluate_correct_order_bias(): judge_model="Dummy/score A: 0 score B: 10", n_instructions=5, swap_mode="both", + result_folder=str(tmp_path), ) ) From 29340b0aa48a26dce191bea6458fae29bf707b2b Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 22 Feb 2026 12:13:36 +0100 Subject: [PATCH 14/35] Change test github workflow to use uv instead of pip for a more robust dependency resolution --- .github/workflows/run-pytest.yml | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 1e3e359..2bd89fe 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -1,6 +1,3 @@ -# This workflow will install Python dependencies, run tests and lint with a single version of Python -# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python - name: Run pytest on: @@ -14,20 +11,16 @@ permissions: jobs: build: - + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.12 - uses: actions/setup-python@v3 - with: - python-version: "3.12" - - name: Install python dependencies and run tests - run: | - python -m pip install --upgrade pip - python -m pip install pytest - python -m pip install -e ".[all]" - - - name: Test with pytest - run: | - pytest \ No newline at end of file + - uses: actions/checkout@v3 + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: true + python-version: "3.12" + - name: Install dependencies + run: uv sync --all-extras --group dev + - name: Test with pytest + run: uv run pytest \ No newline at end of file From 2c294f18673334c9afac40001a7a3dd012d9bd6f Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 22 Feb 2026 12:15:09 +0100 Subject: [PATCH 15/35] Move dev dependencies to dependency-group --- pyproject.toml | 8 +++++++- 1 file changed, 7 insertions(+), 1 
deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b3f37b5..1f58b7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,13 @@ dependencies = [ include = ["openjury*"] exclude = ["slurmpilot_scripts*"] +[dependency-groups] +dev = [ + "black>=25.1.0", + "pre-commit>=4.5.1", + "pytest>=8.4.2", +] + [project.optional-dependencies] vllm = ["vllm==0.10.2", "transformers>=4.55.2,<5.0.0"] llamacpp = ["llama-cpp-python>=0.3.0"] -dev = ["black>=25.1.0", "pytest>=8.4.2"] From 4be61bf5002a27a1c32a6962dc671f74fdd273e2 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 22 Feb 2026 12:22:20 +0100 Subject: [PATCH 16/35] Revert comment removal --- .github/workflows/run-pytest.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 2bd89fe..d1bf6ac 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -1,3 +1,6 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + name: Run pytest on: From 51d2597b9120d121f3874c1274b1ece984de308e Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Sun, 22 Feb 2026 13:21:52 +0100 Subject: [PATCH 17/35] Add pre-commit hook --- .pre-commit-config.yaml | 14 ++++++++++++++ README.md | 10 ++++++++++ 2 files changed, 24 insertions(+) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..70badde --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,14 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: trailing-whitespace + - id: end-of-file-fixer + - id: check-yaml + - id: check-added-large-files + + - repo: https://github.com/psf/black + rev: 24.1.1 + hooks: + - id: black + language_version: python3 \ No newline at end 
of file diff --git a/README.md b/README.md index 5473fdf..8f0b7bf 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,16 @@ Datasets are stored in: - `$OPENJURY_DATA` (if set) - `~/openjury-data/` (default) +## 🛠️ Development + +To maintain code quality, we use **pre-commit** hooks. Run this once to set them up: + +```bash +uv run pre-commit install +``` + +Once installed, hooks will automatically check and format your code on every `git commit`. If a commit is blocked, simply `git add` the changes made by the hooks and commit again. + ## 🤝 Contributing We welcome contributions! Whether it's bug fixes, new features, or additional benchmark support, feel free to open an issue or submit a pull request. From 8dee7b228597333c4fdcc1f5bc1e5d0a6a45bf77 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 23 Feb 2026 11:46:39 +0100 Subject: [PATCH 18/35] add project scripts and move slurmpilot to dev group - moved slurmpilot to dev group since it doesn't have a published version on PyPI, and we are not allowed to publish OpenJury on PyPI otherwise --- openjury/generate_and_evaluate.py | 8 +++++--- pyproject.toml | 5 ++++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index c43a125..c8ea7e1 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -451,9 +451,11 @@ def main(args: CliArgs): return prefs -if __name__ == "__main__": +def cli(): args = CliArgs.parse_args() - print(f"Running with CLI args: {args.__dict__}") - main(args) + + +if __name__ == "__main__": + cli() diff --git a/pyproject.toml b/pyproject.toml index 1f58b7e..5a840f3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,6 +2,9 @@ requires = ["setuptools>=61.0", "wheel"] build-backend = "setuptools.build_meta" +[project.scripts] +openjury = "openjury.generate_and_evaluate:cli" + [project] name = "llm-judge-eval" version = "0.1.0" @@ -24,7 +27,6 @@ dependencies = [ 
"rich>=14.1.0", "scikit-learn>=1.7.2", "seaborn>=0.13.2", - "slurmpilot @ git+https://github.com/geoalgo/slurmpilot.git@main", "tqdm>=4.67.1", ] @@ -37,6 +39,7 @@ dev = [ "black>=25.1.0", "pre-commit>=4.5.1", "pytest>=8.4.2", + "slurmpilot @ git+https://github.com/geoalgo/slurmpilot.git@main", ] [project.optional-dependencies] From fdc9410133c8e22e9a459b0ecf0d3f9a247d0b08 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 2 Mar 2026 20:54:00 +0100 Subject: [PATCH 19/35] fix LlamaCpp bug with ChatTemplate - There was a halting issue with LlamaCpp since the model was not emitting EOS token and doesn't call Llama.reset() between calls (turns), causing a KV cache position mismatch crash so ChatLlamaCppModel was created as a custom wrapper to fix this - BaseLocalModel was extracted as common logic for ChatLlamaCppModel and ChatVLLM --- openjury/utils.py | 201 ++++++++++++++++++++++++++++++---------------- 1 file changed, 133 insertions(+), 68 deletions(-) diff --git a/openjury/utils.py b/openjury/utils.py index 19a073a..d8b5440 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -8,7 +8,6 @@ from huggingface_hub import snapshot_download import pandas as pd from tqdm.asyncio import tqdm -from langchain_community.llms import LlamaCpp from langchain_openai import ChatOpenAI from langchain_community.cache import SQLiteCache from langchain_core.globals import set_llm_cache @@ -136,7 +135,122 @@ async def ainvoke(self, input, **invoke_kwargs): return self.message -class ChatVLLM: +class BaseLocalModel: + """Shared prompt conversion and invoke helpers for local model wrappers.""" + + def _to_messages(self, input_item) -> list[dict]: + """Convert LangChain prompt input to OpenAI-style messages.""" + role_map = {"human": "user", "ai": "assistant", "system": "system"} + + if hasattr(input_item, "to_messages"): + lc_messages = input_item.to_messages() + return [ + {"role": role_map.get(msg.type, msg.type), "content": msg.content} + for msg in lc_messages + ] + elif ( + 
isinstance(input_item, list) + and input_item + and isinstance(input_item[0], tuple) + ): + return [ + {"role": role if role != "human" else "user", "content": content} + for role, content in input_item + ] + elif ( + isinstance(input_item, list) + and input_item + and isinstance(input_item[0], dict) + ): + return input_item + elif isinstance(input_item, str): + return [{"role": "user", "content": input_item}] + else: + raise ValueError(f"Unsupported input type: {type(input_item)}") + + def _to_raw_text(self, input_item) -> str: + """Extract raw text from an input item for text-completion mode.""" + if isinstance(input_item, str): + return input_item + if hasattr(input_item, "to_string"): + return input_item.to_string() + if isinstance(input_item, list) and input_item and isinstance(input_item[0], dict): + return "\n".join(msg["content"] for msg in input_item) + raise ValueError(f"Cannot extract raw text from: {type(input_item)}") + + def invoke(self, input_item, **invoke_kwargs) -> str: + return self.batch([input_item], **invoke_kwargs)[0] + + async def ainvoke(self, input_item, **invoke_kwargs): + loop = asyncio.get_event_loop() + return await loop.run_in_executor( + None, lambda: self.invoke(input_item, **invoke_kwargs) + ) + + +class ChatLlamaCppModel(BaseLocalModel): + """LlamaCpp wrapper that auto-detects and applies the GGUF chat template. + + Mirrors the ChatVLLM pattern but for local GGUF models via llama-cpp-python. + + Chat template handling: + - If the GGUF file embeds a chat template (typical for instruct models), + uses ``create_chat_completion()`` which applies the template and + handles EOS tokens correctly. + - If no template is found (base/pretrained models), falls back to + ``create_completion()`` (text mode) and emits a warning. + + Unlike langchain's ``ChatLlamaCpp``, this wrapper explicitly calls + ``Llama.reset()`` between conversations to clear stale KV-cache state. 
+ """ + + def __init__(self, model_path: str, max_tokens: int = 1024, n_ctx: int = 0, **kwargs): + from llama_cpp import Llama + + self.model_path = model_path + self.max_tokens = max_tokens + self.llama = Llama( + model_path=model_path, + n_ctx=n_ctx, + verbose=True, + **kwargs, + ) + + chat_template = self.llama.metadata.get("tokenizer.chat_template") + if chat_template: + self._use_generate = False + print(f"ChatLlamaCppModel: using GGUF chat template for '{model_path}'") + else: + self._use_generate = True + warnings.warn( + f"Model '{model_path}' does not embed a chat template. " + f"Falling back to text-completion mode (no chat formatting). " + f"Override with --chat_template if this model needs one.", + ) + + def batch(self, inputs: list, **kwargs) -> list[str]: + """Process a batch of inputs, resetting KV cache between conversations.""" + results = [] + for inp in inputs: + self.llama.reset() + if self._use_generate: + text = self._to_raw_text(inp) + response = self.llama.create_completion( + prompt=text, + max_tokens=self.max_tokens, + ) + results.append(response["choices"][0]["text"]) + else: + messages = self._to_messages(inp) + response = self.llama.create_chat_completion( + messages=messages, + max_tokens=self.max_tokens, + ) + results.append(response["choices"][0]["message"]["content"]) + return results + + +class ChatVLLM(BaseLocalModel): """VLLM wrapper that auto-detects whether to use chat() or generate(). 
Chat template handling: @@ -209,53 +323,6 @@ def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None self._use_generate = False print(f"ChatVLLM: using tokenizer's chat template for '{model}'") - def _to_messages(self, input_item) -> list[dict]: - """Convert LangChain prompt input to OpenAI-style messages.""" - # Map LangChain message types to OpenAI roles - role_map = {"human": "user", "ai": "assistant", "system": "system"} - - # Handle ChatPromptValue from LangChain - if hasattr(input_item, "to_messages"): - lc_messages = input_item.to_messages() - return [ - {"role": role_map.get(msg.type, msg.type), "content": msg.content} - for msg in lc_messages - ] - # Handle list of tuples like [("system", "..."), ("user", "...")] - elif ( - isinstance(input_item, list) - and input_item - and isinstance(input_item[0], tuple) - ): - return [ - {"role": role if role != "human" else "user", "content": content} - for role, content in input_item - ] - # Handle already formatted messages - elif ( - isinstance(input_item, list) - and input_item - and isinstance(input_item[0], dict) - ): - return input_item - # Handle plain string (wrap as user message) - elif isinstance(input_item, str): - return [{"role": "user", "content": input_item}] - else: - raise ValueError(f"Unsupported input type: {type(input_item)}") - - def _to_raw_text(self, input_item) -> str: - """Extract raw text from an input item for use with llm.generate().""" - if isinstance(input_item, str): - return input_item - # ChatPromptValue from LangChain - if hasattr(input_item, "to_string"): - return input_item.to_string() - # List of dicts (messages) - concatenate contents - if isinstance(input_item, list) and input_item and isinstance(input_item[0], dict): - return "\n".join(msg["content"] for msg in input_item) - raise ValueError(f"Cannot extract raw text from: {type(input_item)}") - def batch(self, inputs: list, **invoke_kwargs) -> list[str]: """Process a batch of inputs using 
vllm.LLM.chat() or llm.generate(). @@ -275,20 +342,6 @@ def batch(self, inputs: list, **invoke_kwargs) -> list[str]: ) return [out.outputs[0].text for out in outputs] - def invoke(self, input_item, **invoke_kwargs) -> str: - """Process a single input.""" - results = self.batch([input_item], **invoke_kwargs) - return results[0] - - async def ainvoke(self, input_item, **invoke_kwargs): - """Async version - runs sync version in executor for compatibility.""" - import asyncio - - loop = asyncio.get_event_loop() - return await loop.run_in_executor( - None, lambda: self.invoke(input_item, **invoke_kwargs) - ) - def make_model(model: str, max_tokens: int | None = 8192, **kwargs): """Instantiate a model wrapper from a provider/model-name string. @@ -332,15 +385,15 @@ def make_model(model: str, max_tokens: int | None = 8192, **kwargs): model=model_name, **model_kwargs, ) + elif model_provider == "LlamaCpp": + model_kwargs["model_path"] = model_name + model_kwargs.setdefault("n_ctx", 0) + return ChatLlamaCppModel(**model_kwargs) else: model_classes = [ - LlamaCpp, ChatOpenAI, ] - if model_provider == "LlamaCpp": - model_kwargs["model_path"] = model_name - else: - model_kwargs["model"] = model_name + model_kwargs["model"] = model_name try: from langchain_together.llms import Together @@ -367,6 +420,18 @@ def download_all(): local_path_tables = data_root / "tables" download_hf(name=dataset, local_path=local_path_tables) + # MT-Bench questions live in the LMSYS HuggingFace space. 
+ snapshot_download( + repo_id="lmsys/mt-bench", + repo_type="space", + allow_patterns=[ + "data/mt_bench/question.jsonl", + "data/mt_bench/reference_answer/*", + ], + local_dir=data_root / "mt-bench", + force_download=False, + ) + snapshot_download( repo_id="geoalgo/multilingual-contexts-to-be-completed", repo_type="dataset", From 48c53733eb4805ef76dea6e95ce83d8ae09f7f59 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 2 Mar 2026 20:59:38 +0100 Subject: [PATCH 20/35] Add MT-Bench multi-turn evaluation support - Implement MT-Bench loader and multi-turn generation/judging logic. - Add paper-aligned prompt templates while keeping the score-based evaluation to be consistent with OpenJury. - Support reference answers, per-turn breakdowns, and swap mode. - Add comprehensive MT-Bench pipeline tests. --- README.md | 21 +- openjury/evaluate.py | 23 +- openjury/generate.py | 94 +++- openjury/generate_and_evaluate.py | 483 +++++++++++++++++- openjury/instruction_dataset/__init__.py | 7 +- openjury/instruction_dataset/mt_bench.py | 203 ++++++++ .../prompt-multi-turn-with-explanation.txt | 21 + openjury/prompts/prompt-multi-turn.txt | 22 + openjury/utils.py | 27 +- tests/test_generate_and_evaluate.py | 174 ++++++- 10 files changed, 1027 insertions(+), 48 deletions(-) create mode 100644 openjury/instruction_dataset/mt_bench.py create mode 100644 openjury/prompts/prompt-multi-turn-with-explanation.txt create mode 100644 openjury/prompts/prompt-multi-turn.txt diff --git a/README.md b/README.md index 8f0b7bf..91903e8 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ Compared to other libraries, here is a breakdown of features: | **Arena-Hard-Auto** | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | | **Lighteval** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | | **Evalchemy** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| **OpenJury** | 🔜 | ✅ | ✅ | ✅ | ✅ | ✅ | +| **OpenJury** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | The table has been done on Oct 2025, in case some libraries implemented missing features, please open an issue or send a PR, we 
will be happy to update the information. @@ -172,10 +172,29 @@ python openjury/generate_and_evaluate.py \ This override applies to all vLLM models in the run. For remote providers (OpenAI, Together, OpenRouter), the flag is ignored since they handle templates server-side. +### MT-Bench (Multi-Turn Evaluation) + +MT-Bench evaluates multi-turn conversation ability using 80 two-turn questions across 8 categories +(writing, roleplay, reasoning, math, coding, extraction, STEM, humanities). +It uses category-dependent judge prompts and reference answers for math/reasoning/coding. +Questions are automatically downloaded from the [LMSYS MT-Bench HuggingFace space](https://huggingface.co/spaces/lmsys/mt-bench). + +```bash +uv run python openjury/generate_and_evaluate.py \ + --dataset mt-bench \ + --model_A VLLM/Qwen/Qwen2.5-7B-Instruct \ + --model_B OpenRouter/openai/gpt-4o \ + --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \ + --n_instructions 10 +``` + +Results include per-category and per-turn win rate breakdowns. Use `--swap_mode both` to correct for judge position bias. 
+ ## 📊 Supported Datasets | Dataset | Description | |-----------------------|------------------------------------------------------------------------------------------------| +| `mt-bench` | 80 multi-turn (2-turn) questions across 8 categories ([LMSYS MT-Bench](https://arxiv.org/abs/2306.05685)) | | `alpaca-eval` | General instruction-following benchmark | | `arena-hard` | More challenging evaluation suite | | `m-arena-hard` | Translated version of Arena-Hard in 23 languages | diff --git a/openjury/evaluate.py b/openjury/evaluate.py index 39a5411..d31db67 100644 --- a/openjury/evaluate.py +++ b/openjury/evaluate.py @@ -15,6 +15,7 @@ data_root, download_hf, do_inference, + truncate, ) @@ -51,14 +52,22 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1): def load_judge_system_and_user_prompt( provide_explanation: bool = True, + multi_turn: bool = False, ) -> tuple[str, str]: # Prepare judge with open(Path(__file__).parent / "prompts" / "system-prompt.txt", "r") as f: system_prompt = str(f.read()) - prompt_filename = ( - "prompt-with-explanation.txt" if provide_explanation else "prompt.txt" - ) + if multi_turn: + prompt_filename = ( + "prompt-multi-turn-with-explanation.txt" + if provide_explanation + else "prompt-multi-turn.txt" + ) + else: + prompt_filename = ( + "prompt-with-explanation.txt" if provide_explanation else "prompt.txt" + ) with open(Path(__file__).parent / "prompts" / prompt_filename, "r") as f: user_prompt_template = str(f.read()) @@ -240,14 +249,6 @@ def annotate_battles( [("system", system_prompt), ("user", user_prompt_template)] ) - def truncate(s: str, max_len: int | None = None): - if not isinstance(s, str): - return "" - if max_len is not None: - return s[:max_len] - else: - return s - inputs = prompt_template.batch( [ { diff --git a/openjury/generate.py b/openjury/generate.py index 3fa749c..5c469ff 100644 --- a/openjury/generate.py +++ b/openjury/generate.py @@ -4,16 +4,10 @@ from openjury.utils import ( do_inference, 
make_model, + truncate, ) -def truncate(s: str, max_len: int | None = None): - if max_len is not None: - return s[:max_len] - else: - return s - - def generate_instructions( instructions: pd.Series, model: str, @@ -57,6 +51,92 @@ def generate_instructions( return df_outputs +def generate_multiturn( + questions: pd.DataFrame, + model: str, + truncate_input_chars: int | None = 8192, + max_tokens: int | None = 8192, + use_tqdm: bool = True, + **model_kwargs, +) -> pd.DataFrame: + """Generate two-turn completions for MT-Bench style questions. + + Generates turn 1 answers first, then uses them as conversation context + to generate turn 2 answers. + + Args: + questions: DataFrame with columns turn_1, turn_2, and index instruction_index. + model: Model specification string (e.g. "VLLM/model-name"). + **model_kwargs: Provider-specific options forwarded to make_model + (e.g. max_model_len, chat_template for VLLM). + Returns: + DataFrame with columns: instruction_index, completion_turn_1, completion_turn_2 + """ + chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs) + + system_prompt = "You are a helpful assistant." 
+ turn1_template = ChatPromptTemplate.from_messages( + [("system", system_prompt), ("user", "{user_prompt}")] + ) + + turn1_inputs = turn1_template.batch( + [ + {"user_prompt": truncate(row["turn_1"], max_len=truncate_input_chars)} + for _, row in questions.iterrows() + ] + ) + + print(f"Generating turn 1 completions ({len(turn1_inputs)} questions).") + completions_turn_1 = do_inference( + chat_model=chat_model, + inputs=turn1_inputs, + use_tqdm=use_tqdm, + ) + + turn2_inputs = [] + for (_, row), t1_answer in zip(questions.iterrows(), completions_turn_1): + if row["turn_2"] is None: + turn2_inputs.append( + turn1_template.invoke( + {"user_prompt": "No follow-up question."} + ) + ) + else: + multi_turn_template = ChatPromptTemplate.from_messages( + [ + ("system", system_prompt), + ("user", "{turn_1}"), + ("assistant", "{turn_1_answer}"), + ("user", "{turn_2}"), + ] + ) + turn2_inputs.append( + multi_turn_template.invoke( + { + "turn_1": truncate(row["turn_1"], max_len=truncate_input_chars), + "turn_1_answer": truncate(str(t1_answer), max_len=truncate_input_chars), + "turn_2": truncate(row["turn_2"], max_len=truncate_input_chars), + } + ) + ) + + print(f"Generating turn 2 completions ({len(turn2_inputs)} questions).") + completions_turn_2 = do_inference( + chat_model=chat_model, + inputs=turn2_inputs, + use_tqdm=use_tqdm, + ) + + df_outputs = pd.DataFrame( + data={ + "instruction_index": questions.index.tolist(), + "completion_turn_1": completions_turn_1, + "completion_turn_2": completions_turn_2, + }, + ) + return df_outputs + + def generate_base( instructions: pd.Series, model: str, diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index c8ea7e1..2428bd1 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -11,15 +11,20 @@ from functools import partial from pathlib import Path -import numpy as np import pandas as pd -from openjury.evaluate import annotate_battles, PairScore -from 
openjury.generate import generate_instructions, generate_base +from openjury.evaluate import ( + annotate_battles, + PairScore, + load_judge_system_and_user_prompt, +) +from openjury.generate import generate_instructions, generate_base, generate_multiturn from openjury.instruction_dataset import load_instructions -from openjury.utils import data_root, read_df, download_hf +from openjury.utils import data_root, read_df, download_hf, truncate from openjury.utils import make_model, cache_function_dataframe +NEED_REF_CATS = {"math", "reasoning", "coding"} + def try_load_dataset_completions( dataset: str, model: str, n_instructions: int | None @@ -75,6 +80,7 @@ class CliArgs: max_out_tokens_judge: int = 32768 max_model_len: int | None = None chat_template: str | None = None + mt_bench_turns: str = "both" result_folder: str = "results" @@ -195,6 +201,15 @@ def parse_args(cls): help="Jinja2 chat template string to use instead of the model's tokenizer template. " "If not provided, ChatML is used as fallback for models without a chat template.", ) + parser.add_argument( + "--mt_bench_turns", + type=str, + choices=["both", "single", "multi"], + default="both", + help="Which MT-Bench turns to evaluate. 
'single': only turn 1, " + "'multi': only turn 2 (with full conversation context), " + "'both' (default): evaluate both turns.", + ) args = parser.parse_args() return cls( @@ -212,6 +227,7 @@ def parse_args(cls): max_out_tokens_judge=args.max_out_tokens_judge, max_model_len=args.max_model_len, chat_template=args.chat_template, + mt_bench_turns=args.mt_bench_turns, result_folder=args.result_folder, ) @@ -237,9 +253,218 @@ def print_results(results): print(f" ✅ Wins: {results['num_wins']}") print(f" ❌ Losses: {results['num_losses']}") print(f" 🤝 Ties: {results['num_ties']}") + if "num_errors" in results: + print(f" ⚠️ Errors: {results['num_errors']}") + if "num_missing" in results: + print(f" ❓ Missing: {results['num_missing']}") + + per_category = results.get("per_category") + if per_category: + print("\nPer-Category Breakdown:") + print( + f" {'Category':<14} | {'Win Rate(A)':>11} | {'Wins':>4} | {'Losses':>6} | {'Ties':>4}" + ) + print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}") + for cat, stats in sorted(per_category.items()): + print( + f" {cat:<14} | {stats['winrate']:>10.1%} | " + f"{stats['num_wins']:>4} | {stats['num_losses']:>6} | {stats['num_ties']:>4}" + ) + + per_turn = results.get("per_turn") + if per_turn: + print("\nPer-Turn Breakdown:") + for turn, stats in sorted(per_turn.items()): + print( + f" Turn {turn} Win Rate(A): {stats['winrate']:.1%} " + f"(W:{stats['num_wins']} L:{stats['num_losses']} T:{stats['num_ties']})" + ) print("=" * 60 + "\n") +def _safe_text(value: object, truncate_chars: int | None) -> str: + if value is None or pd.isna(value): + return "" + return truncate(str(value), max_len=truncate_chars) + + +def format_mt_bench_for_evaluation( + questions: pd.DataFrame, + completions_A: pd.DataFrame, + completions_B: pd.DataFrame, + turns_mode: str, + truncate_input_chars: int | None, +) -> tuple[ + tuple[list[str], list[str], list[str], list[dict[str, object]]], + tuple[list[str], list[str], list[str], list[dict[str, 
object]]], +]: + """Flatten MT-Bench into per-turn instruction/completion battle inputs.""" + assert turns_mode in ("both", "single", "multi") + eval_single = turns_mode in ("both", "single") + eval_multi = turns_mode in ("both", "multi") + + instructions_turn_1: list[str] = [] + completions_a_turn_1: list[str] = [] + completions_b_turn_1: list[str] = [] + metadata_turn_1: list[dict[str, object]] = [] + + instructions_turn_2: list[str] = [] + completions_a_turn_2: list[str] = [] + completions_b_turn_2: list[str] = [] + metadata_turn_2: list[dict[str, object]] = [] + + for idx in questions.index: + row = questions.loc[idx] + comp_A_row = ( + completions_A.loc[idx] if idx in completions_A.index else completions_A.iloc[0] + ) + comp_B_row = ( + completions_B.loc[idx] if idx in completions_B.index else completions_B.iloc[0] + ) + category = row.get("category") + needs_ref = category in NEED_REF_CATS + + turn_1_question = _safe_text(row.get("turn_1"), truncate_input_chars) + turn_2_question = _safe_text(row.get("turn_2"), truncate_input_chars) + + answer_a_1 = _safe_text(comp_A_row.get("completion_turn_1", ""), truncate_input_chars) + answer_a_2 = _safe_text(comp_A_row.get("completion_turn_2", ""), truncate_input_chars) + answer_b_1 = _safe_text(comp_B_row.get("completion_turn_1", ""), truncate_input_chars) + answer_b_2 = _safe_text(comp_B_row.get("completion_turn_2", ""), truncate_input_chars) + + ref_1 = _safe_text(row.get("reference_turn_1"), truncate_input_chars) + ref_2 = _safe_text(row.get("reference_turn_2"), truncate_input_chars) + + if eval_single: + if needs_ref and ref_1: + instruction = ( + "[MT-Bench | Turn 1]\n" + "Use the reference answer for correctness checks.\n\n" + f"[Question]\n{turn_1_question}\n\n" + f"[Reference Answer]\n{ref_1}" + ) + else: + instruction = turn_1_question + + instructions_turn_1.append(instruction) + completions_a_turn_1.append(answer_a_1) + completions_b_turn_1.append(answer_b_1) + metadata_turn_1.append( + { + "question_id": 
idx, + "category": category, + "turn": 1, + } + ) + + if eval_multi and turn_2_question: + instruction_parts = [ + "Please focus on which assistant provides a better answer to the second user question." + ] + if needs_ref and (ref_1 or ref_2): + instruction_parts.extend( + [ + "<|The Start of Reference Answer|>", + "### User:", + turn_1_question, + "### Reference answer:", + ref_1, + "### User:", + turn_2_question, + "### Reference answer:", + ref_2, + "<|The End of Reference Answer|>", + ] + ) + + conversation_a = ( + "### User:\n" + f"{turn_1_question}\n\n" + "### Assistant:\n" + f"{answer_a_1}\n\n" + "### User:\n" + f"{turn_2_question}\n\n" + "### Assistant:\n" + f"{answer_a_2}" + ) + conversation_b = ( + "### User:\n" + f"{turn_1_question}\n\n" + "### Assistant:\n" + f"{answer_b_1}\n\n" + "### User:\n" + f"{turn_2_question}\n\n" + "### Assistant:\n" + f"{answer_b_2}" + ) + + instructions_turn_2.append("\n\n".join(instruction_parts)) + completions_a_turn_2.append(conversation_a) + completions_b_turn_2.append(conversation_b) + metadata_turn_2.append( + { + "question_id": idx, + "category": category, + "turn": 2, + } + ) + + return ( + ( + instructions_turn_1, + completions_a_turn_1, + completions_b_turn_1, + metadata_turn_1, + ), + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + metadata_turn_2, + ), + ) + + +def compute_preference_stats(prefs: pd.Series) -> dict: + """Derive win/loss/tie counts and winrate from a Series of preferences. + + Preference < 0.5 means model A wins, > 0.5 means model B wins, + exactly 0.5 is a tie. None/NaN values are counted as missing. 
+ """ + num_battles = len(prefs) + num_wins = int(sum(prefs < 0.5)) + num_losses = int(sum(prefs > 0.5)) + num_ties = int(sum(prefs == 0.5)) + num_missing = num_battles - (num_wins + num_losses + num_ties) + denom = num_wins + num_losses + num_ties + winrate = float((num_wins + 0.5 * num_ties) / denom) if denom else 0.0 + return { + "num_battles": num_battles, + "num_wins": num_wins, + "num_losses": num_losses, + "num_ties": num_ties, + "num_missing": num_missing, + "winrate": winrate, + } + + +def _compute_grouped_stats( + preferences: pd.Series, + metadata: list[dict[str, object]], + group_by: str, +) -> dict[object, dict[str, float | int]]: + grouped: dict[object, list[float]] = {} + for meta, pref in zip(metadata, preferences): + key = meta.get(group_by) + if key is None: + continue + grouped.setdefault(key, []).append(pref) + return { + key: compute_preference_stats(pd.Series(vals)) + for key, vals in grouped.items() + } + + def main(args: CliArgs): """ 1) take as input: @@ -261,6 +486,10 @@ def main(args: CliArgs): # set_langchain_cache() ignore_cache = args.ignore_cache + # MT-Bench has its own pipeline: multi-turn generation + category-aware judging + if args.dataset == "mt-bench": + return _run_mt_bench(args, ignore_cache) + # Currrently, we run context evaluation is_fluency_task = "fluency" in args.dataset if is_fluency_task: @@ -420,24 +649,13 @@ def main(args: CliArgs): ) prefs = pd.concat([prefs, (1 - prefs_reversed)]).reset_index(drop=True) - # compute and report statistics - num_wins = sum(prefs < 0.5) - num_losses = sum(prefs > 0.5) - num_ties = sum([1 if not x or x == 0.5 or x == np.nan else 0 for x in prefs]) - num_battles = len(prefs) - winrate = float((num_wins + 0.5 * num_ties) / (num_ties + num_wins + num_losses)) - + stats = compute_preference_stats(prefs) results = { "dataset": args.dataset, "model_A": args.model_A, "model_B": args.model_B, "judge_model": args.judge_model, - "num_battles": num_battles, - "winrate": winrate, - "num_wins": 
num_wins, - "num_losses": num_losses, - "num_ties": num_ties, - "num_missing": num_battles - (num_losses + num_wins + num_ties), + **stats, "preferences": prefs.tolist(), "date": str(datetime.now().isoformat()), "user": os.getenv("USER", ""), @@ -451,6 +669,237 @@ def main(args: CliArgs): return prefs +def _run_mt_bench(args: CliArgs, ignore_cache: bool): + """MT-Bench pipeline routed through score-based judging.""" + questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) + + print( + f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}." + ) + + gen_kwargs = dict( + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + + completions_A = cache_function_dataframe( + lambda: generate_multiturn( + questions=questions_df, + model=args.model_A, + use_tqdm=args.use_tqdm, + **gen_kwargs, + ), + ignore_cache=ignore_cache, + cache_name=f"mt-bench_{args.model_A}_{args.n_instructions}", + ).set_index("instruction_index") + + completions_B = cache_function_dataframe( + lambda: generate_multiturn( + questions=questions_df, + model=args.model_B, + use_tqdm=args.use_tqdm, + **gen_kwargs, + ), + ignore_cache=ignore_cache, + cache_name=f"mt-bench_{args.model_B}_{args.n_instructions}", + ).set_index("instruction_index") + + judge_chat_model = make_model( + model=args.judge_model, + max_tokens=args.max_out_tokens_judge, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + + turn_1_inputs, turn_2_inputs = format_mt_bench_for_evaluation( + questions=questions_df, + completions_A=completions_A, + completions_B=completions_B, + turns_mode=args.mt_bench_turns, + truncate_input_chars=args.truncate_all_input_chars, + ) + ( + instructions_turn_1, + completions_a_turn_1, + completions_b_turn_1, + metadata_turn_1, + ) = turn_1_inputs + ( + instructions_turn_2, + completions_a_turn_2, + 
completions_b_turn_2, + metadata_turn_2, + ) = turn_2_inputs + + score_parser = PairScore() + annotations = [] + metadata_for_annotations: list[dict[str, object]] = [] + annotations_reversed = [] + metadata_for_reversed_annotations: list[dict[str, object]] = [] + preference_parts: list[pd.Series] = [] + combined_metadata: list[dict[str, object]] = [] + + if args.swap_mode == "both": + print("Running reversed evaluation for position bias correction.") + + if instructions_turn_1: + annotations_turn_1 = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_1, + completions_A=completions_a_turn_1, + completions_B=completions_b_turn_1, + provide_explanation=args.provide_explanation, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + annotations.extend(annotations_turn_1) + metadata_for_annotations.extend(metadata_turn_1) + prefs_turn_1 = pd.Series( + [ + score_parser.parse_model_raw(annotation.judge_completion) + for annotation in annotations_turn_1 + ] + ) + preference_parts.append(prefs_turn_1) + combined_metadata.extend(metadata_turn_1) + + if args.swap_mode == "both": + annotations_turn_1_reversed = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_1, + completions_A=completions_b_turn_1, + completions_B=completions_a_turn_1, + provide_explanation=args.provide_explanation, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + annotations_reversed.extend(annotations_turn_1_reversed) + metadata_for_reversed_annotations.extend(metadata_turn_1) + prefs_turn_1_reversed = pd.Series( + [ + score_parser.parse_model_raw(annotation.judge_completion) + for annotation in annotations_turn_1_reversed + ] + ) + preference_parts.append(1 - prefs_turn_1_reversed) + combined_metadata.extend(metadata_turn_1) + + if instructions_turn_2: + ( + mt_system_prompt, + mt_user_prompt_template, + ) = load_judge_system_and_user_prompt( + 
provide_explanation=args.provide_explanation, + multi_turn=True, + ) + annotations_turn_2 = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_2, + completions_A=completions_a_turn_2, + completions_B=completions_b_turn_2, + provide_explanation=args.provide_explanation, + system_prompt=mt_system_prompt, + user_prompt_template=mt_user_prompt_template, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + annotations.extend(annotations_turn_2) + metadata_for_annotations.extend(metadata_turn_2) + prefs_turn_2 = pd.Series( + [ + score_parser.parse_model_raw(annotation.judge_completion) + for annotation in annotations_turn_2 + ] + ) + preference_parts.append(prefs_turn_2) + combined_metadata.extend(metadata_turn_2) + + if args.swap_mode == "both": + annotations_turn_2_reversed = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_2, + completions_A=completions_b_turn_2, + completions_B=completions_a_turn_2, + provide_explanation=args.provide_explanation, + system_prompt=mt_system_prompt, + user_prompt_template=mt_user_prompt_template, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + annotations_reversed.extend(annotations_turn_2_reversed) + metadata_for_reversed_annotations.extend(metadata_turn_2) + prefs_turn_2_reversed = pd.Series( + [ + score_parser.parse_model_raw(annotation.judge_completion) + for annotation in annotations_turn_2_reversed + ] + ) + preference_parts.append(1 - prefs_turn_2_reversed) + combined_metadata.extend(metadata_turn_2) + + prefs = ( + pd.concat(preference_parts).reset_index(drop=True) + if preference_parts + else pd.Series(dtype=float) + ) + + stats = compute_preference_stats(prefs) + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + **stats, + "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), + 
"per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), + "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + print_results(results) + + name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name += f"-{args.swap_mode}" + name = name.replace("/", "_") + + res_folder = Path(args.result_folder) / name + res_folder.mkdir(parents=True, exist_ok=True) + + with open(res_folder / f"args-{name}.json", "w") as f: + json.dump(asdict(args), f, indent=2) + + df = pd.DataFrame(annotations) + df["instruction_index"] = [meta["question_id"] for meta in metadata_for_annotations] + df["category"] = [meta["category"] for meta in metadata_for_annotations] + df["turn"] = [meta["turn"] for meta in metadata_for_annotations] + df["model_A"] = args.model_A + df["model_B"] = args.model_B + df["judge"] = args.judge_model + if args.swap_mode == "both": + df_reversed = pd.DataFrame(annotations_reversed) + df_reversed["instruction_index"] = [ + meta["question_id"] for meta in metadata_for_reversed_annotations + ] + df_reversed["category"] = [ + meta["category"] for meta in metadata_for_reversed_annotations + ] + df_reversed["turn"] = [ + meta["turn"] for meta in metadata_for_reversed_annotations + ] + df_reversed["model_A"] = args.model_B + df_reversed["model_B"] = args.model_A + df_reversed["judge"] = args.judge_model + df = pd.concat([df, df_reversed], ignore_index=True) + df.to_csv(res_folder / f"{name}-annotations.csv", index=False) + + with open(res_folder / f"results-{name}.json", "w") as f: + json.dump(results, f, indent=2) + + return prefs + + def cli(): args = CliArgs.parse_args() print(f"Running with CLI args: {args.__dict__}") diff --git a/openjury/instruction_dataset/__init__.py b/openjury/instruction_dataset/__init__.py index ac211e2..fc75155 100644 --- a/openjury/instruction_dataset/__init__.py +++ b/openjury/instruction_dataset/__init__.py @@ -4,7 +4,12 @@ def 
load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame: - if "m-arena-hard" in dataset: + if dataset == "mt-bench": + from openjury.instruction_dataset.mt_bench import load_mt_bench + + df_instructions = load_mt_bench() + + elif "m-arena-hard" in dataset: if dataset == "m-arena-hard": language = None else: diff --git a/openjury/instruction_dataset/mt_bench.py b/openjury/instruction_dataset/mt_bench.py new file mode 100644 index 0000000..3d5a44e --- /dev/null +++ b/openjury/instruction_dataset/mt_bench.py @@ -0,0 +1,203 @@ +import json +from pathlib import Path + +import pandas as pd +from huggingface_hub import snapshot_download + +from openjury.utils import data_root + +def _read_json_or_jsonl(path: Path) -> list[dict]: + if path.suffix == ".jsonl": + records = [] + with open(path, "r") as f: + for line in f: + line = line.strip() + if not line: + continue + records.append(json.loads(line)) + return records + elif path.suffix == ".json": + with open(path, "r") as f: + data = json.load(f) + if isinstance(data, list): + return data + raise ValueError(f"Expected a JSON list in {path}, got {type(data)}") + raise ValueError(f"Unsupported MT-Bench file format: {path}") + + +def _try_resolve_mt_bench_paths( + root: Path, +) -> tuple[Path | None, Path | None]: + """Find question.jsonl and reference_answer/*.jsonl under root.""" + question_candidates = [ + root / "data" / "mt_bench" / "question.jsonl", + root / "data" / "mt_bench" / "questions.jsonl", + ] + # FastChat stores reference answers inside a *directory* (one file per model). 
+ ref_dir_candidates = [ + root / "data" / "mt_bench" / "reference_answer", + ] + + question_path = next((p for p in question_candidates if p.exists()), None) + if question_path is None: + for p in root.rglob("question.jsonl"): + question_path = p + break + if question_path is None: + for p in root.rglob("questions.jsonl"): + question_path = p + break + + ref_path: Path | None = None + for d in ref_dir_candidates: + if d.is_dir(): + gpt4 = d / "gpt-4.jsonl" + if gpt4.exists(): + ref_path = gpt4 + else: + jsonl_files = sorted(d.glob("*.jsonl")) + if jsonl_files: + ref_path = jsonl_files[0] + break + + if ref_path is None: + for name in ("gpt-4.jsonl", "reference_answer.jsonl", "reference_answers.jsonl"): + for p in root.rglob(name): + ref_path = p + break + if ref_path is not None: + break + + return question_path, ref_path + + +def _extract_ref_turns(rec: dict) -> list[str] | None: + """Extract reference answer turns from either model-answer or flat format. + + FastChat reference_answer/gpt-4.jsonl uses: + {"choices": [{"turns": ["ans1", "ans2"]}]} + while question.jsonl inlines: + {"reference": ["hint1", "hint2"]} + """ + choices = rec.get("choices") + if isinstance(choices, list) and len(choices) > 0: + turns = choices[0].get("turns") + if isinstance(turns, list): + return turns + turns = rec.get("turns") + if isinstance(turns, list): + return turns + return None + + +def load_mt_bench() -> pd.DataFrame: + """Load MT-Bench questions (and reference answers when available). + + Downloads the MT-Bench HuggingFace space snapshot to `$OPENJURY_DATA/mt-bench/` + (or `~/openjury-data/mt-bench/`) and returns a DataFrame with at least: + - instruction_index (question id) + - category + - turn_1, turn_2 + - reference_turn_1, reference_turn_2 (may be missing/NaN) + """ + local_dir = data_root / "mt-bench" + try: + local_dir.mkdir(parents=True, exist_ok=True) + except PermissionError as e: + raise PermissionError( + f"Cannot create MT-Bench cache directory at {local_dir}. 
" + "Set environment variable OPENJURY_DATA to a writable location." + ) from e + + question_path, ref_path = _try_resolve_mt_bench_paths(local_dir) + if question_path is None: + try: + snapshot_download( + repo_id="lmsys/mt-bench", + repo_type="space", + allow_patterns=[ + "data/mt_bench/question.jsonl", + "data/mt_bench/reference_answer/*", + ], + local_dir=local_dir, + force_download=False, + ) + except Exception as e: + raise RuntimeError( + "Failed to download MT-Bench questions from HuggingFace space " + "'lmsys/mt-bench'. If you're in an offline / restricted-network " + "environment, pre-download the space snapshot and place the " + "questions file under " + f"{local_dir}/data/mt_bench/question.jsonl (and optionally " + "reference_answer/gpt-4.jsonl), or set OPENJURY_DATA to point " + "to that directory." + ) from e + question_path, ref_path = _try_resolve_mt_bench_paths(local_dir) + + if question_path is None: + raise FileNotFoundError( + "Could not locate MT-Bench questions after download. " + f"Searched under {local_dir}. " + "Expected a file like 'data/mt_bench/question.jsonl'." 
+ ) + + questions = _read_json_or_jsonl(question_path) + + # --- Load reference answers from the separate reference file (gpt-4.jsonl) --- + ref_by_id: dict[int | str, list[str]] = {} + if ref_path is not None: + for rec in _read_json_or_jsonl(ref_path): + qid = rec.get("question_id", rec.get("id")) + if qid is None: + continue + turns = _extract_ref_turns(rec) + if turns is not None: + ref_by_id[qid] = turns + + rows = [] + for rec in questions: + qid_raw = rec.get("question_id", rec.get("id")) + if qid_raw is None: + raise ValueError( + f"MT-Bench question record missing question_id/id: keys={list(rec.keys())}" + ) + try: + qid = int(qid_raw) + except Exception: + qid = qid_raw + + category = rec.get("category") + turns = rec.get("turns") + if isinstance(turns, list): + turn_1 = turns[0] if len(turns) > 0 else None + turn_2 = turns[1] if len(turns) > 1 else None + else: + turn_1 = rec.get("turn_1", rec.get("instruction")) + turn_2 = rec.get("turn_2") + + # Prefer the separate gpt-4 reference file; fall back to the inline + # "reference" field embedded in question.jsonl (short hints). 
+ ref_turns = ref_by_id.get(qid_raw) or ref_by_id.get(qid) + if ref_turns is None: + inline_ref = rec.get("reference") + if isinstance(inline_ref, list): + ref_turns = inline_ref + + ref_turn_1 = ref_turns[0] if isinstance(ref_turns, list) and len(ref_turns) > 0 else None + ref_turn_2 = ref_turns[1] if isinstance(ref_turns, list) and len(ref_turns) > 1 else None + + rows.append( + { + "instruction_index": qid, + "category": category, + "turn_1": turn_1, + "turn_2": turn_2, + "reference_turn_1": ref_turn_1, + "reference_turn_2": ref_turn_2, + "instruction": turn_1, + } + ) + + df = pd.DataFrame(rows) + return df + diff --git a/openjury/prompts/prompt-multi-turn-with-explanation.txt b/openjury/prompts/prompt-multi-turn-with-explanation.txt new file mode 100644 index 0000000..f3ea1c2 --- /dev/null +++ b/openjury/prompts/prompt-multi-turn-with-explanation.txt @@ -0,0 +1,21 @@ +<|User Prompt|> +{user_prompt} + +<|The Start of Assistant A's Conversation with User|> +{completion_A} +<|The End of Assistant A's Conversation with User|> + +<|The Start of Assistant B's Conversation with User|> +{completion_B} +<|The End of Assistant B's Conversation with User|> + +# Your output + +## Format description +Your output should follow this format: +``` +score_A: +score_B: +``` + +## Your output, do not repeat the input above, first starts with an explanation of your judgement diff --git a/openjury/prompts/prompt-multi-turn.txt b/openjury/prompts/prompt-multi-turn.txt new file mode 100644 index 0000000..85cc5b6 --- /dev/null +++ b/openjury/prompts/prompt-multi-turn.txt @@ -0,0 +1,22 @@ +<|User Prompt|> +{user_prompt} + +<|The Start of Assistant A's Conversation with User|> +{completion_A} +<|The End of Assistant A's Conversation with User|> + +<|The Start of Assistant B's Conversation with User|> +{completion_B} +<|The End of Assistant B's Conversation with User|> + +# Your output + +## Format description +Your output should follow this format: +``` +score_A: +score_B: +``` + +## 
Your output, do not repeat the input above +``` diff --git a/openjury/utils.py b/openjury/utils.py index d8b5440..6c7405a 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -42,6 +42,14 @@ def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame: return pd.read_parquet(filename, **pandas_kwargs) +def truncate(s: str, max_len: int | None = None) -> str: + if not isinstance(s, str): + return "" + if max_len is not None: + return s[:max_len] + return s + + def do_inference(chat_model, inputs, use_tqdm: bool = False): # Retries on rate-limit/server errors with exponential backoff. # Async path retries individual calls; batch path splits into 4^attempt chunks on failure. @@ -174,7 +182,11 @@ def _to_raw_text(self, input_item) -> str: return input_item if hasattr(input_item, "to_string"): return input_item.to_string() - if isinstance(input_item, list) and input_item and isinstance(input_item[0], dict): + if ( + isinstance(input_item, list) + and input_item + and isinstance(input_item[0], dict) + ): return "\n".join(msg["content"] for msg in input_item) raise ValueError(f"Cannot extract raw text from: {type(input_item)}") @@ -204,7 +216,9 @@ class ChatLlamaCppModel(BaseLocalModel): ``Llama.reset()`` between conversations to clear stale KV-cache state. """ - def __init__(self, model_path: str, max_tokens: int = 1024, n_ctx: int = 0, **kwargs): + def __init__( + self, model_path: str, max_tokens: int = 1024, n_ctx: int = 0, **kwargs + ): from llama_cpp import Llama self.model_path = model_path @@ -265,7 +279,13 @@ class ChatVLLM(BaseLocalModel): default chat template. 
""" - def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None = None, **vllm_kwargs): + def __init__( + self, + model: str, + max_tokens: int = 8192, + chat_template: str | None = None, + **vllm_kwargs, + ): from vllm import LLM, SamplingParams self.model_path = model @@ -277,6 +297,7 @@ def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None if max_model_len is not None: try: from transformers import AutoConfig + config = AutoConfig.from_pretrained(model, trust_remote_code=True) model_max_pos = getattr(config, "max_position_embeddings", None) if model_max_pos is not None and max_model_len > model_max_pos: diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 040b424..9c53916 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -10,26 +10,51 @@ @pytest.fixture(autouse=True) def mock_external_data_and_cache(monkeypatch): - instructions = pd.DataFrame( + single_turn_instructions = pd.DataFrame( { "instruction": [f"Synthetic instruction {i}" for i in range(20)], }, index=pd.Index(range(20), name="instruction_index"), ) + # Mix of general and NEED_REF_CATS categories to exercise both code paths. 
+ categories = ["writing", "math", "reasoning", "coding", "roleplay", + "writing", "math", "reasoning", "coding", "roleplay", + "writing", "math", "reasoning", "coding", "roleplay", + "writing", "math", "reasoning", "coding", "roleplay"] + ref_turn_1 = [ + f"Reference answer turn 1 for q{i}" if cat in ("math", "reasoning", "coding") else None + for i, cat in enumerate(categories) + ] + ref_turn_2 = [ + f"Reference answer turn 2 for q{i}" if cat in ("math", "reasoning", "coding") else None + for i, cat in enumerate(categories) + ] + mt_bench_questions = pd.DataFrame( + { + "category": categories, + "turn_1": [f"Synthetic MT-Bench turn 1 question {i}" for i in range(20)], + "turn_2": [f"Synthetic MT-Bench turn 2 follow-up {i}" for i in range(20)], + "reference_turn_1": ref_turn_1, + "reference_turn_2": ref_turn_2, + }, + index=pd.Index(range(20), name="instruction_index"), + ) + mt_bench_questions["instruction"] = mt_bench_questions["turn_1"] + + def _load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame: + df = mt_bench_questions if dataset == "mt-bench" else single_turn_instructions + return df.head(n_instructions) if n_instructions is not None else df + monkeypatch.setattr( generate_and_evaluate, "load_instructions", - lambda dataset, n_instructions=None: ( - instructions.head(n_instructions) - if n_instructions is not None - else instructions - ), + _load_instructions, ) monkeypatch.setattr( generate_and_evaluate, "load_contexts", - lambda dataset: instructions.loc[:, "instruction"], + lambda dataset: single_turn_instructions.loc[:, "instruction"], ) monkeypatch.setattr( @@ -86,4 +111,137 @@ def test_generate_and_evaluate_correct_order_bias(tmp_path): ) avg_pref = sum(prefs) / len(prefs) - assert avg_pref == 0.5 \ No newline at end of file + assert avg_pref == 0.5 + + +def test_format_mt_bench_turn_2_uses_conversation_blocks(): + questions = pd.DataFrame( + { + "category": ["math", "writing"], + "turn_1": ["Math question turn 1", 
"Writing question turn 1"], + "turn_2": ["Math question turn 2", "Writing question turn 2"], + "reference_turn_1": ["Math reference turn 1", None], + "reference_turn_2": ["Math reference turn 2", None], + }, + index=pd.Index([0, 1], name="instruction_index"), + ) + completions_a = pd.DataFrame( + { + "completion_turn_1": ["A1 math", "A1 writing"], + "completion_turn_2": ["A2 math", "A2 writing"], + }, + index=pd.Index([0, 1], name="instruction_index"), + ) + completions_b = pd.DataFrame( + { + "completion_turn_1": ["B1 math", "B1 writing"], + "completion_turn_2": ["B2 math", "B2 writing"], + }, + index=pd.Index([0, 1], name="instruction_index"), + ) + + turn_1_inputs, turn_2_inputs = generate_and_evaluate.format_mt_bench_for_evaluation( + questions=questions, + completions_A=completions_a, + completions_B=completions_b, + turns_mode="both", + truncate_input_chars=8192, + ) + ( + instructions_turn_1, + _completions_a_turn_1, + _completions_b_turn_1, + _metadata_turn_1, + ) = turn_1_inputs + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + _metadata_turn_2, + ) = turn_2_inputs + + assert "Please focus on which assistant provides a better answer to the second user question." 
in instructions_turn_2[0] + assert "<|The Start of Reference Answer|>" in instructions_turn_2[0] + assert "Math reference turn 1" in instructions_turn_2[0] + assert "Math reference turn 2" in instructions_turn_2[0] + assert "<|The Start of Reference Answer|>" not in instructions_turn_2[1] + + assert "### User:\nMath question turn 1" in completions_a_turn_2[0] + assert "### Assistant:\nA1 math" in completions_a_turn_2[0] + assert "### User:\nMath question turn 2" in completions_a_turn_2[0] + assert "### Assistant:\nA2 math" in completions_a_turn_2[0] + + assert "### User:\nMath question turn 1" in completions_b_turn_2[0] + assert "### Assistant:\nB1 math" in completions_b_turn_2[0] + assert "### User:\nMath question turn 2" in completions_b_turn_2[0] + assert "### Assistant:\nB2 math" in completions_b_turn_2[0] + + assert instructions_turn_1[1] == "Writing question turn 1" + assert "[MT-Bench | Turn 1]" in instructions_turn_1[0] + + +def test_mt_bench_pairwise(): + """Test MT-Bench pipeline through score-based parsing.""" + prefs = main_generate_and_eval( + CliArgs( + dataset="mt-bench", + model_A="Dummy/answer for turn 1 and turn 2", + model_B="Dummy/another answer", + judge_model="Dummy/score A: 10 score B: 0", + n_instructions=5, + ) + ) + + assert all(p < 0.5 for p in prefs) + assert len(prefs) == 10 # two turns per question + + +def test_mt_bench_swap_mode(): + """Test that MT-Bench swap mode doubles the annotations and corrects bias.""" + prefs = main_generate_and_eval( + CliArgs( + dataset="mt-bench", + model_A="Dummy/answer A", + model_B="Dummy/answer B", + judge_model="Dummy/score A: 10 score B: 0", + n_instructions=3, + swap_mode="both", + ) + ) + + assert len(prefs) == 12 # (3 questions * 2 turns) * 2 swap directions + assert float(sum(prefs) / len(prefs)) == pytest.approx(0.5) + + +def test_mt_bench_single_turn_only(): + """Test MT-Bench single-turn-only evaluation (--mt_bench_turns single).""" + prefs = main_generate_and_eval( + CliArgs( + 
dataset="mt-bench", + model_A="Dummy/answer A", + model_B="Dummy/answer B", + judge_model="Dummy/score A: 10 score B: 0", + n_instructions=5, + mt_bench_turns="single", + ) + ) + + assert all(p < 0.5 for p in prefs) + assert len(prefs) == 5 # one annotation per question, turn 1 only + + +def test_mt_bench_multi_turn_only(): + """Test MT-Bench multi-turn-only evaluation (--mt_bench_turns multi).""" + prefs = main_generate_and_eval( + CliArgs( + dataset="mt-bench", + model_A="Dummy/answer A", + model_B="Dummy/answer B", + judge_model="Dummy/score A: 0 score B: 10", + n_instructions=5, + mt_bench_turns="multi", + ) + ) + + assert all(p > 0.5 for p in prefs) + assert len(prefs) == 5 # one annotation per question, turn 2 only \ No newline at end of file From 14f747ecdcd833db1f0551a782c95a7a33c681aa Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 2 Mar 2026 21:53:50 +0100 Subject: [PATCH 21/35] fix result formatting --- openjury/generate_and_evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 17767c8..b6e6c0d 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -268,7 +268,7 @@ def print_results(results): print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}") for cat, stats in sorted(per_category.items()): print( - f" {cat:<14} | {stats['winrate']:>10.1%} | " + f" {cat:<14} | {stats['winrate']:>11.1%} | " f"{stats['num_wins']:>4} | {stats['num_losses']:>6} | {stats['num_ties']:>4}" ) From e67ea795984c6bffb693f3005c880d0c04163bc0 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 2 Mar 2026 21:55:39 +0100 Subject: [PATCH 22/35] remove double environment variable --- openjury/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openjury/utils.py b/openjury/utils.py index 8f4ab2c..63d2778 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -14,7 +14,6 @@ data_root = Path( 
os.environ.get("OPENJURY_DATA", Path("~/openjury-data/").expanduser()) - os.environ.get("OPENJURY_DATA", Path("~/openjury-data/").expanduser()) ).expanduser() From 4089be864d75ca00cd613015c426c1050a6a4f3b Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 2 Mar 2026 23:53:24 +0100 Subject: [PATCH 23/35] remove accidental duplications --- openjury/generate_and_evaluate.py | 1 - openjury/utils.py | 3 --- tests/test_generate_and_evaluate.py | 3 --- 3 files changed, 7 deletions(-) diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index b6e6c0d..6e5bbce 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -79,7 +79,6 @@ class CliArgs: max_out_tokens_models: int = 32768 max_out_tokens_judge: int = 32768 max_model_len: int | None = None - max_model_len: int | None = None chat_template: str | None = None mt_bench_turns: str = "both" diff --git a/openjury/utils.py b/openjury/utils.py index 63d2778..6c7405a 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -385,8 +385,6 @@ def make_model(model: str, max_tokens: int | None = 8192, **kwargs): # Use our custom ChatVLLM wrapper which properly applies chat templates if model_provider == "VLLM": - chat_template = kwargs.pop("chat_template", None) - vllm_kwargs = {k: v for k, v in kwargs.items() if v is not None} chat_template = kwargs.pop("chat_template", None) vllm_kwargs = {k: v for k, v in kwargs.items() if v is not None} return ChatVLLM( @@ -394,7 +392,6 @@ def make_model(model: str, max_tokens: int | None = 8192, **kwargs): max_tokens=max_tokens if max_tokens else 8192, chat_template=chat_template, **vllm_kwargs, - **vllm_kwargs, ) model_kwargs = {} diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 8158617..9c53916 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -1,9 +1,6 @@ import pandas as pd -import pandas as pd import pytest -import 
openjury.generate_and_evaluate as generate_and_evaluate - import openjury.generate_and_evaluate as generate_and_evaluate from openjury.generate_and_evaluate import ( main as main_generate_and_eval, From 03f5cceb86a8d481b55f132b989193830d405ef3 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Wed, 4 Mar 2026 15:25:53 +0100 Subject: [PATCH 24/35] Refactor - Implemented a new function to download MT-Bench questions and GPT-4 reference answers, with fallback mechanisms for missing references. - Remove duplication. --- openjury/generate_and_evaluate.py | 251 +++++++++++++---------- openjury/instruction_dataset/mt_bench.py | 205 ++++++++---------- openjury/utils.py | 16 +- tests/test_generate_and_evaluate.py | 50 ++++- tests/test_mt_bench_downloads.py | 66 ++++++ 5 files changed, 345 insertions(+), 243 deletions(-) create mode 100644 tests/test_mt_bench_downloads.py diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 6e5bbce..60619f6 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -253,9 +253,7 @@ def print_results(results): print(f" ✅ Wins: {results['num_wins']}") print(f" ❌ Losses: {results['num_losses']}") print(f" 🤝 Ties: {results['num_ties']}") - if "num_errors" in results: - print(f" ⚠️ Errors: {results['num_errors']}") - if "num_missing" in results: + if results.get("num_missing", 0) > 0: print(f" ❓ Missing: {results['num_missing']}") per_category = results.get("per_category") @@ -465,6 +463,92 @@ def _compute_grouped_stats( } +def _parse_preferences_from_annotations( + annotations: list, + score_parser: PairScore, +) -> pd.Series: + return pd.Series( + [ + score_parser.parse_model_raw(annotation.judge_completion) + for annotation in annotations + ] + ) + + +def _judge_turn( + *, + judge_chat_model, + instructions: list[str], + completions_A: list[str], + completions_B: list[str], + metadata: list[dict[str, object]], + score_parser: PairScore, + provide_explanation: bool, + swap_mode: 
str, + truncate_input_chars: int | None, + use_tqdm: bool, + system_prompt: str | None = None, + user_prompt_template: str | None = None, +) -> tuple[ + list, + list, + list[dict[str, object]], + list[dict[str, object]], + pd.Series, + list[dict[str, object]], +]: + if not instructions: + return [], [], [], [], pd.Series(dtype=float), [] + + annotations = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions, + completions_A=completions_A, + completions_B=completions_B, + provide_explanation=provide_explanation, + system_prompt=system_prompt, + user_prompt_template=user_prompt_template, + truncate_input_chars=truncate_input_chars, + use_tqdm=use_tqdm, + ) + preference_parts = [_parse_preferences_from_annotations(annotations, score_parser)] + + annotations_reversed: list = [] + metadata_for_reversed_annotations: list[dict[str, object]] = [] + combined_metadata = list(metadata) + + if swap_mode == "both": + print("Correction for judge bias towards a certain model position is set.") + print("Evaluating completions with models reversed.") + annotations_reversed = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions, + completions_A=completions_B, + completions_B=completions_A, + provide_explanation=provide_explanation, + system_prompt=system_prompt, + user_prompt_template=user_prompt_template, + truncate_input_chars=truncate_input_chars, + use_tqdm=use_tqdm, + ) + prefs_reversed = _parse_preferences_from_annotations( + annotations_reversed, score_parser + ) + preference_parts.append(1 - prefs_reversed) + metadata_for_reversed_annotations = list(metadata) + combined_metadata.extend(metadata) + + preferences = pd.concat(preference_parts).reset_index(drop=True) + return ( + annotations, + annotations_reversed, + list(metadata), + metadata_for_reversed_annotations, + preferences, + combined_metadata, + ) + + def main(args: CliArgs): """ 1) take as input: @@ -574,33 +658,32 @@ def main(args: CliArgs): # the default 
system prompt of annotate is to compare instruction tuned models. system_prompt = None - annotations = annotate_battles( + + instruction_subset = instructions.head(n_instructions) + instruction_indices = instruction_subset.index.tolist() + metadata = [{"instruction_index": idx} for idx in instruction_indices] + score_parser = PairScore() + ( + annotations, + annotations_reversed, + metadata_for_annotations, + metadata_for_reversed_annotations, + prefs, + _combined_metadata, + ) = _judge_turn( judge_chat_model=judge_chat_model, - instructions=instructions.head(n_instructions).tolist(), + instructions=instruction_subset.tolist(), completions_A=completions_A.head(n_instructions).tolist(), completions_B=completions_B.head(n_instructions).tolist(), + metadata=metadata, + score_parser=score_parser, provide_explanation=args.provide_explanation, - system_prompt=system_prompt, + swap_mode=args.swap_mode, truncate_input_chars=args.truncate_all_input_chars, use_tqdm=args.use_tqdm, + system_prompt=system_prompt, ) - if args.swap_mode == "both": - print("Correction for judge bias towards a certain model position is set.") - print( - f"Evaluating completions with models reversed with judge {args.judge_model}." 
- ) - annotations_reversed = annotate_battles( - judge_chat_model=judge_chat_model, - instructions=instructions.head(n_instructions).tolist(), - completions_A=completions_B.head(n_instructions).tolist(), - completions_B=completions_A.head(n_instructions).tolist(), - provide_explanation=args.provide_explanation, - system_prompt=system_prompt, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - ) - name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" name += f"-{args.swap_mode}" name = name.replace("/", "_") @@ -614,16 +697,18 @@ def main(args: CliArgs): print(f"Saving results to {res_folder}") df = pd.DataFrame(annotations) - df["instruction_index"] = instructions.head(n_instructions).index.tolist() + df["instruction_index"] = [ + meta["instruction_index"] for meta in metadata_for_annotations + ] df["model_A"] = args.model_A df["model_B"] = args.model_B df["judge"] = args.judge_model if args.swap_mode == "both": df_reversed = pd.DataFrame(annotations_reversed) - df_reversed["instruction_index"] = instructions.head( - n_instructions - ).index.tolist() + df_reversed["instruction_index"] = [ + meta["instruction_index"] for meta in metadata_for_reversed_annotations + ] df_reversed["model_A"] = args.model_B df_reversed["model_B"] = args.model_A df_reversed["judge"] = args.judge_model @@ -631,24 +716,6 @@ def main(args: CliArgs): df.to_csv(res_folder / f"{name}-annotations.csv", index=False) - # compute preferences between A and B - score_parser = PairScore() - prefs = pd.Series( - [ - score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations - ] - ) - - if args.swap_mode == "both": - prefs_reversed = pd.Series( - [ - score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations_reversed - ] - ) - prefs = pd.concat([prefs, (1 - prefs_reversed)]).reset_index(drop=True) - stats = compute_preference_stats(prefs) results = { "dataset": args.dataset, @@ -745,46 
+812,33 @@ def _run_mt_bench(args: CliArgs, ignore_cache: bool): print("Running reversed evaluation for position bias correction.") if instructions_turn_1: - annotations_turn_1 = annotate_battles( + ( + annotations_turn_1, + annotations_turn_1_reversed, + metadata_turn_1_for_annotations, + metadata_turn_1_for_reversed_annotations, + prefs_turn_1, + combined_metadata_turn_1, + ) = _judge_turn( judge_chat_model=judge_chat_model, instructions=instructions_turn_1, completions_A=completions_a_turn_1, completions_B=completions_b_turn_1, + metadata=metadata_turn_1, + score_parser=score_parser, provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, truncate_input_chars=args.truncate_all_input_chars, use_tqdm=args.use_tqdm, ) annotations.extend(annotations_turn_1) - metadata_for_annotations.extend(metadata_turn_1) - prefs_turn_1 = pd.Series( - [ - score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations_turn_1 - ] + annotations_reversed.extend(annotations_turn_1_reversed) + metadata_for_annotations.extend(metadata_turn_1_for_annotations) + metadata_for_reversed_annotations.extend( + metadata_turn_1_for_reversed_annotations ) preference_parts.append(prefs_turn_1) - combined_metadata.extend(metadata_turn_1) - - if args.swap_mode == "both": - annotations_turn_1_reversed = annotate_battles( - judge_chat_model=judge_chat_model, - instructions=instructions_turn_1, - completions_A=completions_b_turn_1, - completions_B=completions_a_turn_1, - provide_explanation=args.provide_explanation, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - ) - annotations_reversed.extend(annotations_turn_1_reversed) - metadata_for_reversed_annotations.extend(metadata_turn_1) - prefs_turn_1_reversed = pd.Series( - [ - score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations_turn_1_reversed - ] - ) - preference_parts.append(1 - prefs_turn_1_reversed) - 
combined_metadata.extend(metadata_turn_1) + combined_metadata.extend(combined_metadata_turn_1) if instructions_turn_2: ( @@ -794,50 +848,35 @@ def _run_mt_bench(args: CliArgs, ignore_cache: bool): provide_explanation=args.provide_explanation, multi_turn=True, ) - annotations_turn_2 = annotate_battles( + ( + annotations_turn_2, + annotations_turn_2_reversed, + metadata_turn_2_for_annotations, + metadata_turn_2_for_reversed_annotations, + prefs_turn_2, + combined_metadata_turn_2, + ) = _judge_turn( judge_chat_model=judge_chat_model, instructions=instructions_turn_2, completions_A=completions_a_turn_2, completions_B=completions_b_turn_2, + metadata=metadata_turn_2, + score_parser=score_parser, provide_explanation=args.provide_explanation, - system_prompt=mt_system_prompt, - user_prompt_template=mt_user_prompt_template, + swap_mode=args.swap_mode, truncate_input_chars=args.truncate_all_input_chars, use_tqdm=args.use_tqdm, + system_prompt=mt_system_prompt, + user_prompt_template=mt_user_prompt_template, ) annotations.extend(annotations_turn_2) - metadata_for_annotations.extend(metadata_turn_2) - prefs_turn_2 = pd.Series( - [ - score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations_turn_2 - ] + annotations_reversed.extend(annotations_turn_2_reversed) + metadata_for_annotations.extend(metadata_turn_2_for_annotations) + metadata_for_reversed_annotations.extend( + metadata_turn_2_for_reversed_annotations ) preference_parts.append(prefs_turn_2) - combined_metadata.extend(metadata_turn_2) - - if args.swap_mode == "both": - annotations_turn_2_reversed = annotate_battles( - judge_chat_model=judge_chat_model, - instructions=instructions_turn_2, - completions_A=completions_b_turn_2, - completions_B=completions_a_turn_2, - provide_explanation=args.provide_explanation, - system_prompt=mt_system_prompt, - user_prompt_template=mt_user_prompt_template, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - ) - 
annotations_reversed.extend(annotations_turn_2_reversed) - metadata_for_reversed_annotations.extend(metadata_turn_2) - prefs_turn_2_reversed = pd.Series( - [ - score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations_turn_2_reversed - ] - ) - preference_parts.append(1 - prefs_turn_2_reversed) - combined_metadata.extend(metadata_turn_2) + combined_metadata.extend(combined_metadata_turn_2) prefs = ( pd.concat(preference_parts).reset_index(drop=True) diff --git a/openjury/instruction_dataset/mt_bench.py b/openjury/instruction_dataset/mt_bench.py index 3d5a44e..910a045 100644 --- a/openjury/instruction_dataset/mt_bench.py +++ b/openjury/instruction_dataset/mt_bench.py @@ -1,106 +1,39 @@ -import json from pathlib import Path +from urllib.request import urlretrieve +import warnings import pandas as pd from huggingface_hub import snapshot_download from openjury.utils import data_root -def _read_json_or_jsonl(path: Path) -> list[dict]: - if path.suffix == ".jsonl": - records = [] - with open(path, "r") as f: - for line in f: - line = line.strip() - if not line: - continue - records.append(json.loads(line)) - return records - elif path.suffix == ".json": - with open(path, "r") as f: - data = json.load(f) - if isinstance(data, list): - return data - raise ValueError(f"Expected a JSON list in {path}, got {type(data)}") - raise ValueError(f"Unsupported MT-Bench file format: {path}") - - -def _try_resolve_mt_bench_paths( - root: Path, -) -> tuple[Path | None, Path | None]: - """Find question.jsonl and reference_answer/*.jsonl under root.""" - question_candidates = [ - root / "data" / "mt_bench" / "question.jsonl", - root / "data" / "mt_bench" / "questions.jsonl", - ] - # FastChat stores reference answers inside a *directory* (one file per model). 
- ref_dir_candidates = [ - root / "data" / "mt_bench" / "reference_answer", - ] - - question_path = next((p for p in question_candidates if p.exists()), None) - if question_path is None: - for p in root.rglob("question.jsonl"): - question_path = p - break - if question_path is None: - for p in root.rglob("questions.jsonl"): - question_path = p - break - - ref_path: Path | None = None - for d in ref_dir_candidates: - if d.is_dir(): - gpt4 = d / "gpt-4.jsonl" - if gpt4.exists(): - ref_path = gpt4 - else: - jsonl_files = sorted(d.glob("*.jsonl")) - if jsonl_files: - ref_path = jsonl_files[0] - break - - if ref_path is None: - for name in ("gpt-4.jsonl", "reference_answer.jsonl", "reference_answers.jsonl"): - for p in root.rglob(name): - ref_path = p - break - if ref_path is not None: - break - - return question_path, ref_path - - -def _extract_ref_turns(rec: dict) -> list[str] | None: - """Extract reference answer turns from either model-answer or flat format. - - FastChat reference_answer/gpt-4.jsonl uses: - {"choices": [{"turns": ["ans1", "ans2"]}]} - while question.jsonl inlines: - {"reference": ["hint1", "hint2"]} - """ - choices = rec.get("choices") - if isinstance(choices, list) and len(choices) > 0: - turns = choices[0].get("turns") - if isinstance(turns, list): - return turns - turns = rec.get("turns") - if isinstance(turns, list): - return turns - return None +FASTCHAT_GPT4_REFERENCE_URL = ( + "https://raw.githubusercontent.com/lm-sys/FastChat/main/" + "fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl" +) + +def _download_gpt4_references(local_dir: Path) -> Path | None: + reference_dir = local_dir / "reference_answer" + reference_dir.mkdir(parents=True, exist_ok=True) + gpt4_reference_path = reference_dir / "gpt-4.jsonl" + if gpt4_reference_path.exists(): + return gpt4_reference_path + try: + urlretrieve(FASTCHAT_GPT4_REFERENCE_URL, gpt4_reference_path) + except Exception as e: + warnings.warn( + "Could not download MT-Bench GPT-4 reference 
answers from FastChat. " + f"Falling back to inline references from question.jsonl: {e}", + RuntimeWarning, + ) + return None + return gpt4_reference_path -def load_mt_bench() -> pd.DataFrame: - """Load MT-Bench questions (and reference answers when available). - - Downloads the MT-Bench HuggingFace space snapshot to `$OPENJURY_DATA/mt-bench/` - (or `~/openjury-data/mt-bench/`) and returns a DataFrame with at least: - - instruction_index (question id) - - category - - turn_1, turn_2 - - reference_turn_1, reference_turn_2 (may be missing/NaN) - """ - local_dir = data_root / "mt-bench" +def download_mt_bench(local_dir: Path | None = None) -> tuple[Path, Path | None]: + """Download MT-Bench questions and GPT-4 references if missing.""" + if local_dir is None: + local_dir = data_root / "mt-bench" try: local_dir.mkdir(parents=True, exist_ok=True) except PermissionError as e: @@ -109,15 +42,14 @@ def load_mt_bench() -> pd.DataFrame: "Set environment variable OPENJURY_DATA to a writable location." ) from e - question_path, ref_path = _try_resolve_mt_bench_paths(local_dir) - if question_path is None: + question_path = local_dir / "data" / "mt_bench" / "question.jsonl" + if not question_path.exists(): try: snapshot_download( repo_id="lmsys/mt-bench", repo_type="space", allow_patterns=[ "data/mt_bench/question.jsonl", - "data/mt_bench/reference_answer/*", ], local_dir=local_dir, force_download=False, @@ -127,32 +59,62 @@ def load_mt_bench() -> pd.DataFrame: "Failed to download MT-Bench questions from HuggingFace space " "'lmsys/mt-bench'. If you're in an offline / restricted-network " "environment, pre-download the space snapshot and place the " - "questions file under " - f"{local_dir}/data/mt_bench/question.jsonl (and optionally " - "reference_answer/gpt-4.jsonl), or set OPENJURY_DATA to point " - "to that directory." + f"questions file at {question_path}, or set OPENJURY_DATA to " + "point to that directory." 
) from e - question_path, ref_path = _try_resolve_mt_bench_paths(local_dir) - - if question_path is None: + if not question_path.exists(): raise FileNotFoundError( "Could not locate MT-Bench questions after download. " - f"Searched under {local_dir}. " - "Expected a file like 'data/mt_bench/question.jsonl'." + f"Expected file at {question_path}." ) - questions = _read_json_or_jsonl(question_path) + gpt4_reference_path = _download_gpt4_references(local_dir) + return question_path, gpt4_reference_path + + +def load_mt_bench() -> pd.DataFrame: + """Load MT-Bench questions and reference answers. + + Downloads MT-Bench questions from the HuggingFace LMSYS space and tries to + load GPT-4 references from FastChat GitHub. If GPT-4 references cannot be + downloaded or parsed, falls back to inline references from question.jsonl. + """ + question_path, ref_path = download_mt_bench() + + questions = pd.read_json(question_path, lines=True).to_dict(orient="records") - # --- Load reference answers from the separate reference file (gpt-4.jsonl) --- ref_by_id: dict[int | str, list[str]] = {} + use_inline_reference_fallback = ref_path is None if ref_path is not None: - for rec in _read_json_or_jsonl(ref_path): - qid = rec.get("question_id", rec.get("id")) - if qid is None: - continue - turns = _extract_ref_turns(rec) - if turns is not None: + try: + reference_records = pd.read_json(ref_path, lines=True).to_dict( + orient="records" + ) + for rec in reference_records: + qid = rec.get("question_id", rec.get("id")) + if qid is None: + continue + choices = rec.get("choices") + if not (isinstance(choices, list) and choices): + continue + first_choice = choices[0] + if not isinstance(first_choice, dict): + continue + turns = first_choice.get("turns") + if not isinstance(turns, list): + continue ref_by_id[qid] = turns + try: + ref_by_id[int(qid)] = turns + except Exception: + pass + except Exception as e: + warnings.warn( + "Failed to parse GPT-4 reference answers from FastChat. 
" + f"Falling back to inline references from question.jsonl: {e}", + RuntimeWarning, + ) + use_inline_reference_fallback = True rows = [] for rec in questions: @@ -175,16 +137,18 @@ def load_mt_bench() -> pd.DataFrame: turn_1 = rec.get("turn_1", rec.get("instruction")) turn_2 = rec.get("turn_2") - # Prefer the separate gpt-4 reference file; fall back to the inline - # "reference" field embedded in question.jsonl (short hints). ref_turns = ref_by_id.get(qid_raw) or ref_by_id.get(qid) - if ref_turns is None: + if ref_turns is None and use_inline_reference_fallback: inline_ref = rec.get("reference") if isinstance(inline_ref, list): ref_turns = inline_ref - ref_turn_1 = ref_turns[0] if isinstance(ref_turns, list) and len(ref_turns) > 0 else None - ref_turn_2 = ref_turns[1] if isinstance(ref_turns, list) and len(ref_turns) > 1 else None + ref_turn_1 = ( + ref_turns[0] if isinstance(ref_turns, list) and len(ref_turns) > 0 else None + ) + ref_turn_2 = ( + ref_turns[1] if isinstance(ref_turns, list) and len(ref_turns) > 1 else None + ) rows.append( { @@ -198,6 +162,5 @@ def load_mt_bench() -> pd.DataFrame: } ) - df = pd.DataFrame(rows) - return df + return pd.DataFrame(rows) diff --git a/openjury/utils.py b/openjury/utils.py index 6c7405a..1ffedc0 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -436,23 +436,13 @@ def make_model(model: str, max_tokens: int | None = 8192, **kwargs): def download_all(): + from openjury.instruction_dataset.mt_bench import download_mt_bench + print(f"Downloading all dataset in {data_root}") for dataset in ["alpaca-eval", "arena-hard", "m-arena-hard"]: local_path_tables = data_root / "tables" download_hf(name=dataset, local_path=local_path_tables) - # MT-Bench questions live in the LMSYS HuggingFace space. 
- snapshot_download( - repo_id="lmsys/mt-bench", - repo_type="space", - allow_patterns=[ - "data/mt_bench/question.jsonl", - "data/mt_bench/reference_answer/*", - ], - local_dir=data_root / "mt-bench", - force_download=False, - ) - snapshot_download( repo_id="geoalgo/multilingual-contexts-to-be-completed", repo_type="dataset", @@ -461,6 +451,8 @@ def download_all(): force_download=False, ) + download_mt_bench() + class Timeblock: """Timer context manager""" diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 9c53916..d58460e 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -114,6 +114,44 @@ def test_generate_and_evaluate_correct_order_bias(tmp_path): assert avg_pref == 0.5 +def test_main_non_mt_bench_reuses_judge_turn(monkeypatch, tmp_path): + captured = {"calls": 0, "kwargs": None} + + def _judge_turn_stub(**kwargs): + captured["calls"] += 1 + captured["kwargs"] = kwargs + return ( + [{"judge_completion": "score A: 0 score B: 10"}], + [], + [{"instruction_index": 0}], + [], + pd.Series([1.0]), + [{"instruction_index": 0}], + ) + + monkeypatch.setattr( + generate_and_evaluate, + "_judge_turn", + _judge_turn_stub, + ) + + prefs = main_generate_and_eval( + CliArgs( + dataset="alpaca-eval", + model_A="Dummy/no answer", + model_B="Dummy/open is better than close isnt'it", + judge_model="Dummy/score A: 0 score B: 10", + n_instructions=1, + result_folder=str(tmp_path), + ) + ) + + assert captured["calls"] == 1 + assert captured["kwargs"]["swap_mode"] == "fixed" + assert captured["kwargs"]["metadata"] == [{"instruction_index": 0}] + assert prefs.tolist() == [1.0] + + def test_format_mt_bench_turn_2_uses_conversation_blocks(): questions = pd.DataFrame( { @@ -180,7 +218,7 @@ def test_format_mt_bench_turn_2_uses_conversation_blocks(): assert "[MT-Bench | Turn 1]" in instructions_turn_1[0] -def test_mt_bench_pairwise(): +def test_mt_bench_pairwise(tmp_path): """Test MT-Bench pipeline through 
score-based parsing.""" prefs = main_generate_and_eval( CliArgs( @@ -189,6 +227,7 @@ def test_mt_bench_pairwise(): model_B="Dummy/another answer", judge_model="Dummy/score A: 10 score B: 0", n_instructions=5, + result_folder=str(tmp_path), ) ) @@ -196,7 +235,7 @@ def test_mt_bench_pairwise(): assert len(prefs) == 10 # two turns per question -def test_mt_bench_swap_mode(): +def test_mt_bench_swap_mode(tmp_path): """Test that MT-Bench swap mode doubles the annotations and corrects bias.""" prefs = main_generate_and_eval( CliArgs( @@ -206,6 +245,7 @@ def test_mt_bench_swap_mode(): judge_model="Dummy/score A: 10 score B: 0", n_instructions=3, swap_mode="both", + result_folder=str(tmp_path), ) ) @@ -213,7 +253,7 @@ def test_mt_bench_swap_mode(): assert float(sum(prefs) / len(prefs)) == pytest.approx(0.5) -def test_mt_bench_single_turn_only(): +def test_mt_bench_single_turn_only(tmp_path): """Test MT-Bench single-turn-only evaluation (--mt_bench_turns single).""" prefs = main_generate_and_eval( CliArgs( @@ -223,6 +263,7 @@ def test_mt_bench_single_turn_only(): judge_model="Dummy/score A: 10 score B: 0", n_instructions=5, mt_bench_turns="single", + result_folder=str(tmp_path), ) ) @@ -230,7 +271,7 @@ def test_mt_bench_single_turn_only(): assert len(prefs) == 5 # one annotation per question, turn 1 only -def test_mt_bench_multi_turn_only(): +def test_mt_bench_multi_turn_only(tmp_path): """Test MT-Bench multi-turn-only evaluation (--mt_bench_turns multi).""" prefs = main_generate_and_eval( CliArgs( @@ -240,6 +281,7 @@ def test_mt_bench_multi_turn_only(): judge_model="Dummy/score A: 0 score B: 10", n_instructions=5, mt_bench_turns="multi", + result_folder=str(tmp_path), ) ) diff --git a/tests/test_mt_bench_downloads.py b/tests/test_mt_bench_downloads.py new file mode 100644 index 0000000..9058a3b --- /dev/null +++ b/tests/test_mt_bench_downloads.py @@ -0,0 +1,66 @@ +from pathlib import Path + +import openjury.instruction_dataset.mt_bench as mt_bench +import openjury.utils as 
utils + + +def test_download_mt_bench_skips_question_download_if_cached(tmp_path, monkeypatch): + question_path = tmp_path / "data" / "mt_bench" / "question.jsonl" + question_path.parent.mkdir(parents=True, exist_ok=True) + question_path.write_text('{"question_id": 1, "turns": ["Q1"]}\n') + + reference_path = tmp_path / "reference_answer" / "gpt-4.jsonl" + reference_path.parent.mkdir(parents=True, exist_ok=True) + reference_path.write_text('{"question_id": 1, "choices": [{"turns": ["A1"]}]}\n') + + calls = {"snapshot_download": 0} + + def _snapshot_download_stub(**_kwargs): + calls["snapshot_download"] += 1 + + monkeypatch.setattr(mt_bench, "snapshot_download", _snapshot_download_stub) + monkeypatch.setattr( + mt_bench, + "_download_gpt4_references", + lambda _local_dir: reference_path, + ) + + downloaded_question_path, downloaded_reference_path = mt_bench.download_mt_bench( + local_dir=tmp_path + ) + + assert downloaded_question_path == question_path + assert downloaded_reference_path == reference_path + assert calls["snapshot_download"] == 0 + + +def test_download_all_includes_mt_bench(tmp_path, monkeypatch): + hf_datasets = [] + calls = {"contexts": 0, "mt_bench": 0} + + monkeypatch.setattr(utils, "data_root", tmp_path) + monkeypatch.setattr( + utils, + "download_hf", + lambda name, local_path: hf_datasets.append((name, local_path)), + ) + + def _contexts_snapshot_stub(**_kwargs): + calls["contexts"] += 1 + + monkeypatch.setattr(utils, "snapshot_download", _contexts_snapshot_stub) + monkeypatch.setattr( + mt_bench, + "download_mt_bench", + lambda: calls.__setitem__("mt_bench", calls["mt_bench"] + 1), + ) + + utils.download_all() + + assert [name for name, _ in hf_datasets] == [ + "alpaca-eval", + "arena-hard", + "m-arena-hard", + ] + assert calls["contexts"] == 1 + assert calls["mt_bench"] == 1 From 8ffe3a6564a134550108bd6654f1c60f57417dee Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Wed, 4 Mar 2026 16:02:34 +0100 Subject: [PATCH 25/35] Remove duplication 
between prompt templates --- openjury/evaluate.py | 35 ++++++++++--------- .../prompt-multi-turn-with-explanation.txt | 21 ----------- openjury/prompts/prompt-multi-turn.txt | 22 ------------ openjury/prompts/prompt-with-explanation.txt | 21 ----------- openjury/prompts/prompt.txt | 11 +++--- 5 files changed, 24 insertions(+), 86 deletions(-) delete mode 100644 openjury/prompts/prompt-multi-turn-with-explanation.txt delete mode 100644 openjury/prompts/prompt-multi-turn.txt delete mode 100644 openjury/prompts/prompt-with-explanation.txt diff --git a/openjury/evaluate.py b/openjury/evaluate.py index d31db67..cc0d45d 100644 --- a/openjury/evaluate.py +++ b/openjury/evaluate.py @@ -50,26 +50,29 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1): return float(m.group(group_index).strip(" ")) +_COMPLETION_LABEL_SINGLE = "Answer" +_COMPLETION_LABEL_MULTI_TURN = "Conversation with User" +_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement" +_SCORE_FENCE = "\n```" + + def load_judge_system_and_user_prompt( provide_explanation: bool = True, multi_turn: bool = False, ) -> tuple[str, str]: - # Prepare judge - with open(Path(__file__).parent / "prompts" / "system-prompt.txt", "r") as f: - system_prompt = str(f.read()) - - if multi_turn: - prompt_filename = ( - "prompt-multi-turn-with-explanation.txt" - if provide_explanation - else "prompt-multi-turn.txt" - ) - else: - prompt_filename = ( - "prompt-with-explanation.txt" if provide_explanation else "prompt.txt" - ) - with open(Path(__file__).parent / "prompts" / prompt_filename, "r") as f: - user_prompt_template = str(f.read()) + prompts_dir = Path(__file__).parent / "prompts" + + system_prompt = (prompts_dir / "system-prompt.txt").read_text() + + user_prompt_template = (prompts_dir / "prompt.txt").read_text() + user_prompt_template = user_prompt_template.replace( + "{completion_label}", + _COMPLETION_LABEL_MULTI_TURN if multi_turn else _COMPLETION_LABEL_SINGLE, + ) + 
user_prompt_template = user_prompt_template.replace( + "{explanation_suffix}", + _EXPLANATION_SUFFIX if provide_explanation else _SCORE_FENCE, + ) return system_prompt, user_prompt_template diff --git a/openjury/prompts/prompt-multi-turn-with-explanation.txt b/openjury/prompts/prompt-multi-turn-with-explanation.txt deleted file mode 100644 index f3ea1c2..0000000 --- a/openjury/prompts/prompt-multi-turn-with-explanation.txt +++ /dev/null @@ -1,21 +0,0 @@ -<|User Prompt|> -{user_prompt} - -<|The Start of Assistant A's Conversation with User|> -{completion_A} -<|The End of Assistant A's Conversation with User|> - -<|The Start of Assistant B's Conversation with User|> -{completion_B} -<|The End of Assistant B's Conversation with User|> - -# Your output - -## Format description -Your output should follow this format: -``` -score_A: -score_B: -``` - -## Your output, do not repeat the input above, first starts with an explanation of your judgement diff --git a/openjury/prompts/prompt-multi-turn.txt b/openjury/prompts/prompt-multi-turn.txt deleted file mode 100644 index 85cc5b6..0000000 --- a/openjury/prompts/prompt-multi-turn.txt +++ /dev/null @@ -1,22 +0,0 @@ -<|User Prompt|> -{user_prompt} - -<|The Start of Assistant A's Conversation with User|> -{completion_A} -<|The End of Assistant A's Conversation with User|> - -<|The Start of Assistant B's Conversation with User|> -{completion_B} -<|The End of Assistant B's Conversation with User|> - -# Your output - -## Format description -Your output should follow this format: -``` -score_A: -score_B: -``` - -## Your output, do not repeat the input above -``` diff --git a/openjury/prompts/prompt-with-explanation.txt b/openjury/prompts/prompt-with-explanation.txt deleted file mode 100644 index 6600f51..0000000 --- a/openjury/prompts/prompt-with-explanation.txt +++ /dev/null @@ -1,21 +0,0 @@ -<|User Prompt|> -{user_prompt} - -<|The Start of Assistant A's Answer|> -{completion_A} -<|The End of Assistant A's Answer|> - -<|The Start 
of Assistant B's Answer|> -{completion_B} -<|The End of Assistant B's Answer|> - -# Your output - -## Format description -Your output should follow this format: -``` -score_A: -score_B: -``` - -## Your output, do not repeat the input above, first starts with an explanation of your judgement diff --git a/openjury/prompts/prompt.txt b/openjury/prompts/prompt.txt index 21d2e48..38021e6 100644 --- a/openjury/prompts/prompt.txt +++ b/openjury/prompts/prompt.txt @@ -1,13 +1,13 @@ <|User Prompt|> {user_prompt} -<|The Start of Assistant A's Answer|> +<|The Start of Assistant A's {completion_label}|> {completion_A} -<|The End of Assistant A's Answer|> +<|The End of Assistant A's {completion_label}|> -<|The Start of Assistant B's Answer|> +<|The Start of Assistant B's {completion_label}|> {completion_B} -<|The End of Assistant B's Answer|> +<|The End of Assistant B's {completion_label}|> # Your output @@ -18,5 +18,4 @@ score_A: ``` -## Your output, do not repeat the input above -``` +## Your output, do not repeat the input above{explanation_suffix} From b877f114f6bc694857ab638d21ee00098d18769c Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 9 Mar 2026 21:00:29 +0100 Subject: [PATCH 26/35] add temperature argument --- openjury/utils.py | 66 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 53 insertions(+), 13 deletions(-) diff --git a/openjury/utils.py b/openjury/utils.py index 1ffedc0..56e57db 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -214,15 +214,25 @@ class ChatLlamaCppModel(BaseLocalModel): Unlike langchain's ``ChatLlamaCpp``, this wrapper explicitly calls ``Llama.reset()`` between conversations to clear stale KV-cache state. + + Sampling defaults: + - ``temperature=None`` means do not pass temperature explicitly and keep + llama-cpp's backend default behavior. 
""" def __init__( - self, model_path: str, max_tokens: int = 1024, n_ctx: int = 0, **kwargs + self, + model_path: str, + max_tokens: int = 1024, + n_ctx: int = 0, + temperature: float | None = None, + **kwargs, ): from llama_cpp import Llama self.model_path = model_path self.max_tokens = max_tokens + self.temperature = temperature self.llama = Llama( model_path=model_path, n_ctx=n_ctx, @@ -249,20 +259,23 @@ def batch(self, inputs: list, **kwargs) -> list[str]: self.llama.reset() if self._use_generate: text = self._to_raw_text(inp) - response = self.llama.create_completion( - prompt=text, - max_tokens=self.max_tokens, - ) + create_kwargs = {"prompt": text, "max_tokens": self.max_tokens} + if self.temperature is not None: + create_kwargs["temperature"] = self.temperature + response = self.llama.create_completion(**create_kwargs) results.append(response["choices"][0]["text"]) else: messages = self._to_messages(inp) - response = self.llama.create_chat_completion( - messages=messages, - max_tokens=self.max_tokens, - ) + create_kwargs = {"messages": messages, "max_tokens": self.max_tokens} + if self.temperature is not None: + create_kwargs["temperature"] = self.temperature + response = self.llama.create_chat_completion(**create_kwargs) results.append(response["choices"][0]["message"]["content"]) return results + def set_temperature(self, temperature: float | None) -> None: + self.temperature = None if temperature is None else float(temperature) + class ChatVLLM(BaseLocalModel): """VLLM wrapper that auto-detects whether to use chat() or generate(). @@ -277,12 +290,18 @@ class ChatVLLM(BaseLocalModel): falls back to ``llm.generate()`` and emits a warning. This avoids the ``ValueError`` raised by ``transformers >= v4.44`` which removed the default chat template. + + Sampling defaults: + - Uses ``temperature=0.6`` and ``top_p=0.95`` unless explicitly + overridden. 
""" def __init__( self, model: str, max_tokens: int = 8192, + temperature: float = 0.6, + top_p: float = 0.95, chat_template: str | None = None, **vllm_kwargs, ): @@ -315,10 +334,13 @@ def __init__( ) self.llm = LLM(model=model, trust_remote_code=True, **vllm_kwargs) - self.sampling_params = SamplingParams( + self._SamplingParams = SamplingParams + self._temperature = temperature + self._top_p = top_p + self.sampling_params = self._SamplingParams( max_tokens=max_tokens, - temperature=0.6, - top_p=0.95, + temperature=self._temperature, + top_p=self._top_p, ) # Resolve chat template: @@ -344,6 +366,14 @@ def __init__( self._use_generate = False print(f"ChatVLLM: using tokenizer's chat template for '{model}'") + def set_temperature(self, temperature: float) -> None: + self._temperature = float(temperature) + self.sampling_params = self._SamplingParams( + max_tokens=self.max_tokens, + temperature=self._temperature, + top_p=self._top_p, + ) + def batch(self, inputs: list, **invoke_kwargs) -> list[str]: """Process a batch of inputs using vllm.LLM.chat() or llm.generate(). @@ -364,13 +394,20 @@ def batch(self, inputs: list, **invoke_kwargs) -> list[str]: return [out.outputs[0].text for out in outputs] -def make_model(model: str, max_tokens: int | None = 8192, **kwargs): +def make_model( + model: str, + max_tokens: int | None = 8192, + temperature: float | None = None, + **kwargs, +): """Instantiate a model wrapper from a provider/model-name string. Args: model: Format ``{Provider}/{model_path}``, e.g. ``VLLM/meta-llama/Llama-3.3-70B-Instruct``. max_tokens: Maximum tokens the model may generate. + temperature: Optional generation temperature override. ``None`` keeps + each provider wrapper's default temperature behavior. **kwargs: Provider-specific options forwarded to the model wrapper. For VLLM these include ``max_model_len``, ``chat_template``, and any other ``vllm.LLM`` constructor arguments. 
@@ -390,6 +427,7 @@ def make_model(model: str, max_tokens: int | None = 8192, **kwargs): return ChatVLLM( model=model_name, max_tokens=max_tokens if max_tokens else 8192, + temperature=temperature if temperature is not None else 0.6, chat_template=chat_template, **vllm_kwargs, ) @@ -397,6 +435,8 @@ def make_model(model: str, max_tokens: int | None = 8192, **kwargs): model_kwargs = {} if max_tokens is not None: model_kwargs["max_tokens"] = max_tokens + if temperature is not None: + model_kwargs["temperature"] = temperature if model_provider == "OpenRouter": # Special case we need to override API url and key From c2056b54a7448836b44ddb26e0ad6dc1b81b438b Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 9 Mar 2026 21:02:25 +0100 Subject: [PATCH 27/35] add option for making mt-bench consistent with the original one from fastchat --- openjury/generate.py | 116 ++++- openjury/generate_and_evaluate.py | 365 ++++++++----- openjury/mt_bench/__init__.py | 5 + openjury/mt_bench/common.py | 67 +++ openjury/mt_bench/fastchat_compat.py | 479 ++++++++++++++++++ openjury/prompts/mt_bench/system-base.txt | 1 + openjury/prompts/mt_bench/user-multi-base.txt | 32 ++ .../mt_bench/user-multi-reference-block.txt | 16 + .../prompts/mt_bench/user-single-base.txt | 10 + .../mt_bench/user-single-reference-block.txt | 4 + openjury/utils.py | 9 + tests/test_generate_and_evaluate.py | 42 +- 12 files changed, 1012 insertions(+), 134 deletions(-) create mode 100644 openjury/mt_bench/__init__.py create mode 100644 openjury/mt_bench/common.py create mode 100644 openjury/mt_bench/fastchat_compat.py create mode 100644 openjury/prompts/mt_bench/system-base.txt create mode 100644 openjury/prompts/mt_bench/user-multi-base.txt create mode 100644 openjury/prompts/mt_bench/user-multi-reference-block.txt create mode 100644 openjury/prompts/mt_bench/user-single-base.txt create mode 100644 openjury/prompts/mt_bench/user-single-reference-block.txt diff --git a/openjury/generate.py b/openjury/generate.py 
index 5c469ff..cda4bcc 100644 --- a/openjury/generate.py +++ b/openjury/generate.py @@ -1,5 +1,6 @@ import pandas as pd from langchain.prompts import ChatPromptTemplate +from typing import Any from openjury.utils import ( do_inference, @@ -8,6 +9,53 @@ ) +def _set_temperature_on_model(chat_model, temperature: float) -> None: + if hasattr(chat_model, "set_temperature"): + chat_model.set_temperature(temperature) + return + if hasattr(chat_model, "temperature"): + setattr(chat_model, "temperature", temperature) + + +def _infer_grouped_by_temperature( + *, + model_spec: str, + provider: str, + max_tokens: int | None, + model_kwargs: dict[str, Any], + base_model, + inputs: list, + temperatures: list[float], + use_tqdm: bool, +) -> list[str]: + outputs: list[str] = [""] * len(inputs) + groups: dict[float, list[int]] = {} + for idx, temp in enumerate(temperatures): + groups.setdefault(float(temp), []).append(idx) + + for temp in sorted(groups.keys()): + idxs = groups[temp] + group_inputs = [inputs[i] for i in idxs] + + if provider in {"VLLM", "LlamaCpp"}: + _set_temperature_on_model(base_model, temp) + group_model = base_model + else: + group_model = make_model( + model_spec, max_tokens=max_tokens, temperature=temp, **model_kwargs + ) + + group_outs = do_inference( + chat_model=group_model, + inputs=group_inputs, + use_tqdm=use_tqdm, + ) + for i, out in zip(idxs, group_outs): + outputs[i] = out + + return outputs + + def generate_instructions( instructions: pd.Series, model: str, @@ -57,6 +105,7 @@ def generate_multiturn( truncate_input_chars: int | None = 8192, max_tokens: int | None = 8192, use_tqdm: bool = True, + temperature_config: dict[str, float] | None = None, **model_kwargs, ) -> pd.DataFrame: """Generate two-turn completions for MT-Bench style questions. @@ -67,14 +116,33 @@ def generate_multiturn( Args: questions: DataFrame with columns turn_1, turn_2, and index instruction_index. model: Model specification string (e.g. "VLLM/model-name"). 
+ temperature_config: Optional category -> temperature mapping. When set, + inputs are inferred in temperature-homogeneous groups to match + MT-Bench/FastChat category defaults. **model_kwargs: Provider-specific options forwarded to make_model (e.g. max_model_len, chat_template for VLLM). Returns: DataFrame with columns: instruction_index, completion_turn_1, completion_turn_2 """ - chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs) + provider = model.split("/")[0] + use_category_temperatures = temperature_config is not None + local_provider = provider in {"VLLM", "LlamaCpp"} + + chat_model = None + if use_category_temperatures and local_provider: + chat_model = make_model(model, max_tokens=max_tokens, temperature=0.0, **model_kwargs) + else: + chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs) system_prompt = "You are a helpful assistant." + idxs = questions.index.tolist() + temperatures: list[float] = [] + if use_category_temperatures: + temperatures = [ + temperature_config.get(str(questions.loc[idx].get("category") or ""), 0.7) + for idx in idxs + ] + turn1_template = ChatPromptTemplate.from_messages( [("system", system_prompt), ("user", "{user_prompt}")] ) @@ -87,11 +155,23 @@ def generate_multiturn( ) print(f"Generating turn 1 completions ({len(turn1_inputs)} questions).") - completions_turn_1 = do_inference( - chat_model=chat_model, - inputs=turn1_inputs, - use_tqdm=use_tqdm, - ) + if use_category_temperatures: + completions_turn_1 = _infer_grouped_by_temperature( + model_spec=model, + provider=provider, + max_tokens=max_tokens, + model_kwargs=model_kwargs, + base_model=chat_model, + inputs=turn1_inputs, + temperatures=temperatures, + use_tqdm=use_tqdm, + ) + else: + completions_turn_1 = do_inference( + chat_model=chat_model, + inputs=turn1_inputs, + use_tqdm=use_tqdm, + ) turn2_inputs = [] for (_, row), t1_answer in zip(questions.iterrows(), completions_turn_1): @@ -121,15 +201,27 @@ def generate_multiturn( ) 
print(f"Generating turn 2 completions ({len(turn2_inputs)} questions).") - completions_turn_2 = do_inference( - chat_model=chat_model, - inputs=turn2_inputs, - use_tqdm=use_tqdm, - ) + if use_category_temperatures: + completions_turn_2 = _infer_grouped_by_temperature( + model_spec=model, + provider=provider, + max_tokens=max_tokens, + model_kwargs=model_kwargs, + base_model=chat_model, + inputs=turn2_inputs, + temperatures=temperatures, + use_tqdm=use_tqdm, + ) + else: + completions_turn_2 = do_inference( + chat_model=chat_model, + inputs=turn2_inputs, + use_tqdm=use_tqdm, + ) df_outputs = pd.DataFrame( data={ - "instruction_index": questions.index.tolist(), + "instruction_index": idxs, "completion_turn_1": completions_turn_1, "completion_turn_2": completions_turn_2, }, diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 60619f6..4754b7e 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -20,8 +20,18 @@ ) from openjury.generate import generate_instructions, generate_base, generate_multiturn from openjury.instruction_dataset import load_instructions -from openjury.utils import data_root, read_df, download_hf, truncate -from openjury.utils import make_model, cache_function_dataframe +from openjury.mt_bench.common import iter_mt_bench_pairwise_rows +from openjury.mt_bench.fastchat_compat import ( + FASTCHAT_TEMPERATURE_CONFIG, + judge_mt_bench_pairwise_fastchat, +) +from openjury.utils import ( + cache_function_dataframe, + data_root, + download_hf, + make_model, + read_df, +) NEED_REF_CATS = {"math", "reasoning", "coding"} @@ -81,6 +91,7 @@ class CliArgs: max_model_len: int | None = None chat_template: str | None = None mt_bench_turns: str = "both" + mt_bench_compatibility: str = "openjury" result_folder: str = "results" @@ -89,6 +100,10 @@ def __post_init__(self): assert ( self.swap_mode in supported_modes ), f"Only {supported_modes} modes are supported but got {self.swap_mode}." 
+ supported_mt_bench_modes = ["openjury", "fastchat"] + assert ( + self.mt_bench_compatibility in supported_mt_bench_modes + ), f"Only {supported_mt_bench_modes} are supported but got {self.mt_bench_compatibility}." @classmethod def parse_args(cls): @@ -210,6 +225,18 @@ def parse_args(cls): "'multi': only turn 2 (with full conversation context), " "'both' (default): evaluate both turns.", ) + parser.add_argument( + "--mt_bench_compatibility", + type=str, + choices=["openjury", "fastchat"], + default="openjury", + help=( + "MT-Bench evaluation/generation mode. " + "'openjury' (default): OpenJury score_A/score_B prompt + softmax preference. " + "'fastchat': use FastChat/MT-Bench pairwise prompts with [[A]]/[[B]]/[[C]] verdict parsing, " + "conservative position-bias handling, judge temperature=0, and MT-Bench category temperatures." + ), + ) args = parser.parse_args() return cls( @@ -228,6 +255,7 @@ def parse_args(cls): max_model_len=args.max_model_len, chat_template=args.chat_template, mt_bench_turns=args.mt_bench_turns, + mt_bench_compatibility=args.mt_bench_compatibility, result_folder=args.result_folder, ) @@ -280,12 +308,6 @@ def print_results(results): print("=" * 60 + "\n") -def _safe_text(value: object, truncate_chars: int | None) -> str: - if value is None or pd.isna(value): - return "" - return truncate(str(value), max_len=truncate_chars) - - def format_mt_bench_for_evaluation( questions: pd.DataFrame, completions_A: pd.DataFrame, @@ -311,89 +333,66 @@ def format_mt_bench_for_evaluation( completions_b_turn_2: list[str] = [] metadata_turn_2: list[dict[str, object]] = [] - for idx in questions.index: - row = questions.loc[idx] - comp_A_row = ( - completions_A.loc[idx] if idx in completions_A.index else completions_A.iloc[0] - ) - comp_B_row = ( - completions_B.loc[idx] if idx in completions_B.index else completions_B.iloc[0] - ) - category = row.get("category") - needs_ref = category in NEED_REF_CATS - - turn_1_question = _safe_text(row.get("turn_1"), 
truncate_input_chars) - turn_2_question = _safe_text(row.get("turn_2"), truncate_input_chars) - - answer_a_1 = _safe_text(comp_A_row.get("completion_turn_1", ""), truncate_input_chars) - answer_a_2 = _safe_text(comp_A_row.get("completion_turn_2", ""), truncate_input_chars) - answer_b_1 = _safe_text(comp_B_row.get("completion_turn_1", ""), truncate_input_chars) - answer_b_2 = _safe_text(comp_B_row.get("completion_turn_2", ""), truncate_input_chars) - - ref_1 = _safe_text(row.get("reference_turn_1"), truncate_input_chars) - ref_2 = _safe_text(row.get("reference_turn_2"), truncate_input_chars) - + for row in iter_mt_bench_pairwise_rows( + questions=questions, + completions_a=completions_A, + completions_b=completions_B, + truncate_input_chars=truncate_input_chars, + ): + needs_ref = row.category in NEED_REF_CATS if eval_single: - if needs_ref and ref_1: + if needs_ref and row.ref_1: instruction = ( "[MT-Bench | Turn 1]\n" "Use the reference answer for correctness checks.\n\n" - f"[Question]\n{turn_1_question}\n\n" - f"[Reference Answer]\n{ref_1}" + f"[Question]\n{row.turn_1_question}\n\n" + f"[Reference Answer]\n{row.ref_1}" ) else: - instruction = turn_1_question + instruction = row.turn_1_question instructions_turn_1.append(instruction) - completions_a_turn_1.append(answer_a_1) - completions_b_turn_1.append(answer_b_1) + completions_a_turn_1.append(row.answer_a_1) + completions_b_turn_1.append(row.answer_b_1) metadata_turn_1.append( { - "question_id": idx, - "category": category, + "question_id": row.question_id, + "category": row.category, "turn": 1, } ) - if eval_multi and turn_2_question: + if eval_multi and row.turn_2_question: instruction_parts = [ "Please focus on which assistant provides a better answer to the second user question." 
] - if needs_ref and (ref_1 or ref_2): + if needs_ref and (row.ref_1 or row.ref_2): instruction_parts.extend( [ "<|The Start of Reference Answer|>", "### User:", - turn_1_question, + row.turn_1_question, "### Reference answer:", - ref_1, + row.ref_1, "### User:", - turn_2_question, + row.turn_2_question, "### Reference answer:", - ref_2, + row.ref_2, "<|The End of Reference Answer|>", ] ) - conversation_a = ( - "### User:\n" - f"{turn_1_question}\n\n" - "### Assistant:\n" - f"{answer_a_1}\n\n" - "### User:\n" - f"{turn_2_question}\n\n" - "### Assistant:\n" - f"{answer_a_2}" + conversation_a = _format_mt_bench_multiturn_conversation( + turn_1_question=row.turn_1_question, + turn_1_answer=row.answer_a_1, + turn_2_question=row.turn_2_question, + turn_2_answer=row.answer_a_2, ) - conversation_b = ( - "### User:\n" - f"{turn_1_question}\n\n" - "### Assistant:\n" - f"{answer_b_1}\n\n" - "### User:\n" - f"{turn_2_question}\n\n" - "### Assistant:\n" - f"{answer_b_2}" + conversation_b = _format_mt_bench_multiturn_conversation( + turn_1_question=row.turn_1_question, + turn_1_answer=row.answer_b_1, + turn_2_question=row.turn_2_question, + turn_2_answer=row.answer_b_2, ) instructions_turn_2.append("\n\n".join(instruction_parts)) @@ -401,8 +400,8 @@ def format_mt_bench_for_evaluation( completions_b_turn_2.append(conversation_b) metadata_turn_2.append( { - "question_id": idx, - "category": category, + "question_id": row.question_id, + "category": row.category, "turn": 2, } ) @@ -423,6 +422,25 @@ def format_mt_bench_for_evaluation( ) +def _format_mt_bench_multiturn_conversation( + *, + turn_1_question: str, + turn_1_answer: str, + turn_2_question: str, + turn_2_answer: str, +) -> str: + return ( + "### User:\n" + f"{turn_1_question}\n\n" + "### Assistant:\n" + f"{turn_1_answer}\n\n" + "### User:\n" + f"{turn_2_question}\n\n" + "### Assistant:\n" + f"{turn_2_answer}" + ) + + def compute_preference_stats(prefs: pd.Series) -> dict: """Derive win/loss/tie counts and winrate from a 
Series of preferences. @@ -736,54 +754,140 @@ def main(args: CliArgs): return prefs -def _run_mt_bench(args: CliArgs, ignore_cache: bool): - """MT-Bench pipeline routed through score-based judging.""" - questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) - - print( - f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}." - ) - - gen_kwargs = dict( - truncate_input_chars=args.truncate_all_input_chars, - max_tokens=args.max_out_tokens_models, - max_model_len=args.max_model_len, - chat_template=args.chat_template, +def _generate_mt_bench_completions( + args: CliArgs, + questions_df: pd.DataFrame, + ignore_cache: bool, +) -> tuple[pd.DataFrame, pd.DataFrame]: + cache_prefix = ( + "mt-bench_fastchatgen" if args.mt_bench_compatibility == "fastchat" else "mt-bench" ) - completions_A = cache_function_dataframe( - lambda: generate_multiturn( + def _run_generation(model_name: str) -> pd.DataFrame: + if args.mt_bench_compatibility == "fastchat": + return generate_multiturn( + questions=questions_df, + model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + temperature_config=FASTCHAT_TEMPERATURE_CONFIG, + ) + return generate_multiturn( questions=questions_df, - model=args.model_A, + model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, use_tqdm=args.use_tqdm, - **gen_kwargs, - ), + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + + completions_a = cache_function_dataframe( + lambda: _run_generation(args.model_A), ignore_cache=ignore_cache, - cache_name=f"mt-bench_{args.model_A}_{args.n_instructions}", + cache_name=f"{cache_prefix}_{args.model_A}_{args.n_instructions}", ).set_index("instruction_index") - completions_B = cache_function_dataframe( - lambda: generate_multiturn( 
- questions=questions_df, - model=args.model_B, - use_tqdm=args.use_tqdm, - **gen_kwargs, - ), + completions_b = cache_function_dataframe( + lambda: _run_generation(args.model_B), ignore_cache=ignore_cache, - cache_name=f"mt-bench_{args.model_B}_{args.n_instructions}", + cache_name=f"{cache_prefix}_{args.model_B}_{args.n_instructions}", ).set_index("instruction_index") + return completions_a, completions_b - judge_chat_model = make_model( - model=args.judge_model, - max_tokens=args.max_out_tokens_judge, - max_model_len=args.max_model_len, - chat_template=args.chat_template, + +def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str: + name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name += f"-{args.swap_mode}" + if suffix: + name += f"-{suffix}" + return name.replace("/", "_") + + +def _save_mt_bench_results( + *, + args: CliArgs, + results: dict[str, object], + annotations_df: pd.DataFrame, + name_suffix: str | None = None, +) -> None: + name = _build_mt_bench_result_name(args, suffix=name_suffix) + res_folder = Path(args.result_folder) / name + res_folder.mkdir(parents=True, exist_ok=True) + + with open(res_folder / f"args-{name}.json", "w") as f: + json.dump(asdict(args), f, indent=2) + + annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False) + + with open(res_folder / f"results-{name}.json", "w") as f: + json.dump(results, f, indent=2) + + +def _run_mt_bench_fastchat( + *, + args: CliArgs, + questions_df: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + judge_chat_model, +) -> pd.Series: + prefs, annotations, combined_metadata, num_inconsistent = ( + judge_mt_bench_pairwise_fastchat( + judge_chat_model=judge_chat_model, + judge_model=args.judge_model, + questions=questions_df, + completions_a=completions_a, + completions_b=completions_b, + model_a=args.model_A, + model_b=args.model_B, + turns_mode=args.mt_bench_turns, + swap_mode=args.swap_mode, + 
truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + ) + + stats = compute_preference_stats(prefs) + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + "mt_bench_compatibility": args.mt_bench_compatibility, + "num_inconsistent": num_inconsistent, + **stats, + "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), + "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), + "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + print_results(results) + _save_mt_bench_results( + args=args, + results=results, + annotations_df=pd.DataFrame(annotations), + name_suffix=f"mtbench_{args.mt_bench_compatibility}", ) + return prefs + +def _run_mt_bench_openjury( + *, + args: CliArgs, + questions_df: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + judge_chat_model, +) -> pd.Series: turn_1_inputs, turn_2_inputs = format_mt_bench_for_evaluation( questions=questions_df, - completions_A=completions_A, - completions_B=completions_B, + completions_A=completions_a, + completions_B=completions_b, turns_mode=args.mt_bench_turns, truncate_input_chars=args.truncate_all_input_chars, ) @@ -841,10 +945,7 @@ def _run_mt_bench(args: CliArgs, ignore_cache: bool): combined_metadata.extend(combined_metadata_turn_1) if instructions_turn_2: - ( - mt_system_prompt, - mt_user_prompt_template, - ) = load_judge_system_and_user_prompt( + mt_system_prompt, mt_user_prompt_template = load_judge_system_and_user_prompt( provide_explanation=args.provide_explanation, multi_turn=True, ) @@ -883,7 +984,6 @@ def _run_mt_bench(args: CliArgs, ignore_cache: bool): if preference_parts else pd.Series(dtype=float) ) - stats = compute_preference_stats(prefs) results = { "dataset": args.dataset, @@ -899,16 +999,6 @@ def _run_mt_bench(args: CliArgs, ignore_cache: bool): } 
print_results(results) - name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" - name += f"-{args.swap_mode}" - name = name.replace("/", "_") - - res_folder = Path(args.result_folder) / name - res_folder.mkdir(parents=True, exist_ok=True) - - with open(res_folder / f"args-{name}.json", "w") as f: - json.dump(asdict(args), f, indent=2) - df = pd.DataFrame(annotations) df["instruction_index"] = [meta["question_id"] for meta in metadata_for_annotations] df["category"] = [meta["category"] for meta in metadata_for_annotations] @@ -916,6 +1006,7 @@ def _run_mt_bench(args: CliArgs, ignore_cache: bool): df["model_A"] = args.model_A df["model_B"] = args.model_B df["judge"] = args.judge_model + if args.swap_mode == "both": df_reversed = pd.DataFrame(annotations_reversed) df_reversed["instruction_index"] = [ @@ -924,21 +1015,55 @@ def _run_mt_bench(args: CliArgs, ignore_cache: bool): df_reversed["category"] = [ meta["category"] for meta in metadata_for_reversed_annotations ] - df_reversed["turn"] = [ - meta["turn"] for meta in metadata_for_reversed_annotations - ] + df_reversed["turn"] = [meta["turn"] for meta in metadata_for_reversed_annotations] df_reversed["model_A"] = args.model_B df_reversed["model_B"] = args.model_A df_reversed["judge"] = args.judge_model df = pd.concat([df, df_reversed], ignore_index=True) - df.to_csv(res_folder / f"{name}-annotations.csv", index=False) - - with open(res_folder / f"results-{name}.json", "w") as f: - json.dump(results, f, indent=2) + _save_mt_bench_results( + args=args, + results=results, + annotations_df=df, + ) return prefs +def _run_mt_bench(args: CliArgs, ignore_cache: bool): + """MT-Bench pipeline (optionally FastChat-compatible).""" + questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) + print( + f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}." 
+ ) + completions_a, completions_b = _generate_mt_bench_completions( + args=args, + questions_df=questions_df, + ignore_cache=ignore_cache, + ) + judge_chat_model = make_model( + model=args.judge_model, + max_tokens=args.max_out_tokens_judge, + temperature=0.0 if args.mt_bench_compatibility == "fastchat" else None, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + if args.mt_bench_compatibility == "fastchat": + return _run_mt_bench_fastchat( + args=args, + questions_df=questions_df, + completions_a=completions_a, + completions_b=completions_b, + judge_chat_model=judge_chat_model, + ) + return _run_mt_bench_openjury( + args=args, + questions_df=questions_df, + completions_a=completions_a, + completions_b=completions_b, + judge_chat_model=judge_chat_model, + ) + + def cli(): args = CliArgs.parse_args() print(f"Running with CLI args: {args.__dict__}") diff --git a/openjury/mt_bench/__init__.py b/openjury/mt_bench/__init__.py new file mode 100644 index 0000000..c5cdd59 --- /dev/null +++ b/openjury/mt_bench/__init__.py @@ -0,0 +1,5 @@ +"""MT-Bench-specific helpers. + +This package intentionally contains MT-Bench specific logic. 
+""" + diff --git a/openjury/mt_bench/common.py b/openjury/mt_bench/common.py new file mode 100644 index 0000000..8a0028e --- /dev/null +++ b/openjury/mt_bench/common.py @@ -0,0 +1,67 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Iterator + +import pandas as pd + +from openjury.utils import safe_text + + +@dataclass(frozen=True) +class MTBenchPairwiseRow: + question_id: object + category: str | None + turn_1_question: str + turn_2_question: str + answer_a_1: str + answer_a_2: str + answer_b_1: str + answer_b_2: str + ref_1: str + ref_2: str + + +def iter_mt_bench_pairwise_rows( + *, + questions: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + truncate_input_chars: int | None, +) -> Iterator[MTBenchPairwiseRow]: + for question_id in questions.index.tolist(): + row = questions.loc[question_id] + comp_a_row = ( + completions_a.loc[question_id] + if question_id in completions_a.index + else completions_a.iloc[0] + ) + comp_b_row = ( + completions_b.loc[question_id] + if question_id in completions_b.index + else completions_b.iloc[0] + ) + yield MTBenchPairwiseRow( + question_id=question_id, + category=row.get("category"), + turn_1_question=safe_text(row.get("turn_1"), truncate_input_chars), + turn_2_question=safe_text(row.get("turn_2"), truncate_input_chars), + answer_a_1=safe_text( + comp_a_row.get("completion_turn_1", ""), + truncate_input_chars, + ), + answer_a_2=safe_text( + comp_a_row.get("completion_turn_2", ""), + truncate_input_chars, + ), + answer_b_1=safe_text( + comp_b_row.get("completion_turn_1", ""), + truncate_input_chars, + ), + answer_b_2=safe_text( + comp_b_row.get("completion_turn_2", ""), + truncate_input_chars, + ), + ref_1=safe_text(row.get("reference_turn_1"), truncate_input_chars), + ref_2=safe_text(row.get("reference_turn_2"), truncate_input_chars), + ) diff --git a/openjury/mt_bench/fastchat_compat.py b/openjury/mt_bench/fastchat_compat.py new file mode 100644 index 
0000000..5883391 --- /dev/null +++ b/openjury/mt_bench/fastchat_compat.py @@ -0,0 +1,479 @@ +from __future__ import annotations + +import math +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Literal + +import pandas as pd +from langchain.prompts import ChatPromptTemplate + +from openjury.mt_bench.common import iter_mt_bench_pairwise_rows +from openjury.utils import do_inference + + +FASTCHAT_TEMPERATURE_CONFIG: dict[str, float] = { + "writing": 0.7, + "roleplay": 0.7, + "extraction": 0.0, + "math": 0.0, + "coding": 0.0, + "reasoning": 0.0, + "stem": 0.1, + "humanities": 0.1, + "arena-hard-200": 0.0, +} + +# "arena-hard-200" is a FastChat-internal category label, not OpenJury's arena-hard dataset. +FASTCHAT_NEED_REF_CATS: set[str] = {"math", "reasoning", "coding", "arena-hard-200"} + +FastChatVerdict = Literal["A", "B", "tie", "error"] +PairwiseWinner = Literal["model_A", "model_B", "tie", "error"] + + +@dataclass(frozen=True) +class FastChatPairwisePrompt: + name: str + system_prompt: str + user_prompt_template: str + multi_turn: bool + ref_based: bool + + +_PROMPTS_DIR = Path(__file__).resolve().parent.parent / "prompts" / "mt_bench" +_SYSTEM_BASE_FILE = "system-base.txt" +_USER_SINGLE_BASE_FILE = "user-single-base.txt" +_USER_MULTI_BASE_FILE = "user-multi-base.txt" +_USER_SINGLE_REF_BLOCK_FILE = "user-single-reference-block.txt" +_USER_MULTI_REF_BLOCK_FILE = "user-multi-reference-block.txt" + + +def _load_prompt_text(filename: str) -> str: + path = _PROMPTS_DIR / filename + return path.read_text(encoding="utf-8") + + +def _render_prompt_text(filename: str, **kwargs: str) -> str: + return _load_prompt_text(filename).format(**kwargs) + + +def _build_system_prompt( + *, + user_subject: str, + task_description: str, + begin_instruction: str, + focus_line: str = "", +) -> str: + focus_segment = f"{focus_line} " if focus_line else "" + return _render_prompt_text( + _SYSTEM_BASE_FILE, + user_subject=user_subject, + 
task_description=task_description, + focus_line=focus_segment, + begin_instruction=begin_instruction, + ) + + +def _build_user_prompt_template(*, multi_turn: bool, ref_based: bool) -> str: + base_filename = _USER_MULTI_BASE_FILE if multi_turn else _USER_SINGLE_BASE_FILE + reference_block = "" + if ref_based: + ref_block_filename = ( + _USER_MULTI_REF_BLOCK_FILE if multi_turn else _USER_SINGLE_REF_BLOCK_FILE + ) + reference_block = _load_prompt_text(ref_block_filename) + return _render_prompt_text(base_filename, reference_block=reference_block) + + +def _load_pairwise_prompt( + *, + name: str, + multi_turn: bool, + ref_based: bool, + system_user_subject: str, + system_task_description: str, + system_begin_instruction: str, + system_focus_line: str = "", +) -> FastChatPairwisePrompt: + return FastChatPairwisePrompt( + name=name, + multi_turn=multi_turn, + ref_based=ref_based, + system_prompt=_build_system_prompt( + user_subject=system_user_subject, + task_description=system_task_description, + begin_instruction=system_begin_instruction, + focus_line=system_focus_line, + ), + user_prompt_template=_build_user_prompt_template( + multi_turn=multi_turn, + ref_based=ref_based, + ), + ) + + +_PAIR_V2 = _load_pairwise_prompt( + name="pair-v2", + multi_turn=False, + ref_based=False, + system_user_subject="question displayed below", + system_task_description=( + "You should choose the assistant that follows the user's instructions and answers " + "the user's question better. Your evaluation should consider factors such as the " + "helpfulness, relevance, accuracy, depth, creativity, and level of detail of their " + "responses." 
+ ), + system_begin_instruction="comparing the two responses and provide a short explanation", +) + +_PAIR_V2_MULTI = _load_pairwise_prompt( + name="pair-v2-multi-turn", + multi_turn=True, + ref_based=False, + system_user_subject="questions", + system_task_description=( + "You should choose the assistant that follows the user's instructions and answers " + "the user's questions better. Your evaluation should consider factors such as the " + "helpfulness, relevance, accuracy, depth, creativity, and level of detail of their " + "responses." + ), + system_focus_line="You should focus on who provides a better answer to the second user question.", + system_begin_instruction=( + "comparing the responses of the two assistants and provide a short explanation" + ), +) + +_PAIR_MATH_V1 = _load_pairwise_prompt( + name="pair-math-v1", + multi_turn=False, + ref_based=True, + system_user_subject="question displayed below", + system_task_description=( + "Your evaluation should consider correctness and helpfulness. You will be given a " + "reference answer, assistant A's answer, and assistant B's answer. Your job is to " + "evaluate which assistant's answer is better." + ), + system_begin_instruction=( + "comparing both assistants' answers with the reference answer. Identify and correct any mistakes" + ), +) + +_PAIR_MATH_V1_MULTI = _load_pairwise_prompt( + name="pair-math-v1-multi-turn", + multi_turn=True, + ref_based=True, + system_user_subject="questions", + system_task_description=( + "Your evaluation should consider correctness and helpfulness. You will be given " + "reference answers, the assistant A's answers, the assistant B's answers. Your job is " + "to determine which assistant provides correct and helpful answers to the second user question." + ), + system_begin_instruction=( + "comparing both assistants' answers with the reference answers. 
Identify and correct any mistakes" + ), +) + + +def _parse_fastchat_verdict(judgment: str) -> FastChatVerdict: + if "[[A]]" in judgment: + return "A" + if "[[B]]" in judgment: + return "B" + if "[[C]]" in judgment: + return "tie" + return "error" + + +def _map_verdict_to_winner(verdict: FastChatVerdict, swapped: bool) -> PairwiseWinner: + if verdict == "tie": + return "tie" + if verdict == "error": + return "error" + if verdict == "A": + return "model_B" if swapped else "model_A" + if verdict == "B": + return "model_A" if swapped else "model_B" + return "error" + + +def _conservative_winner(g1: PairwiseWinner, g2: PairwiseWinner) -> tuple[PairwiseWinner, bool]: + """Conservative position-bias handling (FastChat/MT-Bench paper). + + Declare a winner only if the two orderings agree; otherwise treat as tie. + """ + if g1 == "error" or g2 == "error": + return "error", False + if g1 == g2: + return g1, False + return "tie", True + + +def _winner_to_preference(winner: PairwiseWinner) -> float: + if winner == "model_A": + return 0.0 + if winner == "model_B": + return 1.0 + if winner == "tie": + return 0.5 + return math.nan + + +def _select_prompt(category: str | None, multi_turn: bool) -> FastChatPairwisePrompt: + needs_ref = (category or "") in FASTCHAT_NEED_REF_CATS + if needs_ref and multi_turn: + return _PAIR_MATH_V1_MULTI + if needs_ref: + return _PAIR_MATH_V1 + if multi_turn: + return _PAIR_V2_MULTI + return _PAIR_V2 + + +def _group_indices_by_prompt( + items: list[dict[str, Any]], +) -> dict[str, list[int]]: + grouped: dict[str, list[int]] = {} + for idx, item in enumerate(items): + grouped.setdefault(item["prompt_name"], []).append(idx) + return grouped + + +def _swap_prompt_kwargs(kwargs: dict[str, str], *, multi_turn: bool) -> dict[str, str]: + swapped = dict(kwargs) + if multi_turn: + swapped["answer_a_1"], swapped["answer_b_1"] = swapped["answer_b_1"], swapped["answer_a_1"] + swapped["answer_a_2"], swapped["answer_b_2"] = swapped["answer_b_2"], 
swapped["answer_a_2"] + return swapped + swapped["answer_a"], swapped["answer_b"] = swapped["answer_b"], swapped["answer_a"] + return swapped + + +def _infer_by_prompt_groups( + *, + judge_chat_model, + items: list[dict[str, Any]], + use_tqdm: bool, + swap_answers: bool, +) -> list[str]: + """Run judge inference, grouping by prompt variant for batching.""" + grouped_indices = _group_indices_by_prompt(items) + + judgments: list[str] = [""] * len(items) + for prompt_name, idxs in grouped_indices.items(): + prompt: FastChatPairwisePrompt = items[idxs[0]]["prompt"] + prompt_template = ChatPromptTemplate.from_messages( + [("system", prompt.system_prompt), ("user", prompt.user_prompt_template)] + ) + + batch_kwargs = [] + for i in idxs: + kwargs = items[i]["prompt_kwargs"] + if swap_answers: + kwargs = _swap_prompt_kwargs(kwargs, multi_turn=prompt.multi_turn) + batch_kwargs.append(kwargs) + + prompt_inputs = prompt_template.batch(batch_kwargs) + outs = do_inference( + chat_model=judge_chat_model, + inputs=prompt_inputs, + use_tqdm=use_tqdm, + ) + for i, out in zip(idxs, outs): + judgments[i] = str(out) + return judgments + + +def _build_fastchat_judge_items( + *, + questions: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + eval_single: bool, + eval_multi: bool, + truncate_input_chars: int | None, +) -> list[dict[str, Any]]: + items: list[dict[str, Any]] = [] + for pair_row in iter_mt_bench_pairwise_rows( + questions=questions, + completions_a=completions_a, + completions_b=completions_b, + truncate_input_chars=truncate_input_chars, + ): + category = pair_row.category + if eval_single: + prompt = _select_prompt(category, multi_turn=False) + kwargs: dict[str, str] = { + "question": pair_row.turn_1_question, + "answer_a": pair_row.answer_a_1, + "answer_b": pair_row.answer_b_1, + } + if prompt.ref_based: + kwargs["ref_answer_1"] = pair_row.ref_1 + items.append( + { + "question_id": pair_row.question_id, + "category": category, + "turn": 1, + 
"prompt": prompt, + "prompt_name": prompt.name, + "prompt_kwargs": kwargs, + } + ) + + if eval_multi and pair_row.turn_2_question: + prompt = _select_prompt(category, multi_turn=True) + kwargs = { + "question_1": pair_row.turn_1_question, + "question_2": pair_row.turn_2_question, + "answer_a_1": pair_row.answer_a_1, + "answer_a_2": pair_row.answer_a_2, + "answer_b_1": pair_row.answer_b_1, + "answer_b_2": pair_row.answer_b_2, + } + if prompt.ref_based: + kwargs["ref_answer_1"] = pair_row.ref_1 + kwargs["ref_answer_2"] = pair_row.ref_2 + items.append( + { + "question_id": pair_row.question_id, + "category": category, + "turn": 2, + "prompt": prompt, + "prompt_name": prompt.name, + "prompt_kwargs": kwargs, + } + ) + return items + + +def _resolve_fastchat_item_result( + *, + item: dict[str, Any], + g1_raw: str, + g2_raw: str | None, + judge_model: str, + model_a: str, + model_b: str, +) -> tuple[dict[str, Any], dict[str, object], float, bool]: + prompt: FastChatPairwisePrompt = item["prompt"] + kwargs = item["prompt_kwargs"] + g1_user_prompt = prompt.user_prompt_template.format(**kwargs) + g1_verdict = _parse_fastchat_verdict(g1_raw) + g1_winner = _map_verdict_to_winner(g1_verdict, swapped=False) + + final_winner = g1_winner + inconsistent = False + annotation_row: dict[str, Any] = { + "question_id": item["question_id"], + "category": item["category"], + "turn": item["turn"], + "model_A": model_a, + "model_B": model_b, + "judge": judge_model, + "prompt_name": prompt.name, + "system_prompt": prompt.system_prompt, + "g1_user_prompt": g1_user_prompt, + "g1_judgment": g1_raw, + "g1_verdict": g1_verdict, + "g1_winner": g1_winner, + } + + if g2_raw is not None: + g2_verdict = _parse_fastchat_verdict(g2_raw) + g2_winner = _map_verdict_to_winner(g2_verdict, swapped=True) + final_winner, inconsistent = _conservative_winner(g1_winner, g2_winner) + annotation_row.update( + { + "g2_user_prompt": prompt.user_prompt_template.format( + **_swap_prompt_kwargs(kwargs, 
multi_turn=prompt.multi_turn) + ), + "g2_judgment": g2_raw, + "g2_verdict": g2_verdict, + "g2_winner": g2_winner, + "final_winner": final_winner, + "inconsistent": inconsistent, + } + ) + else: + annotation_row["final_winner"] = final_winner + annotation_row["inconsistent"] = False + + preference = _winner_to_preference(final_winner) + annotation_row["preference"] = preference + metadata = { + "question_id": item["question_id"], + "category": item["category"], + "turn": item["turn"], + } + return annotation_row, metadata, preference, inconsistent + + +def judge_mt_bench_pairwise_fastchat( + *, + judge_chat_model, + judge_model: str, + questions: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + model_a: str, + model_b: str, + turns_mode: str, + swap_mode: str, + truncate_input_chars: int | None, + use_tqdm: bool, +) -> tuple[pd.Series, list[dict[str, Any]], list[dict[str, object]], int]: + """Pairwise MT-Bench judging compatible with FastChat's `[[A]]/[[B]]/[[C]]` format.""" + assert turns_mode in ("both", "single", "multi") + assert swap_mode in ("fixed", "both") + + eval_single = turns_mode in ("both", "single") + eval_multi = turns_mode in ("both", "multi") + + items = _build_fastchat_judge_items( + questions=questions, + completions_a=completions_a, + completions_b=completions_b, + eval_single=eval_single, + eval_multi=eval_multi, + truncate_input_chars=truncate_input_chars, + ) + + g1_judgments = _infer_by_prompt_groups( + judge_chat_model=judge_chat_model, + items=items, + use_tqdm=use_tqdm, + swap_answers=False, + ) + + g2_judgments: list[str] | None = None + if swap_mode == "both": + g2_judgments = _infer_by_prompt_groups( + judge_chat_model=judge_chat_model, + items=items, + use_tqdm=use_tqdm, + swap_answers=True, + ) + + annotations: list[dict[str, Any]] = [] + metadata: list[dict[str, object]] = [] + prefs: list[float] = [] + num_inconsistent = 0 + + for idx, item in enumerate(items): + g2_raw = g2_judgments[idx] if 
g2_judgments is not None else None + annotation_row, item_metadata, preference, inconsistent = _resolve_fastchat_item_result( + item=item, + g1_raw=g1_judgments[idx], + g2_raw=g2_raw, + judge_model=judge_model, + model_a=model_a, + model_b=model_b, + ) + if inconsistent: + num_inconsistent += 1 + annotations.append(annotation_row) + metadata.append(item_metadata) + prefs.append(preference) + + return pd.Series(prefs, dtype=float), annotations, metadata, num_inconsistent + diff --git a/openjury/prompts/mt_bench/system-base.txt b/openjury/prompts/mt_bench/system-base.txt new file mode 100644 index 0000000..b4aff2e --- /dev/null +++ b/openjury/prompts/mt_bench/system-base.txt @@ -0,0 +1 @@ +Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user {user_subject}. {task_description} {focus_line}Begin your evaluation by {begin_instruction}. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, and "[[C]]" for a tie. 
diff --git a/openjury/prompts/mt_bench/user-multi-base.txt b/openjury/prompts/mt_bench/user-multi-base.txt new file mode 100644 index 0000000..33abb79 --- /dev/null +++ b/openjury/prompts/mt_bench/user-multi-base.txt @@ -0,0 +1,32 @@ +{reference_block}<|The Start of Assistant A's Conversation with User|> + +### User: +{{question_1}} + +### Assistant A: +{{answer_a_1}} + +### User: +{{question_2}} + +### Assistant A: +{{answer_a_2}} + +<|The End of Assistant A's Conversation with User|> + + +<|The Start of Assistant B's Conversation with User|> + +### User: +{{question_1}} + +### Assistant B: +{{answer_b_1}} + +### User: +{{question_2}} + +### Assistant B: +{{answer_b_2}} + +<|The End of Assistant B's Conversation with User|> diff --git a/openjury/prompts/mt_bench/user-multi-reference-block.txt b/openjury/prompts/mt_bench/user-multi-reference-block.txt new file mode 100644 index 0000000..703554d --- /dev/null +++ b/openjury/prompts/mt_bench/user-multi-reference-block.txt @@ -0,0 +1,16 @@ +<|The Start of Reference Answer|> + +### User: +{question_1} + +### Reference answer: +{ref_answer_1} + +### User: +{question_2} + +### Reference answer: +{ref_answer_2} + +<|The End of Reference Answer|> + diff --git a/openjury/prompts/mt_bench/user-single-base.txt b/openjury/prompts/mt_bench/user-single-base.txt new file mode 100644 index 0000000..ee7701c --- /dev/null +++ b/openjury/prompts/mt_bench/user-single-base.txt @@ -0,0 +1,10 @@ +[User Question] +{{question}} + +{reference_block}[The Start of Assistant A's Answer] +{{answer_a}} +[The End of Assistant A's Answer] + +[The Start of Assistant B's Answer] +{{answer_b}} +[The End of Assistant B's Answer] diff --git a/openjury/prompts/mt_bench/user-single-reference-block.txt b/openjury/prompts/mt_bench/user-single-reference-block.txt new file mode 100644 index 0000000..1b687d2 --- /dev/null +++ b/openjury/prompts/mt_bench/user-single-reference-block.txt @@ -0,0 +1,4 @@ +[The Start of Reference Answer] +{ref_answer_1} +[The End 
of Reference Answer] + diff --git a/openjury/utils.py b/openjury/utils.py index 56e57db..17b9a0d 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -50,6 +50,15 @@ def truncate(s: str, max_len: int | None = None) -> str: return s +def safe_text(value: object, truncate_chars: int | None) -> str: + if value is None: + return "" + is_missing = pd.isna(value) + if isinstance(is_missing, bool) and is_missing: + return "" + return truncate(str(value), max_len=truncate_chars) + + def do_inference(chat_model, inputs, use_tqdm: bool = False): # Retries on rate-limit/server errors with exponential backoff. # Async path retries individual calls; batch path splits into 4^attempt chunks on failure. diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index d58460e..c0c2f20 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -111,7 +111,7 @@ def test_generate_and_evaluate_correct_order_bias(tmp_path): ) avg_pref = sum(prefs) / len(prefs) - assert avg_pref == 0.5 + assert avg_pref == pytest.approx(0.5) def test_main_non_mt_bench_reuses_judge_turn(monkeypatch, tmp_path): @@ -286,4 +286,42 @@ def test_mt_bench_multi_turn_only(tmp_path): ) assert all(p > 0.5 for p in prefs) - assert len(prefs) == 5 # one annotation per question, turn 2 only \ No newline at end of file + assert len(prefs) == 5 # one annotation per question, turn 2 only + + +def test_mt_bench_fastchat_fixed_verdicts(tmp_path): + """FastChat-compatible MT-Bench judging uses [[A]]/[[B]]/[[C]] parsing.""" + prefs = main_generate_and_eval( + CliArgs( + dataset="mt-bench", + model_A="Dummy/answer A", + model_B="Dummy/answer B", + judge_model="Dummy/[[A]]", + n_instructions=5, + mt_bench_compatibility="fastchat", + result_folder=str(tmp_path), + ) + ) + + assert len(prefs) == 10 # two turns per question + assert all(p < 0.5 for p in prefs) + + +def test_mt_bench_fastchat_conservative_swap_mode(tmp_path): + """FastChat-compatible 
swap_mode='both' is conservative (tie if inconsistent).""" + prefs = main_generate_and_eval( + CliArgs( + dataset="mt-bench", + model_A="Dummy/answer A", + model_B="Dummy/answer B", + judge_model="Dummy/[[A]]", # position-A biased judge + n_instructions=3, + swap_mode="both", + mt_bench_compatibility="fastchat", + result_folder=str(tmp_path), + ) + ) + + # Conservative swap runs both orders, but returns one resolved verdict per match. + assert len(prefs) == 6 # 3 questions * 2 turns + assert all(p == pytest.approx(0.5) for p in prefs) \ No newline at end of file From f9832a06829ca34be5367a31a444948de7e948bb Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Mon, 9 Mar 2026 23:28:19 +0100 Subject: [PATCH 28/35] add support for mt-bench-101 --- openjury/generate_and_evaluate.py | 200 ++++++++++++- openjury/instruction_dataset/__init__.py | 11 + openjury/instruction_dataset/mt_bench_101.py | 125 ++++++++ openjury/mt_bench_101/__init__.py | 15 + openjury/mt_bench_101/evaluate.py | 283 ++++++++++++++++++ openjury/mt_bench_101/generate.py | 68 +++++ openjury/prompts/mt_bench_101/AR.txt | 13 + openjury/prompts/mt_bench_101/CC.txt | 17 ++ openjury/prompts/mt_bench_101/CM.txt | 15 + openjury/prompts/mt_bench_101/GR.txt | 15 + openjury/prompts/mt_bench_101/IC.txt | 16 + openjury/prompts/mt_bench_101/MR.txt | 15 + openjury/prompts/mt_bench_101/PI.txt | 15 + openjury/prompts/mt_bench_101/SA.txt | 13 + openjury/prompts/mt_bench_101/SC.txt | 13 + openjury/prompts/mt_bench_101/SI.txt | 13 + openjury/prompts/mt_bench_101/TS.txt | 15 + .../prompts/mt_bench_101/global_system.txt | 5 + openjury/prompts/mt_bench_101/rephrasing.txt | 13 + .../prompts/mt_bench_101/scoring_format.txt | 9 + openjury/utils.py | 2 + tests/test_generate_and_evaluate.py | 48 ++- tests/test_mt_bench_101.py | 139 +++++++++ tests/test_mt_bench_downloads.py | 9 +- 24 files changed, 1075 insertions(+), 12 deletions(-) create mode 100644 openjury/instruction_dataset/mt_bench_101.py create mode 100644 
openjury/mt_bench_101/__init__.py create mode 100644 openjury/mt_bench_101/evaluate.py create mode 100644 openjury/mt_bench_101/generate.py create mode 100644 openjury/prompts/mt_bench_101/AR.txt create mode 100644 openjury/prompts/mt_bench_101/CC.txt create mode 100644 openjury/prompts/mt_bench_101/CM.txt create mode 100644 openjury/prompts/mt_bench_101/GR.txt create mode 100644 openjury/prompts/mt_bench_101/IC.txt create mode 100644 openjury/prompts/mt_bench_101/MR.txt create mode 100644 openjury/prompts/mt_bench_101/PI.txt create mode 100644 openjury/prompts/mt_bench_101/SA.txt create mode 100644 openjury/prompts/mt_bench_101/SC.txt create mode 100644 openjury/prompts/mt_bench_101/SI.txt create mode 100644 openjury/prompts/mt_bench_101/TS.txt create mode 100644 openjury/prompts/mt_bench_101/global_system.txt create mode 100644 openjury/prompts/mt_bench_101/rephrasing.txt create mode 100644 openjury/prompts/mt_bench_101/scoring_format.txt create mode 100644 tests/test_mt_bench_101.py diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 4754b7e..f34f395 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -25,6 +25,13 @@ FASTCHAT_TEMPERATURE_CONFIG, judge_mt_bench_pairwise_fastchat, ) +from openjury.mt_bench_101.evaluate import ( + derive_mt_bench_101_pairwise_preferences, + judge_mt_bench_101_single, + summarize_mt_bench_101_absolute_scores, + summarize_mt_bench_101_pairwise, +) +from openjury.mt_bench_101.generate import generate_mt_bench_101_completions from openjury.utils import ( cache_function_dataframe, data_root, @@ -100,10 +107,17 @@ def __post_init__(self): assert ( self.swap_mode in supported_modes ), f"Only {supported_modes} modes are supported but got {self.swap_mode}." 
- supported_mt_bench_modes = ["openjury", "fastchat"] - assert ( - self.mt_bench_compatibility in supported_mt_bench_modes - ), f"Only {supported_mt_bench_modes} are supported but got {self.mt_bench_compatibility}." + if self.dataset == "mt-bench": + supported_mt_bench_modes = ["openjury", "fastchat"] + supported_mt_bench_turns = ["both", "single", "multi"] + assert self.mt_bench_compatibility in supported_mt_bench_modes, ( + f"Only {supported_mt_bench_modes} are supported but got " + f"{self.mt_bench_compatibility}." + ) + assert self.mt_bench_turns in supported_mt_bench_turns, ( + f"Only {supported_mt_bench_turns} are supported but got " + f"{self.mt_bench_turns}." + ) @classmethod def parse_args(cls): @@ -112,8 +126,8 @@ def parse_args(cls): ) parser.add_argument( "--dataset", - help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction " - "tuning cases or `french-contexts`, `spanish-contexts` for base models.", + help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU`, `mt-bench`, " + "`mt-bench-101` for instruction tuning cases or `french-contexts`, `spanish-contexts` for base models.", ) parser.add_argument( "--model_A", @@ -216,7 +230,11 @@ def parse_args(cls): help="Jinja2 chat template string to use instead of the model's tokenizer template. 
" "If not provided, ChatML is used as fallback for models without a chat template.", ) - parser.add_argument( + mt_bench_group = parser.add_argument_group( + "MT-Bench (original) options", + "These options only apply when --dataset is mt-bench.", + ) + mt_bench_group.add_argument( "--mt_bench_turns", type=str, choices=["both", "single", "multi"], @@ -225,7 +243,7 @@ def parse_args(cls): "'multi': only turn 2 (with full conversation context), " "'both' (default): evaluate both turns.", ) - parser.add_argument( + mt_bench_group.add_argument( "--mt_bench_compatibility", type=str, choices=["openjury", "fastchat"], @@ -591,6 +609,8 @@ def main(args: CliArgs): # MT-Bench has its own pipeline: multi-turn generation + category-aware judging if args.dataset == "mt-bench": return _run_mt_bench(args, ignore_cache) + if args.dataset == "mt-bench-101": + return _run_mt_bench_101(args, ignore_cache) # Currrently, we run context evaluation is_fluency_task = "fluency" in args.dataset @@ -799,6 +819,36 @@ def _run_generation(model_name: str) -> pd.DataFrame: return completions_a, completions_b +def _generate_mt_bench_101_completions( + args: CliArgs, + eval_items_df: pd.DataFrame, + ignore_cache: bool, +) -> tuple[pd.DataFrame, pd.DataFrame]: + def _run_generation(model_name: str) -> pd.DataFrame: + return generate_mt_bench_101_completions( + eval_items=eval_items_df, + model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + + completions_a = cache_function_dataframe( + lambda: _run_generation(args.model_A), + ignore_cache=ignore_cache, + cache_name=f"mt-bench-101_{args.model_A}_{args.n_instructions}", + ).set_index("instruction_index") + + completions_b = cache_function_dataframe( + lambda: _run_generation(args.model_B), + ignore_cache=ignore_cache, + cache_name=f"mt-bench-101_{args.model_B}_{args.n_instructions}", + 
).set_index("instruction_index") + return completions_a, completions_b + + def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str: name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" name += f"-{args.swap_mode}" @@ -807,14 +857,28 @@ def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str return name.replace("/", "_") +def _build_mt_bench_101_result_name(args: CliArgs, suffix: str | None = None) -> str: + name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + if suffix: + name += f"-{suffix}" + return name.replace("/", "_") + + def _save_mt_bench_results( *, args: CliArgs, results: dict[str, object], annotations_df: pd.DataFrame, name_suffix: str | None = None, + result_name: str | None = None, ) -> None: - name = _build_mt_bench_result_name(args, suffix=name_suffix) + if result_name is not None and name_suffix is not None: + raise ValueError("Provide only one of result_name or name_suffix.") + name = ( + result_name + if result_name is not None + else _build_mt_bench_result_name(args, suffix=name_suffix) + ) res_folder = Path(args.result_folder) / name res_folder.mkdir(parents=True, exist_ok=True) @@ -1029,6 +1093,124 @@ def _run_mt_bench_openjury( return prefs +def _run_mt_bench_101(args: CliArgs, ignore_cache: bool) -> pd.Series: + """MT-Bench-101 pipeline with paper-faithful single-answer grading.""" + if args.mt_bench_compatibility or args.mt_bench_turns: + print( + "MT-Bench-101 is a different benchmark from original MT-Bench. 
" + "--mt_bench_turns and --mt_bench_compatibility have no effect for this dataset, " + ) + if args.swap_mode: + print( + "--swap_mode has no effect for mt-bench-101 since it does single answer grading before comparing the models" + ) + + eval_items_df = load_instructions( + "mt-bench-101", n_instructions=args.n_instructions + ) + print( + "Generating completions from golden context for MT-Bench-101 with " + f"{args.model_A} and {args.model_B}." + ) + completions_a, completions_b = _generate_mt_bench_101_completions( + args=args, + eval_items_df=eval_items_df, + ignore_cache=ignore_cache, + ) + + judge_chat_model = make_model( + model=args.judge_model, + max_tokens=args.max_out_tokens_judge, + temperature=0.6, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + scored_a = judge_mt_bench_101_single( + judge_chat_model=judge_chat_model, + eval_items=eval_items_df, + completions=completions_a, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + scored_b = judge_mt_bench_101_single( + judge_chat_model=judge_chat_model, + eval_items=eval_items_df, + completions=completions_b, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + + absolute_a = summarize_mt_bench_101_absolute_scores(scored_turns=scored_a) + absolute_b = summarize_mt_bench_101_absolute_scores(scored_turns=scored_b) + pairwise_turns = derive_mt_bench_101_pairwise_preferences( + scored_a=scored_a, + scored_b=scored_b, + ) + pairwise_summary = summarize_mt_bench_101_pairwise(pairwise_turns=pairwise_turns) + dialogue_pairwise = pairwise_summary["dialogue_level"] + + print(f"{args.model_A} vs {args.model_B} judged by {args.judge_model}") + print( + "MT-Bench-101 dialogue-level pairwise winrate(A): " + f"{dialogue_pairwise['winrate']:.1%}" + ) + + ann_cols = [ + "instruction_index", + "dialogue_uid", + "dialogue_id", + "task", + "ability", + "turn_index", + "model_completion", + "judge_completion", + "score", + ] + 
annotations_a = scored_a.loc[:, ann_cols].copy() + annotations_a["evaluated_model"] = args.model_A + annotations_b = scored_b.loc[:, ann_cols].copy() + annotations_b["evaluated_model"] = args.model_B + annotations_df = pd.concat([annotations_a, annotations_b], ignore_index=True) + annotations_df = annotations_df.merge( + pairwise_turns.loc[ + :, ["instruction_index", "score_A", "score_B", "preference"] + ], + on="instruction_index", + how="left", + validate="many_to_one", + ) + + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + "judge_temperature": 0.6, + "evaluation_mode": "single_answer_grading", + "num_battles": dialogue_pairwise["num_battles"], + "winrate": dialogue_pairwise["winrate"], + "num_wins": dialogue_pairwise["num_wins"], + "num_losses": dialogue_pairwise["num_losses"], + "num_ties": dialogue_pairwise["num_ties"], + "num_missing": dialogue_pairwise["num_missing"], + "per_category": dialogue_pairwise["per_task"], + "model_A_scores": absolute_a, + "model_B_scores": absolute_b, + "pairwise": pairwise_summary, + "preferences": pairwise_summary["preferences"], + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + + _save_mt_bench_results( + args=args, + results=results, + annotations_df=annotations_df, + result_name=_build_mt_bench_101_result_name(args, suffix="mtbench_101"), + ) + return pd.Series(pairwise_summary["preferences"]) + + def _run_mt_bench(args: CliArgs, ignore_cache: bool): """MT-Bench pipeline (optionally FastChat-compatible).""" questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) diff --git a/openjury/instruction_dataset/__init__.py b/openjury/instruction_dataset/__init__.py index fc75155..2702084 100644 --- a/openjury/instruction_dataset/__init__.py +++ b/openjury/instruction_dataset/__init__.py @@ -4,11 +4,20 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame: + 
apply_head_after_index = True if dataset == "mt-bench": from openjury.instruction_dataset.mt_bench import load_mt_bench df_instructions = load_mt_bench() + elif dataset == "mt-bench-101": + from openjury.instruction_dataset.mt_bench_101 import load_mt_bench_101 + + # MT-Bench-101 is expanded into turn-level eval items in its loader. + # Keep n_instructions semantics as "number of dialogues to load". + df_instructions = load_mt_bench_101(n_dialogues=n_instructions) + apply_head_after_index = False + elif "m-arena-hard" in dataset: if dataset == "m-arena-hard": language = None @@ -64,6 +73,8 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat df_instructions = df_instructions.set_index("instruction_index").sort_index() print(f"Loaded {len(df_instructions)} instructions for {dataset}.") + if not apply_head_after_index: + return df_instructions if n_instructions is None: n_instructions = len(df_instructions) return df_instructions.head(n_instructions) diff --git a/openjury/instruction_dataset/mt_bench_101.py b/openjury/instruction_dataset/mt_bench_101.py new file mode 100644 index 0000000..032139d --- /dev/null +++ b/openjury/instruction_dataset/mt_bench_101.py @@ -0,0 +1,125 @@ +import json +from pathlib import Path +from urllib.request import urlretrieve + +import pandas as pd + +from openjury.utils import data_root + +MT_BENCH_101_DATA_URL = ( + "https://raw.githubusercontent.com/mtbench101/mt-bench-101/main/" + "data/subjective/mtbench101.jsonl" +) + +MT_BENCH_101_TURN2_ONLY_TASKS = {"CM", "AR", "CR", "FR", "SC", "SA"} +MT_BENCH_101_REFERENCE_TASKS = {"MR", "GR"} +MT_BENCH_101_TASK_TO_ABILITY = { + "CM": "perceptivity", + "AR": "perceptivity", + "SI": "perceptivity", + "TS": "perceptivity", + "CC": "perceptivity", + "CR": "adaptability", + "FR": "adaptability", + "SC": "adaptability", + "SA": "adaptability", + "MR": "adaptability", + "GR": "adaptability", + "IC": "interactivity", + "PI": "interactivity", +} + + +def 
download_mt_bench_101(local_dir: Path | None = None) -> Path: + """Download MT-Bench-101 JSONL dataset if missing and return its path.""" + if local_dir is None: + local_dir = data_root / "mt-bench-101" + + local_dir.mkdir(parents=True, exist_ok=True) + dataset_path = local_dir / "data" / "subjective" / "mtbench101.jsonl" + dataset_path.parent.mkdir(parents=True, exist_ok=True) + if dataset_path.exists(): + return dataset_path + + try: + urlretrieve(MT_BENCH_101_DATA_URL, dataset_path) + except Exception as exc: + raise RuntimeError( + "Failed to download MT-Bench-101 dataset from GitHub. " + "If running in a restricted network environment, manually place the file at " + f"{dataset_path} or point OPENJURY_DATA to a cache containing it." + ) from exc + + return dataset_path + + +def load_mt_bench_101( + n_dialogues: int | None = None, + local_dir: Path | None = None, +) -> pd.DataFrame: + """Load MT-Bench-101 and expand dialogues into turn-level evaluation items. + + The returned dataframe has one row per evaluated turn, using golden context. 
+ """ + dataset_path = download_mt_bench_101(local_dir=local_dir) + + records: list[dict] = [] + with dataset_path.open("r", encoding="utf-8") as handle: + for line in handle: + line = line.strip() + if line: + records.append(json.loads(line)) + + if n_dialogues is not None: + records = records[:n_dialogues] + + rows: list[dict] = [] + for rec in records: + task = rec.get("task") + if task not in MT_BENCH_101_TASK_TO_ABILITY: + raise ValueError(f"Unknown MT-Bench-101 task '{task}' in record: {rec}") + + dialogue_id = rec.get("id") + history = rec.get("history") + if not isinstance(history, list): + raise ValueError( + "Invalid MT-Bench-101 record: expected list in field 'history', " + f"got {type(history)}" + ) + + start_turn = 2 if task in MT_BENCH_101_TURN2_ONLY_TASKS else 1 + for turn_pos, turn in enumerate(history, start=1): + if turn_pos < start_turn: + continue + if not isinstance(turn, dict): + raise ValueError( + "Invalid MT-Bench-101 record: each turn in 'history' must be a dict." 
+ ) + + user_message = str(turn.get("user") or "") + reference_answer = str(turn.get("bot") or "") + golden_context = [ + { + "user": str(prev_turn.get("user") or ""), + "bot": str(prev_turn.get("bot") or ""), + } + for prev_turn in history[: turn_pos - 1] + ] + + rows.append( + { + "instruction_index": len(rows), + "dialogue_id": dialogue_id, + "dialogue_uid": f"{task}:{dialogue_id}", + "task": task, + "ability": MT_BENCH_101_TASK_TO_ABILITY[task], + "turn_index": turn_pos, + "golden_context": golden_context, + "user_message": user_message, + "reference_answer": reference_answer, + "requires_reference": task in MT_BENCH_101_REFERENCE_TASKS, + "instruction": user_message, + } + ) + + return pd.DataFrame(rows) diff --git a/openjury/mt_bench_101/__init__.py b/openjury/mt_bench_101/__init__.py new file mode 100644 index 0000000..abe2e9e --- /dev/null +++ b/openjury/mt_bench_101/__init__.py @@ -0,0 +1,15 @@ +from openjury.mt_bench_101.generate import generate_mt_bench_101_completions +from openjury.mt_bench_101.evaluate import ( + derive_mt_bench_101_pairwise_preferences, + judge_mt_bench_101_single, + summarize_mt_bench_101_absolute_scores, + summarize_mt_bench_101_pairwise, +) + +__all__ = [ + "derive_mt_bench_101_pairwise_preferences", + "generate_mt_bench_101_completions", + "judge_mt_bench_101_single", + "summarize_mt_bench_101_absolute_scores", + "summarize_mt_bench_101_pairwise", +] diff --git a/openjury/mt_bench_101/evaluate.py b/openjury/mt_bench_101/evaluate.py new file mode 100644 index 0000000..bfd2382 --- /dev/null +++ b/openjury/mt_bench_101/evaluate.py @@ -0,0 +1,283 @@ +import re +from functools import lru_cache +from pathlib import Path + +import pandas as pd +from langchain.prompts import ChatPromptTemplate + +from openjury.evaluate import PairScore +from openjury.instruction_dataset.mt_bench_101 import ( + MT_BENCH_101_REFERENCE_TASKS, + MT_BENCH_101_TASK_TO_ABILITY, +) +from openjury.utils import do_inference, safe_text + +PROMPTS_DIR = 
Path(__file__).resolve().parent.parent / "prompts" / "mt_bench_101" +DOUBLE_BRACKET_PATTERN = re.compile(r"\[\[(\d+)\]\]") + +TASK_PROMPT_FILES = { + "CM": "CM.txt", + "AR": "AR.txt", + "SI": "SI.txt", + "TS": "TS.txt", + "CC": "CC.txt", + "CR": "rephrasing.txt", + "FR": "rephrasing.txt", + "SC": "SC.txt", + "SA": "SA.txt", + "MR": "MR.txt", + "GR": "GR.txt", + "IC": "IC.txt", + "PI": "PI.txt", +} + + +@lru_cache(maxsize=1) +def load_mt_bench_101_prompts() -> dict[str, object]: + global_system = (PROMPTS_DIR / "global_system.txt").read_text() + scoring_format = (PROMPTS_DIR / "scoring_format.txt").read_text() + task_prompts = { + task: (PROMPTS_DIR / prompt_file).read_text() + for task, prompt_file in TASK_PROMPT_FILES.items() + } + return { + "global_system": global_system, + "scoring_format": scoring_format, + "task_prompts": task_prompts, + } + + +def parse_mt_bench_101_rating(judge_completion: str) -> float | None: + for match in DOUBLE_BRACKET_PATTERN.finditer(judge_completion): + score = int(match.group(1)) + if 1 <= score <= 10: + return float(score) + return None + + +def format_mt_bench_101_dialogue( + *, + golden_context: list[dict[str, str]], + user_message: str, + assistant_message: str, +) -> str: + chunks: list[str] = [] + for turn in golden_context: + chunks.append( + f"\n\n Human: {turn.get('user', '')}\n\nAssistant: {turn.get('bot', '')}" + ) + chunks.append(f"\n\n Human: {user_message}\n\nAssistant: {assistant_message}") + return "".join(chunks) + + +def judge_mt_bench_101_single( + *, + judge_chat_model, + eval_items: pd.DataFrame, + completions: pd.DataFrame, + truncate_input_chars: int | None = 8192, + use_tqdm: bool = False, +) -> pd.DataFrame: + prompts = load_mt_bench_101_prompts() + completion_by_idx = ( + completions + if "instruction_index" not in completions.columns + else completions.set_index("instruction_index") + ) + + rows: list[dict[str, object]] = [] + for idx in eval_items.index: + eval_row = eval_items.loc[idx] + completion_row 
= completion_by_idx.loc[idx] + task = str(eval_row["task"]) + model_response = safe_text( + completion_row.get("completion", ""), + truncate_input_chars, + ) + + dialogue = format_mt_bench_101_dialogue( + golden_context=eval_row.get("golden_context") or [], + user_message=safe_text(eval_row.get("user_message", ""), truncate_input_chars), + assistant_message=model_response, + ) + + user_prompt = ( + "The dialogue need to be judged is: \n *** \n " + f"{dialogue} \n ***" + ) + if task in MT_BENCH_101_REFERENCE_TASKS: + reference_answer = safe_text( + eval_row.get("reference_answer"), + truncate_input_chars, + ) + user_prompt += ( + "\n\nThe reference solution is: \n ### \n " + f"{reference_answer} \n ###\n\n" + ) + + system_prompt = ( + f"{prompts['global_system']}\n\n" + f"{prompts['task_prompts'][task]}\n\n" + f"{prompts['scoring_format']}" + ).strip() + + rows.append( + { + "instruction_index": idx, + "dialogue_id": eval_row["dialogue_id"], + "dialogue_uid": eval_row["dialogue_uid"], + "task": task, + "ability": eval_row.get("ability", MT_BENCH_101_TASK_TO_ABILITY[task]), + "turn_index": eval_row["turn_index"], + "model_completion": model_response, + "system_prompt": system_prompt, + "user_prompt": user_prompt, + } + ) + + prompt_template = ChatPromptTemplate.from_messages( + [("system", "{system_prompt}"), ("user", "{user_prompt}")] + ) + inputs = prompt_template.batch( + [ + {"system_prompt": row["system_prompt"], "user_prompt": row["user_prompt"]} + for row in rows + ] + ) + judge_completions = do_inference( + chat_model=judge_chat_model, + inputs=inputs, + use_tqdm=use_tqdm, + ) + + for row, judge_completion in zip(rows, judge_completions): + row["judge_completion"] = judge_completion + row["score"] = parse_mt_bench_101_rating(judge_completion) + + return pd.DataFrame(rows) + + +def compute_mt_bench_101_dialogue_scores(scored_turns: pd.DataFrame) -> pd.DataFrame: + grouped = ( + scored_turns.groupby(["dialogue_uid", "dialogue_id", "task", "ability"], 
as_index=False)[ + "score" + ].min() + ) + grouped = grouped.rename(columns={"score": "dialogue_score"}) + return grouped + + +def summarize_mt_bench_101_absolute_scores(scored_turns: pd.DataFrame) -> dict[str, object]: + dialogue_scores = compute_mt_bench_101_dialogue_scores(scored_turns=scored_turns) + per_task_series = dialogue_scores.groupby("task")["dialogue_score"].mean().sort_index() + per_ability_series = ( + dialogue_scores.groupby("ability")["dialogue_score"].mean().sort_index() + ) + overall = per_task_series.mean() if len(per_task_series) else float("nan") + + return { + "num_turns": int(len(scored_turns)), + "num_scored_turns": int(scored_turns["score"].notna().sum()), + "per_task": { + task: float(score) + for task, score in per_task_series.items() + if pd.notna(score) + }, + "per_ability": { + ability: float(score) + for ability, score in per_ability_series.items() + if pd.notna(score) + }, + "overall": float(overall) if pd.notna(overall) else None, + } + + +def derive_mt_bench_101_pairwise_preferences( + scored_a: pd.DataFrame, + scored_b: pd.DataFrame, +) -> pd.DataFrame: + cols = ["instruction_index", "dialogue_uid", "dialogue_id", "task", "ability", "turn_index"] + merged = scored_a.loc[:, cols + ["score"]].rename(columns={"score": "score_A"}).merge( + scored_b.loc[:, cols + ["score"]].rename(columns={"score": "score_B"}), + on=cols, + how="inner", + ) + + scorer = PairScore() + preferences = [] + for _, row in merged.iterrows(): + score_a = row["score_A"] + score_b = row["score_B"] + if pd.isna(score_a) or pd.isna(score_b): + preferences.append(None) + continue + preferences.append(float(scorer.preference_from_scores(score_a, score_b))) + merged["preference"] = preferences + return merged + + +def _compute_preference_stats(preferences: pd.Series) -> dict[str, float | int]: + tie_tol = 1e-12 + valid = preferences.dropna() + num_wins = int(sum(valid < 0.5 - tie_tol)) + num_losses = int(sum(valid > 0.5 + tie_tol)) + num_ties = int(sum((valid >= 0.5 
- tie_tol) & (valid <= 0.5 + tie_tol))) + num_battles = len(preferences) + num_missing = num_battles - (num_wins + num_losses + num_ties) + denom = num_wins + num_losses + num_ties + winrate = float((num_wins + 0.5 * num_ties) / denom) if denom else 0.0 + return { + "num_battles": num_battles, + "num_wins": num_wins, + "num_losses": num_losses, + "num_ties": num_ties, + "num_missing": num_missing, + "winrate": winrate, + } + + +def _grouped_preference_stats( + pairwise_df: pd.DataFrame, + group_by: str, +) -> dict[str, dict[str, float | int]]: + grouped: dict[str, list[float]] = {} + for _, row in pairwise_df.iterrows(): + key = row[group_by] + grouped.setdefault(key, []).append(row["preference"]) + return { + key: _compute_preference_stats(pd.Series(values)) + for key, values in grouped.items() + } + + +def summarize_mt_bench_101_pairwise(pairwise_turns: pd.DataFrame) -> dict[str, object]: + turn_preferences = pairwise_turns["preference"] + turn_level = { + **_compute_preference_stats(turn_preferences), + "per_task": _grouped_preference_stats(pairwise_turns, "task"), + "per_ability": _grouped_preference_stats(pairwise_turns, "ability"), + } + + dialogue_scores = ( + pairwise_turns.groupby(["dialogue_uid", "dialogue_id", "task", "ability"], as_index=False)[ + ["score_A", "score_B"] + ].min() + ) + scorer = PairScore() + dialogue_scores["preference"] = [ + float(scorer.preference_from_scores(score_a, score_b)) + if pd.notna(score_a) and pd.notna(score_b) + else None + for score_a, score_b in zip(dialogue_scores["score_A"], dialogue_scores["score_B"]) + ] + dialogue_level = { + **_compute_preference_stats(dialogue_scores["preference"]), + "per_task": _grouped_preference_stats(dialogue_scores, "task"), + "per_ability": _grouped_preference_stats(dialogue_scores, "ability"), + } + + return { + "turn_level": turn_level, + "dialogue_level": dialogue_level, + "preferences": [None if pd.isna(x) else float(x) for x in turn_preferences], + } diff --git 
from typing import Any

import pandas as pd
from langchain.prompts import ChatPromptTemplate

from openjury.utils import do_inference, make_model, truncate

DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."


def _build_golden_context_input(
    *,
    system_prompt: str,
    golden_context: list[dict[str, str]],
    user_message: str,
    truncate_input_chars: int | None,
):
    """Build the chat prompt for one turn from its golden (reference) history.

    Args:
        system_prompt: Text of the leading system message.
        golden_context: Earlier turns; each dict is read through its
            ``"user"`` and ``"bot"`` keys (missing or falsy values become
            ``""``).
        user_message: The user message the model has to answer.
        truncate_input_chars: Forwarded to ``truncate`` as ``max_len`` for
            every user and assistant message.

    Returns:
        The prompt value produced by ``ChatPromptTemplate.invoke``: the
        system message, the alternating user/assistant history, and the
        final user message.
    """
    conversation: list[tuple[str, str]] = [("system", system_prompt)]
    for turn in golden_context:
        conversation.append(
            ("user", truncate(str(turn.get("user") or ""), max_len=truncate_input_chars))
        )
        conversation.append(
            (
                "assistant",
                truncate(str(turn.get("bot") or ""), max_len=truncate_input_chars),
            )
        )
    conversation.append(("user", truncate(user_message, max_len=truncate_input_chars)))

    # ChatPromptTemplate parses the string of a (role, text) tuple as an
    # f-string template, so a literal "{" or "}" in the dialogue (code, JSON,
    # LaTeX) would be read as a template variable. Bind every message through
    # its own placeholder so the text is inserted verbatim.
    template = ChatPromptTemplate.from_messages(
        [(role, f"{{message_{i}}}") for i, (role, _) in enumerate(conversation)]
    )
    variables = {f"message_{i}": text for i, (_, text) in enumerate(conversation)}
    return template.invoke(variables)


def generate_mt_bench_101_completions(
    eval_items: pd.DataFrame,
    model: str,
    truncate_input_chars: int | None = 8192,
    max_tokens: int | None = 8192,
    use_tqdm: bool = True,
    system_prompt: str = DEFAULT_SYSTEM_PROMPT,
    **model_kwargs: Any,
) -> pd.DataFrame:
    """Generate MT-Bench-101 responses from golden context eval items.

    Args:
        eval_items: One row per turn to answer; the code reads
            ``golden_context``, ``user_message``, ``dialogue_id``,
            ``dialogue_uid``, ``task``, ``ability`` and ``turn_index``.
        model: Model identifier forwarded to ``make_model``.
        truncate_input_chars: Character limit applied to every dialogue
            message before it enters the prompt.
        max_tokens: Forwarded to ``make_model``.
        use_tqdm: Forwarded to ``do_inference``.
        system_prompt: System message placed in front of every dialogue.
        **model_kwargs: Extra keyword arguments forwarded to ``make_model``.

    Returns:
        A DataFrame with ``instruction_index`` (the index labels of
        ``eval_items``), the turn metadata columns and ``completion``.
        NOTE(review): assumes ``do_inference`` returns one completion per
        input, in input order - confirm against its implementation.
    """
    chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs)

    inputs = [
        _build_golden_context_input(
            system_prompt=system_prompt,
            golden_context=row.get("golden_context") or [],
            user_message=str(row.get("user_message") or ""),
            truncate_input_chars=truncate_input_chars,
        )
        for _, row in eval_items.iterrows()
    ]

    completions = do_inference(chat_model=chat_model, inputs=inputs, use_tqdm=use_tqdm)

    # Copy metadata positionally: a per-label ``.loc[idx, col]`` lookup
    # returns a Series instead of a scalar when index labels repeat.
    metadata_columns = ["dialogue_id", "dialogue_uid", "task", "ability", "turn_index"]
    result: dict[str, list] = {"instruction_index": eval_items.index.tolist()}
    for column in metadata_columns:
        result[column] = eval_items[column].tolist()
    result["completion"] = completions
    return pd.DataFrame(result)
+ +In addition to the score, please provide an explanation that specifically addresses how the AI assistant's response demonstrates its ability or inability to understand and use referential information in accordance with the criteria above. diff --git a/openjury/prompts/mt_bench_101/CC.txt b/openjury/prompts/mt_bench_101/CC.txt new file mode 100644 index 0000000..f9d0bb6 --- /dev/null +++ b/openjury/prompts/mt_bench_101/CC.txt @@ -0,0 +1,17 @@ +The AI assistant's capability to resist interference will be measured against these criteria: + + +1. The AI assistant's response must directly correspond to the content of the Human's question in the current round, providing true and accurate information. +2. The response must not be influenced by the question and answer pattern from the previous dialogue, ensuring that it remains relevant and focused on the current question only. + + +Scoring Guidelines: + + +- 1-3 points: The AI assistant's response is largely influenced by previous interactions, fails to address the current question accurately, or provides false information. +- 4-6 points: The AI assistant's response shows some resistance to interference but includes irrelevant details from previous dialogues or only partially addresses the current question. +- 7-9 points: The AI assistant's response is mostly resistant to interference and accurately addresses the current question, with only minor relevancies to previous interactions. +- 10 points: The AI assistant's response is completely free from interference, focusing solely on the current question and providing a response that is both accurate and wholly relevant. + + +Please provide a brief justification for the score you give, focusing on how well the AI assistant's response aligns with the two evaluation criteria. 
diff --git a/openjury/prompts/mt_bench_101/CM.txt b/openjury/prompts/mt_bench_101/CM.txt new file mode 100644 index 0000000..db8beb7 --- /dev/null +++ b/openjury/prompts/mt_bench_101/CM.txt @@ -0,0 +1,15 @@ +The capacity of a large language model to recall and utilize previously mentioned information from earlier in the conversation is a critical indicator of its conversational memory abilities. This competency is essential for maintaining context and coherence throughout an extended dialogue. The performance of the AI assistant should be evaluated based on its ability to consistently reference and integrate past information into current responses. The evaluation criteria are as follows: + +1.Analyze whether the AI assistant appropriately recalls relevant details from earlier parts of the conversation when responding to 'Human's inquiries or comments. +2.Assess the AI assistant's ability to integrate the remembered information into its current responses in a way that is coherent and adds value to the dialogue. +3.Examine the AI assistant's consistency in maintaining the context established by previous dialogue exchanges throughout the entire conversation. +4.Evaluate the effectiveness of the AI assistant's memory recall in facilitating a smooth and logical progression of the conversation, avoiding repetitive or contradictory statements. +Scoring Guidelines: + +1-3 points: The AI assistant demonstrates poor recall of previous conversation details, leading to inconsistent or contradictory responses, and fails to maintain the dialogue's context, resulting in a disjointed or unclear conversation flow. +4-6 points: The AI assistant exhibits a moderate ability to remember past information, but its integration into the conversation is sporadic or partially effective, leading to a conversation that lacks full coherence or occasionally disregards established context. 
+7-9 points: The AI assistant reliably recalls and utilizes earlier information, contributing to a coherent dialogue that respects the conversation's context, with minor lapses in memory that do not significantly disrupt the conversation flow. +10 points: The AI assistant demonstrates exceptional memory recall, seamlessly weaving past details into current responses to enrich the dialogue and preserve context, ensuring a smooth and logical conversation that progresses naturally. +When scoring, consider the significance of the AI assistant's memory recall to the overall quality of the conversation. If recalling past information was not necessary for a particular exchange, the AI assistant's failure to reference earlier dialogue should not impact the score negatively. However, if recalling previous information enhances the dialogue's clarity, relevance, and continuity, this should be regarded as a positive attribute of the language model's performance. + +Please provide a rationale for your score, specifically addressing how the AI assistant's memory recall and the use of past information align with the evaluation criteria and contribute to the conversation's effectiveness. diff --git a/openjury/prompts/mt_bench_101/GR.txt b/openjury/prompts/mt_bench_101/GR.txt new file mode 100644 index 0000000..42968e9 --- /dev/null +++ b/openjury/prompts/mt_bench_101/GR.txt @@ -0,0 +1,15 @@ +The AI assistant's general reasoning capabilities are crucial for accurately addressing and explaining a wide range of problems posed by 'Human'. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the cogency of its reasoning process. The evaluation criteria are as follows: + +1. Verify the accuracy of the AI assistant's answer against the provided reference solution in format '### reference solution ###' for the specific problem. +2. 
Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows the principles of sound reasoning. +3. Evaluate the AI assistant's ability to integrate any relevant historical dialogue information that influences the problem-solving process or the solution itself. +4. Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension. +Scoring Guidelines: + +1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not adhere to standards of sound reasoning. +4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps but generally follows sound reasoning principles. +7-9 points: The AI assistant gives correct answers with a well-articulated reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution. +10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with sound reasoning principles and enhances 'Human's understanding. +When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score. + +Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the general reasoning process, considering the evaluation criteria and the comparison with the reference solution. 
diff --git a/openjury/prompts/mt_bench_101/IC.txt b/openjury/prompts/mt_bench_101/IC.txt new file mode 100644 index 0000000..db991a7 --- /dev/null +++ b/openjury/prompts/mt_bench_101/IC.txt @@ -0,0 +1,16 @@ +The AI assistant's ability to engage in a productive dialogue is often enhanced by its use of counter-questions, particularly when dealing with incomplete or vague queries. The assistant's performance should be assessed based on its ability to recognize when a rhetorical question is necessary and to use it effectively to clarify the 'Human's intent. The evaluation criteria are as follows: + +1. Assess whether the question posed by 'Human' contains ambiguities or lacks specific details that would require the AI assistant to use a counter-questions for clarification. +2. If the question does require clarification through a counter-question, evaluate how the AI assistant employs this strategy to address the ambiguities or missing information in 'Human's query. +3. Once 'Human' provides the necessary conditions or clarifies the question, evaluate whether the AI assistant offers a true and detailed response that fully addresses the clarified query. + +Scoring Guidelines: + +- 1-3 points: The AI assistant fails to identify the need for a rhetorical question when necessary, or it employs rhetorical questions ineffectively, leading to answers that do not align with 'Human's query, or lack the detail required to fully clarify the question. +- 4-6 points: The AI assistant recognizes situations requiring rhetorical questions but uses them suboptimally, only partially addressing the query's deficiencies. Subsequent answers may lack full detail or accuracy even after the query is clarified. +- 7-9 points: The AI assistant effectively uses rhetorical questions to pinpoint and address the missing or unclear elements in 'Human's query, and provides a largely accurate and detailed response to the perfected question. 
+- 10 points: The AI assistant expertly discerns when to use rhetorical questions and employs them precisely to address the ambiguities or missing information in the query. Once clarified, it responds with detailed, accurate information that perfectly satisfies the question. + +When scoring, consider whether the use of a counter-question was essential and whether the AI assistant's decision to use or not use one improved the clarity and outcome of the dialogue. If a counter-question was not necessary, and the AI assistant refrained from using one, this should not negatively affect the score. However, if the use of a rhetorical question or follow-up query by the AI assistant brought clarity to an otherwise ambiguous situation, this should be seen as a positive contribution to the dialogue. + +Please provide a rationale for your score, specifically addressing how the AI assistant's use or omission of rhetorical questions and its responses align with the evaluation criteria and the necessity of such an approach for each particular query. diff --git a/openjury/prompts/mt_bench_101/MR.txt b/openjury/prompts/mt_bench_101/MR.txt new file mode 100644 index 0000000..4315e11 --- /dev/null +++ b/openjury/prompts/mt_bench_101/MR.txt @@ -0,0 +1,15 @@ +The AI assistant's mathematical reasoning capabilities are vital for accurately solving and explaining mathematical problems posed by 'Human'. The model should leverage both the conditions provided in the current question and any relevant information from the historical dialogue. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the clarity of its reasoning process. The evaluation criteria are as follows: + +1. Verify the accuracy of the AI assistant's answer against the provided reference solution in the format '### reference solution ###' for the mathematical problem. +2. 
Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows mathematical principles. +3. Evaluate the AI assistant's ability to incorporate any relevant historical dialogue information that influences the problem-solving process or the solution itself. +4. Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension. +Scoring Guidelines: + +1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not align with mathematical standards. +4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps, but generally follows mathematical principles. +7-9 points: The AI assistant gives correct answers with a reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution. +10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with mathematical principles and enhances 'Human's understanding. +When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex mathematical solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score. + +Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the mathematical reasoning process, considering the evaluation criteria and the comparison with the reference solution. 
diff --git a/openjury/prompts/mt_bench_101/PI.txt b/openjury/prompts/mt_bench_101/PI.txt new file mode 100644 index 0000000..8702484 --- /dev/null +++ b/openjury/prompts/mt_bench_101/PI.txt @@ -0,0 +1,15 @@ +The AI assistant's interactivity, represented by its ability to proactively initiate and sustain engaging dialogues with 'Human', is a key aspect of a dynamic conversational experience. The model should not only respond passively but should also contribute to the momentum of the conversation by introducing questions, suggesting topics, or encouraging further discourse. The performance of the AI assistant should be evaluated on its capacity for active engagement and conversational leadership. The evaluation criteria are as follows: + +1. Observe the AI assistant's initiative in contributing to the conversation beyond providing direct answers, including its ability to ask relevant follow-up questions or propose new topics. +2. Assess the AI assistant's aptness in maintaining the flow of the conversation, including how well it encourages 'Human' to provide more information or share their thoughts. +3. Examine the appropriateness of the AI assistant's interactive elements in the context of the dialogue, ensuring they foster a natural and engaging conversation rather than derailing it. +4. Evaluate the AI assistant's responsiveness to 'Human's input while being proactive, ensuring that it listens and adapts to the conversation's direction as set by 'Human'. +Scoring Guidelines: + +1-3 points: The AI assistant exhibits poor interactivity, often providing minimal responses without encouraging further dialogue, or its attempts at interactivity are misplaced and hamper the natural flow of conversation. +4-6 points: The AI assistant demonstrates moderate interactivity; it occasionally asks questions or suggests new topics but may not consistently maintain the conversational momentum or fully engage 'Human'. 
+7-9 points: The AI assistant is highly interactive, regularly using questions and topics to keep the conversation going, while mostly preserving relevancy and a natural exchange with 'Human'. +10 points: The AI assistant excels at interactivity, skillfully using questions and dialogue prompts to enrich the conversation, actively engaging 'Human', and enhancing the overall dialogue experience without dominating the conversation. +When scoring, consider the balance the AI assistant strikes between guiding the conversation and allowing 'Human' to steer the dialogue. The AI assistant's interactivity should feel like a natural extension of the conversation, not forced or distracting from 'Human's intent. If the conversation benefits from the AI assistant's interactive elements, leading to a richer dialogue, this should be reflected in a higher score. + +Please provide a rationale for your score, specifically addressing how the AI assistant's proactive contributions and interactive strategies align with the evaluation criteria and enrich the conversational experience. diff --git a/openjury/prompts/mt_bench_101/SA.txt b/openjury/prompts/mt_bench_101/SA.txt new file mode 100644 index 0000000..03fb0c0 --- /dev/null +++ b/openjury/prompts/mt_bench_101/SA.txt @@ -0,0 +1,13 @@ +We aim to specifically evaluate the ability of the large language model (AI assistant) to maintain confidence in its original response when challenged by 'Human', provided that the challenge is incorrect. The criteria for evaluation are as follows: + +1. When 'Human' questions the content of the AI assistant's response, the AI assistant must engage with 'Human's' query and assess the validity of the challenge. +2. If 'Human's challenge is based on incorrect information or assumptions, the AI assistant must confidently maintain its original answer and provide a clear and reasoned explanation to justify its stance. 
+ +Scoring Guidelines: + +- 1-3 points: The AI assistant either does not engage with 'Human's' challenge appropriately or changes its original response without proper justification, even when 'Human's' challenge is incorrect. +- 4-6 points: The AI assistant engages with the challenge but may lack confidence in its response, providing a weak justification for maintaining its original answer. +- 7-9 points: The AI assistant demonstrates the ability to appropriately assess the incorrect challenge and maintains its original response with a clear and well-supported justification. +- 10 points: The AI assistant exhibits excellent ability to maintain confidence in its original response, providing a strong and convincing explanation that effectively addresses 'Human's' incorrect challenge. + +In addition to scoring, please provide a justification for your assessment, focusing on how the AI assistant's reaction to the challenge reflects its understanding and confidence in its original response, and how well it meets the criteria outlined above. diff --git a/openjury/prompts/mt_bench_101/SC.txt b/openjury/prompts/mt_bench_101/SC.txt new file mode 100644 index 0000000..1472a0d --- /dev/null +++ b/openjury/prompts/mt_bench_101/SC.txt @@ -0,0 +1,13 @@ +We aim to specifically evaluate the self-correction ability of the large language model (AI assistant) when the 'Human' identifies an error in the AI assistant's initial response. The criteria for evaluation are as follows: + +1. Upon 'Human' pointing out a potential mistake, the AI assistant must thoroughly assess its previous response and engage with 'Human's' observation. +2. If the 'Human' has correctly identified an error, the AI assistant must acknowledge the mistake, correct it, and provide an updated and accurate response. + +Scoring Guidelines: + +- 1-3 points: The AI assistant fails to recognize or adequately address the error identified by 'Human,' and does not make the necessary corrections to its response. 
+- 4-6 points: The AI assistant recognizes the error identified by 'Human' but may only partially correct the mistake or provide an incomplete updated response. +- 7-9 points: The AI assistant correctly identifies and acknowledges the error, making a substantial correction to its response and effectively updating the answer. +- 10 points: The AI assistant exhibits exceptional self-correction ability, promptly acknowledging the error and providing a comprehensive and precise updated response. + +In addition to scoring, please provide a justification for your assessment, focusing on how effectively the AI assistant's reaction to 'Human's' identified error demonstrates its ability to self-correct and address the criteria outlined above. diff --git a/openjury/prompts/mt_bench_101/SI.txt b/openjury/prompts/mt_bench_101/SI.txt new file mode 100644 index 0000000..61ffe78 --- /dev/null +++ b/openjury/prompts/mt_bench_101/SI.txt @@ -0,0 +1,13 @@ +We aim to specifically evaluate the command-following ability of the large language model (AI assistant). The criteria for evaluation are as follows: + +1. In the first round, 'Human' will present a task request without providing details about what needs to be done. If the AI Assistant being evaluated generates a response for the first round, it should ask 'Human' for the specific details of the task required or wait for 'Human' to provide specific details of the required tasks, rather than directly attempting to answer the task. +2. Starting from the second round, 'Human' will provide the specific content of what needs to be carried out for the task, without repeating the task requirement. The AI Assistant being evaluated should then provide correct and specific answers directly addressing the task requirements. 
+ +Please rate the AI assistant's response using a 1 to 10 scale based on the following guidelines: + +- 1-3 points: The AI assistant failed to understand the task request and neither asked relevant questions nor provided information related to the task. +- 4-6 points: The AI assistant understood some aspects of the task request but the response could be more specific or relevant. +- 7-9 points: The AI assistant provided a useful response that was mostly correct and targeted, even though there may be minor oversights. +- 10 points: The AI assistant demonstrated a perfect understanding of the task requirements and provided a comprehensive and accurate answer, fully meeting 'Human's expectations. + +Additionally, please provide a brief justification for the score given, particularly highlighting how the AI assistant's response aligns with or deviates from the above criteria. This will help us understand the performance of the AI assistant and take steps for improvement if necessary. diff --git a/openjury/prompts/mt_bench_101/TS.txt b/openjury/prompts/mt_bench_101/TS.txt new file mode 100644 index 0000000..2bbc354 --- /dev/null +++ b/openjury/prompts/mt_bench_101/TS.txt @@ -0,0 +1,15 @@ +The AI assistant's ability to handle shifts in conversation topics is crucial for maintaining relevance and adaptability during a dialogue. This skill is particularly important when 'Human' introduces a new topic or changes the subject abruptly. The performance of the AI assistant should be evaluated on its capacity to smoothly transition between topics without being inappropriately influenced by previous dialogue content. The evaluation criteria are as follows: + +1. Identify whether the AI assistant can detect and acknowledge the change in topic introduced by 'Human' without reverting back to or becoming stuck on the previous subject. +2. 
Evaluate the relevance of the AI assistant's responses to the new topic, ensuring they are not improperly influenced or colored by the preceding dialogue rounds. +3. Assess the AI assistant's ability to provide coherent and contextually appropriate responses to the new subject, displaying an understanding of the conversation's evolving nature. +4. Consider the AI assistant's proficiency in offering complete and insightful answers to the new topic, which demonstrate a clear break from past conversation threads. +Scoring Guidelines: + +1-3 points: The AI assistant struggles with topic transitions, frequently reverting to or being influenced by the previous topic, resulting in irrelevant or confused responses to the new subject matter. +4-6 points: The AI assistant shows a moderate ability to adapt to new topics, but occasionally exhibits lingering effects from earlier discussions, leading to partially relevant or less focused responses to the topic shifts. +7-9 points: The AI assistant adapts to topic changes well, with minimal reference to or influence from prior topics, providing responses that are largely relevant and well-aligned with the new conversation direction. +10 points: The AI assistant excels at adapting to topic shifts, seamlessly transitioning to and fully engaging with the new subject matter without any irrelevant carryover from previous dialogue content. +When scoring, consider the smoothness of the AI assistant's transition between topics and its ability to engage with the new subject matter independently of the prior conversation. If a topic shift is not present or is so subtle that continuity with previous content is warranted, the AI assistant's ability to maintain coherence should not negatively affect the score. However, if a clear topic shift occurs and the AI assistant handles it deftly, providing relevant and insightful input on the new topic, this should be recognized as a positive aspect of its conversational capabilities. 
+ +Please provide a rationale for your score, specifically addressing the effectiveness of the AI assistant's topic transition and its relevance to the new subject matter in accordance with the evaluation criteria. diff --git a/openjury/prompts/mt_bench_101/global_system.txt b/openjury/prompts/mt_bench_101/global_system.txt new file mode 100644 index 0000000..ca94e9f --- /dev/null +++ b/openjury/prompts/mt_bench_101/global_system.txt @@ -0,0 +1,5 @@ +Please act as an impartial judge following these instructions: In the following conversations, the response of the "assistant" in the last round of conversations is the output of the large language model (AI assistant) that needs to be evaluated. + +Please act as an impartial judge and score this response on a scale of 1 to 10, where 1 indicates that the response completely fails to meet the criteria, and 10 indicates that the response perfectly meets all the evaluation criteria. + +Note that only the response of the "assistant" in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated; the previous conversations are the ground truth history which do NOT need to be evaluated. diff --git a/openjury/prompts/mt_bench_101/rephrasing.txt b/openjury/prompts/mt_bench_101/rephrasing.txt new file mode 100644 index 0000000..9bd5e9c --- /dev/null +++ b/openjury/prompts/mt_bench_101/rephrasing.txt @@ -0,0 +1,13 @@ +We aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows: + +1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round. +2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round. 
+ +Scoring Guidelines: + +- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements. +- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence. +- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues. +- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes. + +Please provide a brief justification for the score you give and present your score. Please judge the response and Do Not answer the question in the dialogue directly. diff --git a/openjury/prompts/mt_bench_101/scoring_format.txt b/openjury/prompts/mt_bench_101/scoring_format.txt new file mode 100644 index 0000000..b28f9eb --- /dev/null +++ b/openjury/prompts/mt_bench_101/scoring_format.txt @@ -0,0 +1,9 @@ +Note that only the response of the "assistant" in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated. + +You must provide your explanation. After providing your explanation, show the score by strictly following this format: "Rating: [[score]]", for example "Rating: [[6]]". 
+ +The DIALOGUE needs to be judged in this format: + +*** +DIALOGUE +*** diff --git a/openjury/utils.py b/openjury/utils.py index 17b9a0d..2079424 100644 --- a/openjury/utils.py +++ b/openjury/utils.py @@ -486,6 +486,7 @@ def make_model( def download_all(): from openjury.instruction_dataset.mt_bench import download_mt_bench + from openjury.instruction_dataset.mt_bench_101 import download_mt_bench_101 print(f"Downloading all dataset in {data_root}") for dataset in ["alpaca-eval", "arena-hard", "m-arena-hard"]: @@ -501,6 +502,7 @@ def download_all(): ) download_mt_bench() + download_mt_bench_101() class Timeblock: diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index c0c2f20..8b92d27 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -42,8 +42,36 @@ def mock_external_data_and_cache(monkeypatch): ) mt_bench_questions["instruction"] = mt_bench_questions["turn_1"] + mt_bench_101_eval_items = pd.DataFrame( + { + "dialogue_id": [0, 0, 1], + "dialogue_uid": ["CM:0", "CM:0", "MR:1"], + "task": ["CM", "CM", "MR"], + "ability": ["perceptivity", "perceptivity", "adaptability"], + "turn_index": [2, 3, 1], + "golden_context": [ + [{"user": "CM user 1", "bot": "CM bot 1"}], + [ + {"user": "CM user 1", "bot": "CM bot 1"}, + {"user": "CM user 2", "bot": "CM bot 2"}, + ], + [], + ], + "user_message": ["CM user 2", "CM user 3", "MR user 1"], + "reference_answer": ["CM ref 2", "CM ref 3", "MR ref 1"], + "requires_reference": [False, False, True], + "instruction": ["CM user 2", "CM user 3", "MR user 1"], + }, + index=pd.Index(range(3), name="instruction_index"), + ) + def _load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame: - df = mt_bench_questions if dataset == "mt-bench" else single_turn_instructions + if dataset == "mt-bench": + df = mt_bench_questions + elif dataset == "mt-bench-101": + df = mt_bench_101_eval_items + else: + df = single_turn_instructions return 
df.head(n_instructions) if n_instructions is not None else df monkeypatch.setattr( @@ -324,4 +352,20 @@ def test_mt_bench_fastchat_conservative_swap_mode(tmp_path): # Conservative swap runs both orders, but returns one resolved verdict per match. assert len(prefs) == 6 # 3 questions * 2 turns - assert all(p == pytest.approx(0.5) for p in prefs) \ No newline at end of file + assert all(p == pytest.approx(0.5) for p in prefs) + + +def test_mt_bench_101_pipeline(tmp_path): + prefs = main_generate_and_eval( + CliArgs( + dataset="mt-bench-101", + model_A="Dummy/model-a-response", + model_B="Dummy/model-b-response", + judge_model="Dummy/Explanation.\nRating: [[10]]", + result_folder=str(tmp_path), + ) + ) + + # Both models receive the same constant judge score, so all pairwise prefs are ties. + assert len(prefs) == 3 + assert all(float(pref) == pytest.approx(0.5) for pref in prefs) diff --git a/tests/test_mt_bench_101.py b/tests/test_mt_bench_101.py new file mode 100644 index 0000000..b51d8a7 --- /dev/null +++ b/tests/test_mt_bench_101.py @@ -0,0 +1,139 @@ +import json + +import pandas as pd +import pytest + +import openjury.instruction_dataset.mt_bench_101 as mt_bench_101_dataset +from openjury.mt_bench_101.evaluate import ( + derive_mt_bench_101_pairwise_preferences, + judge_mt_bench_101_single, + parse_mt_bench_101_rating, + summarize_mt_bench_101_absolute_scores, + summarize_mt_bench_101_pairwise, +) +from openjury.utils import DummyModel + + +def test_load_mt_bench_101_turn_expansion(tmp_path, monkeypatch): + dataset_path = tmp_path / "mtbench101.jsonl" + records = [ + { + "task": "CM", + "id": 1, + "history": [ + {"user": "u1", "bot": "b1"}, + {"user": "u2", "bot": "b2"}, + ], + }, + { + "task": "PI", + "id": 2, + "history": [ + {"user": "x1", "bot": "y1"}, + {"user": "x2", "bot": "y2"}, + ], + }, + ] + dataset_path.write_text( + "\n".join(json.dumps(record) for record in records) + "\n", + encoding="utf-8", + ) + monkeypatch.setattr( + mt_bench_101_dataset, + 
"download_mt_bench_101", + lambda local_dir=None: dataset_path, + ) + + eval_items = mt_bench_101_dataset.load_mt_bench_101() + + # CM starts at turn 2 (1 row), PI starts at turn 1 (2 rows) => total 3. + assert len(eval_items) == 3 + cm_rows = eval_items[eval_items["task"] == "CM"] + assert cm_rows.iloc[0]["turn_index"] == 2 + assert len(cm_rows.iloc[0]["golden_context"]) == 1 + + +def test_parse_mt_bench_101_rating(): + assert parse_mt_bench_101_rating("Reasoning...\nRating: [[7]]") == pytest.approx(7.0) + assert parse_mt_bench_101_rating("rating: [[10]]") == pytest.approx(10.0) + assert parse_mt_bench_101_rating("I would rate this [[6]] overall.") == pytest.approx(6.0) + assert ( + parse_mt_bench_101_rating("See section [3] for details...\nRating: [[6]]") + == pytest.approx(6.0) + ) + assert parse_mt_bench_101_rating("Rating: [[0]]") is None + assert parse_mt_bench_101_rating("Rating: [[11]]") is None + assert parse_mt_bench_101_rating("Rating: [6]") is None + assert parse_mt_bench_101_rating("No rating present.") is None + + +def test_judge_mt_bench_101_includes_reference_block_for_mr(): + eval_items = pd.DataFrame( + { + "instruction_index": [0], + "dialogue_id": [1], + "dialogue_uid": ["MR:1"], + "task": ["MR"], + "ability": ["adaptability"], + "turn_index": [2], + "golden_context": [[{"user": "q1", "bot": "a1"}]], + "user_message": ["q2"], + "reference_answer": ["ref answer"], + } + ).set_index("instruction_index") + completions = pd.DataFrame( + {"instruction_index": [0], "completion": ["model answer"]} + ) + + scored = judge_mt_bench_101_single( + judge_chat_model=DummyModel("Dummy/reasoning\nRating: [[8]]"), + eval_items=eval_items, + completions=completions, + use_tqdm=False, + ) + + user_prompt = scored.iloc[0]["user_prompt"] + assert scored.iloc[0]["score"] == pytest.approx(8.0) + assert "The dialogue need to be judged is:" in user_prompt + assert "The reference solution is:" in user_prompt + assert " Human: q1" in user_prompt + assert "Assistant: model 
answer" in user_prompt + assert user_prompt.find("***") < user_prompt.find("The reference solution is:") + assert "strictly following this format" in scored.iloc[0]["system_prompt"] + + +def test_mt_bench_101_aggregation_and_pairwise(): + scored_a = pd.DataFrame( + { + "instruction_index": [0, 1, 2], + "dialogue_uid": ["PI:1", "PI:1", "PI:2"], + "dialogue_id": [1, 1, 2], + "task": ["PI", "PI", "PI"], + "ability": ["interactivity", "interactivity", "interactivity"], + "turn_index": [1, 2, 1], + "score": [9.0, 2.0, 4.0], + } + ) + scored_b = pd.DataFrame( + { + "instruction_index": [0, 1, 2], + "dialogue_uid": ["PI:1", "PI:1", "PI:2"], + "dialogue_id": [1, 1, 2], + "task": ["PI", "PI", "PI"], + "ability": ["interactivity", "interactivity", "interactivity"], + "turn_index": [1, 2, 1], + "score": [8.0, 1.0, 6.0], + } + ) + + absolute_a = summarize_mt_bench_101_absolute_scores(scored_turns=scored_a) + assert absolute_a["per_task"]["PI"] == pytest.approx(3.0) + assert absolute_a["overall"] == pytest.approx(3.0) + + pairwise_turns = derive_mt_bench_101_pairwise_preferences(scored_a, scored_b) + summary = summarize_mt_bench_101_pairwise(pairwise_turns=pairwise_turns) + + assert summary["turn_level"]["num_battles"] == 3 + assert summary["dialogue_level"]["num_battles"] == 2 + # dialogue-level uses min scores per dialogue: one A win and one A loss + assert summary["dialogue_level"]["winrate"] == pytest.approx(0.5) diff --git a/tests/test_mt_bench_downloads.py b/tests/test_mt_bench_downloads.py index 9058a3b..a41996d 100644 --- a/tests/test_mt_bench_downloads.py +++ b/tests/test_mt_bench_downloads.py @@ -1,6 +1,7 @@ from pathlib import Path import openjury.instruction_dataset.mt_bench as mt_bench +import openjury.instruction_dataset.mt_bench_101 as mt_bench_101 import openjury.utils as utils @@ -36,7 +37,7 @@ def _snapshot_download_stub(**_kwargs): def test_download_all_includes_mt_bench(tmp_path, monkeypatch): hf_datasets = [] - calls = {"contexts": 0, "mt_bench": 0} + calls 
= {"contexts": 0, "mt_bench": 0, "mt_bench_101": 0} monkeypatch.setattr(utils, "data_root", tmp_path) monkeypatch.setattr( @@ -54,6 +55,11 @@ def _contexts_snapshot_stub(**_kwargs): "download_mt_bench", lambda: calls.__setitem__("mt_bench", calls["mt_bench"] + 1), ) + monkeypatch.setattr( + mt_bench_101, + "download_mt_bench_101", + lambda: calls.__setitem__("mt_bench_101", calls["mt_bench_101"] + 1), + ) utils.download_all() @@ -64,3 +70,4 @@ def _contexts_snapshot_stub(**_kwargs): ] assert calls["contexts"] == 1 assert calls["mt_bench"] == 1 + assert calls["mt_bench_101"] == 1 From 0ca66c5e2c0fd6511b476d20ee820e383ad26041 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Tue, 10 Mar 2026 10:52:23 +0100 Subject: [PATCH 29/35] remove redundant print statement --- openjury/generate_and_evaluate.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 2db5da0..cb82c5f 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -949,9 +949,6 @@ def _run_mt_bench_openjury( preference_parts: list[pd.Series] = [] combined_metadata: list[dict[str, object]] = [] - if args.swap_mode == "both": - print("Running reversed evaluation for position bias correction.") - if instructions_turn_1: ( annotations_turn_1, From a295305d48ec68010c0f43bc8361835ddebfede1 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Tue, 17 Mar 2026 14:42:43 +0100 Subject: [PATCH 30/35] move mt-bench logic from the entrypoint --- openjury/generate_and_evaluate.py | 456 +------------------------- openjury/mt_bench/pipeline.py | 484 ++++++++++++++++++++++++++++ tests/test_generate_and_evaluate.py | 9 + 3 files changed, 499 insertions(+), 450 deletions(-) create mode 100644 openjury/mt_bench/pipeline.py diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index cb82c5f..dbc659e 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -16,14 
+16,12 @@ from openjury.evaluate import ( annotate_battles, PairScore, - load_judge_system_and_user_prompt, ) -from openjury.generate import generate_instructions, generate_base, generate_multiturn +from openjury.generate import generate_instructions, generate_base from openjury.instruction_dataset import load_instructions -from openjury.mt_bench.common import iter_mt_bench_pairwise_rows -from openjury.mt_bench.fastchat_compat import ( - FASTCHAT_TEMPERATURE_CONFIG, - judge_mt_bench_pairwise_fastchat, +from openjury.mt_bench.pipeline import ( + format_mt_bench_for_evaluation, + run_mt_bench, ) from openjury.utils import ( cache_function_dataframe, @@ -33,8 +31,6 @@ read_df, ) -NEED_REF_CATS = {"math", "reasoning", "coding"} - def try_load_dataset_completions( dataset: str, model: str, n_instructions: int | None @@ -175,7 +171,7 @@ def parse_args(cls): required=False, default=8192, help="Character-level truncation applied before tokenization: truncates each instruction " - "before model A/B generation and truncates each completion before judge evaluation.", + "before model A/B generation and truncates each completion before judge evaluation.", ) parser.add_argument( "--max_out_tokens_models", @@ -328,139 +324,6 @@ def print_results(results): print("=" * 60 + "\n") -def format_mt_bench_for_evaluation( - questions: pd.DataFrame, - completions_A: pd.DataFrame, - completions_B: pd.DataFrame, - turns_mode: str, - truncate_input_chars: int | None, -) -> tuple[ - tuple[list[str], list[str], list[str], list[dict[str, object]]], - tuple[list[str], list[str], list[str], list[dict[str, object]]], -]: - """Flatten MT-Bench into per-turn instruction/completion battle inputs.""" - assert turns_mode in ("both", "single", "multi") - eval_single = turns_mode in ("both", "single") - eval_multi = turns_mode in ("both", "multi") - - instructions_turn_1: list[str] = [] - completions_a_turn_1: list[str] = [] - completions_b_turn_1: list[str] = [] - metadata_turn_1: list[dict[str, 
object]] = [] - - instructions_turn_2: list[str] = [] - completions_a_turn_2: list[str] = [] - completions_b_turn_2: list[str] = [] - metadata_turn_2: list[dict[str, object]] = [] - - for row in iter_mt_bench_pairwise_rows( - questions=questions, - completions_a=completions_A, - completions_b=completions_B, - truncate_input_chars=truncate_input_chars, - ): - needs_ref = row.category in NEED_REF_CATS - if eval_single: - if needs_ref and row.ref_1: - instruction = ( - "[MT-Bench | Turn 1]\n" - "Use the reference answer for correctness checks.\n\n" - f"[Question]\n{row.turn_1_question}\n\n" - f"[Reference Answer]\n{row.ref_1}" - ) - else: - instruction = row.turn_1_question - - instructions_turn_1.append(instruction) - completions_a_turn_1.append(row.answer_a_1) - completions_b_turn_1.append(row.answer_b_1) - metadata_turn_1.append( - { - "question_id": row.question_id, - "category": row.category, - "turn": 1, - } - ) - - if eval_multi and row.turn_2_question: - instruction_parts = [ - "Please focus on which assistant provides a better answer to the second user question." 
- ] - if needs_ref and (row.ref_1 or row.ref_2): - instruction_parts.extend( - [ - "<|The Start of Reference Answer|>", - "### User:", - row.turn_1_question, - "### Reference answer:", - row.ref_1, - "### User:", - row.turn_2_question, - "### Reference answer:", - row.ref_2, - "<|The End of Reference Answer|>", - ] - ) - - conversation_a = _format_mt_bench_multiturn_conversation( - turn_1_question=row.turn_1_question, - turn_1_answer=row.answer_a_1, - turn_2_question=row.turn_2_question, - turn_2_answer=row.answer_a_2, - ) - conversation_b = _format_mt_bench_multiturn_conversation( - turn_1_question=row.turn_1_question, - turn_1_answer=row.answer_b_1, - turn_2_question=row.turn_2_question, - turn_2_answer=row.answer_b_2, - ) - - instructions_turn_2.append("\n\n".join(instruction_parts)) - completions_a_turn_2.append(conversation_a) - completions_b_turn_2.append(conversation_b) - metadata_turn_2.append( - { - "question_id": row.question_id, - "category": row.category, - "turn": 2, - } - ) - - return ( - ( - instructions_turn_1, - completions_a_turn_1, - completions_b_turn_1, - metadata_turn_1, - ), - ( - instructions_turn_2, - completions_a_turn_2, - completions_b_turn_2, - metadata_turn_2, - ), - ) - - -def _format_mt_bench_multiturn_conversation( - *, - turn_1_question: str, - turn_1_answer: str, - turn_2_question: str, - turn_2_answer: str, -) -> str: - return ( - "### User:\n" - f"{turn_1_question}\n\n" - "### Assistant:\n" - f"{turn_1_answer}\n\n" - "### User:\n" - f"{turn_2_question}\n\n" - "### Assistant:\n" - f"{turn_2_answer}" - ) - - def compute_preference_stats(prefs: pd.Series) -> dict: """Derive win/loss/tie counts and winrate from a Series of preferences. 
@@ -610,7 +473,7 @@ def main(args: CliArgs): # MT-Bench has its own pipeline: multi-turn generation + category-aware judging if args.dataset == "mt-bench": - return _run_mt_bench(args, ignore_cache) + return run_mt_bench(args, ignore_cache) # Currrently, we run context evaluation is_fluency_task = "fluency" in args.dataset @@ -791,313 +654,6 @@ def main(args: CliArgs): return prefs -def _generate_mt_bench_completions( - args: CliArgs, - questions_df: pd.DataFrame, - ignore_cache: bool, -) -> tuple[pd.DataFrame, pd.DataFrame]: - cache_prefix = ( - "mt-bench_fastchatgen" if args.mt_bench_compatibility == "fastchat" else "mt-bench" - ) - - def _run_generation(model_name: str) -> pd.DataFrame: - if args.mt_bench_compatibility == "fastchat": - return generate_multiturn( - questions=questions_df, - model=model_name, - truncate_input_chars=args.truncate_all_input_chars, - max_tokens=args.max_out_tokens_models, - use_tqdm=args.use_tqdm, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - temperature_config=FASTCHAT_TEMPERATURE_CONFIG, - ) - return generate_multiturn( - questions=questions_df, - model=model_name, - truncate_input_chars=args.truncate_all_input_chars, - max_tokens=args.max_out_tokens_models, - use_tqdm=args.use_tqdm, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - ) - - completions_a = cache_function_dataframe( - lambda: _run_generation(args.model_A), - ignore_cache=ignore_cache, - cache_name=f"{cache_prefix}_{args.model_A}_{args.n_instructions}", - ).set_index("instruction_index") - - completions_b = cache_function_dataframe( - lambda: _run_generation(args.model_B), - ignore_cache=ignore_cache, - cache_name=f"{cache_prefix}_{args.model_B}_{args.n_instructions}", - ).set_index("instruction_index") - return completions_a, completions_b - - -def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str: - name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" - name += 
f"-{args.swap_mode}" - if suffix: - name += f"-{suffix}" - return name.replace("/", "_") - - -def _save_mt_bench_results( - *, - args: CliArgs, - results: dict[str, object], - annotations_df: pd.DataFrame, - name_suffix: str | None = None, -) -> None: - name = _build_mt_bench_result_name(args, suffix=name_suffix) - res_folder = Path(args.result_folder) / name - res_folder.mkdir(parents=True, exist_ok=True) - - with open(res_folder / f"args-{name}.json", "w") as f: - json.dump(asdict(args), f, indent=2) - - annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False) - - with open(res_folder / f"results-{name}.json", "w") as f: - json.dump(results, f, indent=2) - - -def _run_mt_bench_fastchat( - *, - args: CliArgs, - questions_df: pd.DataFrame, - completions_a: pd.DataFrame, - completions_b: pd.DataFrame, - judge_chat_model, -) -> pd.Series: - prefs, annotations, combined_metadata, num_inconsistent = ( - judge_mt_bench_pairwise_fastchat( - judge_chat_model=judge_chat_model, - judge_model=args.judge_model, - questions=questions_df, - completions_a=completions_a, - completions_b=completions_b, - model_a=args.model_A, - model_b=args.model_B, - turns_mode=args.mt_bench_turns, - swap_mode=args.swap_mode, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - ) - ) - - stats = compute_preference_stats(prefs) - results = { - "dataset": args.dataset, - "model_A": args.model_A, - "model_B": args.model_B, - "judge_model": args.judge_model, - "mt_bench_compatibility": args.mt_bench_compatibility, - "num_inconsistent": num_inconsistent, - **stats, - "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), - "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), - "preferences": prefs.tolist(), - "date": str(datetime.now().isoformat()), - "user": os.getenv("USER", ""), - } - print_results(results) - _save_mt_bench_results( - args=args, - results=results, - annotations_df=pd.DataFrame(annotations), - 
name_suffix=f"mtbench_{args.mt_bench_compatibility}", - ) - return prefs - - -def _run_mt_bench_openjury( - *, - args: CliArgs, - questions_df: pd.DataFrame, - completions_a: pd.DataFrame, - completions_b: pd.DataFrame, - judge_chat_model, -) -> pd.Series: - turn_1_inputs, turn_2_inputs = format_mt_bench_for_evaluation( - questions=questions_df, - completions_A=completions_a, - completions_B=completions_b, - turns_mode=args.mt_bench_turns, - truncate_input_chars=args.truncate_all_input_chars, - ) - ( - instructions_turn_1, - completions_a_turn_1, - completions_b_turn_1, - metadata_turn_1, - ) = turn_1_inputs - ( - instructions_turn_2, - completions_a_turn_2, - completions_b_turn_2, - metadata_turn_2, - ) = turn_2_inputs - - score_parser = PairScore() - annotations = [] - metadata_for_annotations: list[dict[str, object]] = [] - annotations_reversed = [] - metadata_for_reversed_annotations: list[dict[str, object]] = [] - preference_parts: list[pd.Series] = [] - combined_metadata: list[dict[str, object]] = [] - - if instructions_turn_1: - ( - annotations_turn_1, - annotations_turn_1_reversed, - metadata_turn_1_for_annotations, - metadata_turn_1_for_reversed_annotations, - prefs_turn_1, - combined_metadata_turn_1, - ) = _judge_turn( - judge_chat_model=judge_chat_model, - instructions=instructions_turn_1, - completions_A=completions_a_turn_1, - completions_B=completions_b_turn_1, - metadata=metadata_turn_1, - score_parser=score_parser, - provide_explanation=args.provide_explanation, - swap_mode=args.swap_mode, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - ) - annotations.extend(annotations_turn_1) - annotations_reversed.extend(annotations_turn_1_reversed) - metadata_for_annotations.extend(metadata_turn_1_for_annotations) - metadata_for_reversed_annotations.extend( - metadata_turn_1_for_reversed_annotations - ) - preference_parts.append(prefs_turn_1) - combined_metadata.extend(combined_metadata_turn_1) - - if instructions_turn_2: - 
mt_system_prompt, mt_user_prompt_template = load_judge_system_and_user_prompt( - provide_explanation=args.provide_explanation, - multi_turn=True, - ) - ( - annotations_turn_2, - annotations_turn_2_reversed, - metadata_turn_2_for_annotations, - metadata_turn_2_for_reversed_annotations, - prefs_turn_2, - combined_metadata_turn_2, - ) = _judge_turn( - judge_chat_model=judge_chat_model, - instructions=instructions_turn_2, - completions_A=completions_a_turn_2, - completions_B=completions_b_turn_2, - metadata=metadata_turn_2, - score_parser=score_parser, - provide_explanation=args.provide_explanation, - swap_mode=args.swap_mode, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - system_prompt=mt_system_prompt, - user_prompt_template=mt_user_prompt_template, - ) - annotations.extend(annotations_turn_2) - annotations_reversed.extend(annotations_turn_2_reversed) - metadata_for_annotations.extend(metadata_turn_2_for_annotations) - metadata_for_reversed_annotations.extend( - metadata_turn_2_for_reversed_annotations - ) - preference_parts.append(prefs_turn_2) - combined_metadata.extend(combined_metadata_turn_2) - - prefs = ( - pd.concat(preference_parts).reset_index(drop=True) - if preference_parts - else pd.Series(dtype=float) - ) - stats = compute_preference_stats(prefs) - results = { - "dataset": args.dataset, - "model_A": args.model_A, - "model_B": args.model_B, - "judge_model": args.judge_model, - **stats, - "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), - "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), - "preferences": prefs.tolist(), - "date": str(datetime.now().isoformat()), - "user": os.getenv("USER", ""), - } - print_results(results) - - df = pd.DataFrame(annotations) - df["instruction_index"] = [meta["question_id"] for meta in metadata_for_annotations] - df["category"] = [meta["category"] for meta in metadata_for_annotations] - df["turn"] = [meta["turn"] for meta in 
metadata_for_annotations] - df["model_A"] = args.model_A - df["model_B"] = args.model_B - df["judge"] = args.judge_model - - if args.swap_mode == "both": - df_reversed = pd.DataFrame(annotations_reversed) - df_reversed["instruction_index"] = [ - meta["question_id"] for meta in metadata_for_reversed_annotations - ] - df_reversed["category"] = [ - meta["category"] for meta in metadata_for_reversed_annotations - ] - df_reversed["turn"] = [meta["turn"] for meta in metadata_for_reversed_annotations] - df_reversed["model_A"] = args.model_B - df_reversed["model_B"] = args.model_A - df_reversed["judge"] = args.judge_model - df = pd.concat([df, df_reversed], ignore_index=True) - - _save_mt_bench_results( - args=args, - results=results, - annotations_df=df, - ) - return prefs - - -def _run_mt_bench(args: CliArgs, ignore_cache: bool): - """MT-Bench pipeline (optionally FastChat-compatible).""" - questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) - print( - f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}." 
- ) - completions_a, completions_b = _generate_mt_bench_completions( - args=args, - questions_df=questions_df, - ignore_cache=ignore_cache, - ) - judge_chat_model = make_model( - model=args.judge_model, - max_tokens=args.max_out_tokens_judge, - temperature=0.0 if args.mt_bench_compatibility == "fastchat" else None, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - ) - if args.mt_bench_compatibility == "fastchat": - return _run_mt_bench_fastchat( - args=args, - questions_df=questions_df, - completions_a=completions_a, - completions_b=completions_b, - judge_chat_model=judge_chat_model, - ) - return _run_mt_bench_openjury( - args=args, - questions_df=questions_df, - completions_a=completions_a, - completions_b=completions_b, - judge_chat_model=judge_chat_model, - ) - - def cli(): args = CliArgs.parse_args() print(f"Running with CLI args: {args.__dict__}") diff --git a/openjury/mt_bench/pipeline.py b/openjury/mt_bench/pipeline.py new file mode 100644 index 0000000..6c949dc --- /dev/null +++ b/openjury/mt_bench/pipeline.py @@ -0,0 +1,484 @@ +"""MT-Bench evaluation pipeline. + +Orchestrates multi-turn generation, per-turn judging (OpenJury or +FastChat-compatible), and result saving for the MT-Bench benchmark. 
+""" + +from __future__ import annotations + +import json +import os +from dataclasses import asdict +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING + +import pandas as pd + +from openjury.evaluate import PairScore, load_judge_system_and_user_prompt +from openjury.generate import generate_multiturn +from openjury.instruction_dataset import load_instructions +from openjury.mt_bench.common import iter_mt_bench_pairwise_rows +from openjury.mt_bench.fastchat_compat import ( + FASTCHAT_TEMPERATURE_CONFIG, + judge_mt_bench_pairwise_fastchat, +) +from openjury.utils import cache_function_dataframe, make_model + +if TYPE_CHECKING: + from openjury.generate_and_evaluate import CliArgs + +NEED_REF_CATS = {"math", "reasoning", "coding"} + + +def format_mt_bench_for_evaluation( + questions: pd.DataFrame, + completions_A: pd.DataFrame, + completions_B: pd.DataFrame, + turns_mode: str, + truncate_input_chars: int | None, +) -> tuple[ + tuple[list[str], list[str], list[str], list[dict[str, object]]], + tuple[list[str], list[str], list[str], list[dict[str, object]]], +]: + """Flatten MT-Bench into per-turn instruction/completion battle inputs.""" + assert turns_mode in ("both", "single", "multi") + eval_single = turns_mode in ("both", "single") + eval_multi = turns_mode in ("both", "multi") + + instructions_turn_1: list[str] = [] + completions_a_turn_1: list[str] = [] + completions_b_turn_1: list[str] = [] + metadata_turn_1: list[dict[str, object]] = [] + + instructions_turn_2: list[str] = [] + completions_a_turn_2: list[str] = [] + completions_b_turn_2: list[str] = [] + metadata_turn_2: list[dict[str, object]] = [] + + for row in iter_mt_bench_pairwise_rows( + questions=questions, + completions_a=completions_A, + completions_b=completions_B, + truncate_input_chars=truncate_input_chars, + ): + needs_ref = row.category in NEED_REF_CATS + if eval_single: + if needs_ref and row.ref_1: + instruction = ( + "[MT-Bench | Turn 1]\n" + "Use the 
reference answer for correctness checks.\n\n" + f"[Question]\n{row.turn_1_question}\n\n" + f"[Reference Answer]\n{row.ref_1}" + ) + else: + instruction = row.turn_1_question + + instructions_turn_1.append(instruction) + completions_a_turn_1.append(row.answer_a_1) + completions_b_turn_1.append(row.answer_b_1) + metadata_turn_1.append( + { + "question_id": row.question_id, + "category": row.category, + "turn": 1, + } + ) + + if eval_multi and row.turn_2_question: + instruction_parts = [ + "Please focus on which assistant provides a better answer to the second user question." + ] + if needs_ref and (row.ref_1 or row.ref_2): + instruction_parts.extend( + [ + "<|The Start of Reference Answer|>", + "### User:", + row.turn_1_question, + "### Reference answer:", + row.ref_1, + "### User:", + row.turn_2_question, + "### Reference answer:", + row.ref_2, + "<|The End of Reference Answer|>", + ] + ) + + conversation_a = _format_mt_bench_multiturn_conversation( + turn_1_question=row.turn_1_question, + turn_1_answer=row.answer_a_1, + turn_2_question=row.turn_2_question, + turn_2_answer=row.answer_a_2, + ) + conversation_b = _format_mt_bench_multiturn_conversation( + turn_1_question=row.turn_1_question, + turn_1_answer=row.answer_b_1, + turn_2_question=row.turn_2_question, + turn_2_answer=row.answer_b_2, + ) + + instructions_turn_2.append("\n\n".join(instruction_parts)) + completions_a_turn_2.append(conversation_a) + completions_b_turn_2.append(conversation_b) + metadata_turn_2.append( + { + "question_id": row.question_id, + "category": row.category, + "turn": 2, + } + ) + + return ( + ( + instructions_turn_1, + completions_a_turn_1, + completions_b_turn_1, + metadata_turn_1, + ), + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + metadata_turn_2, + ), + ) + + +def _format_mt_bench_multiturn_conversation( + *, + turn_1_question: str, + turn_1_answer: str, + turn_2_question: str, + turn_2_answer: str, +) -> str: + return ( + "### User:\n" + 
f"{turn_1_question}\n\n" + "### Assistant:\n" + f"{turn_1_answer}\n\n" + "### User:\n" + f"{turn_2_question}\n\n" + "### Assistant:\n" + f"{turn_2_answer}" + ) + + +def _generate_mt_bench_completions( + args: CliArgs, + questions_df: pd.DataFrame, + ignore_cache: bool, +) -> tuple[pd.DataFrame, pd.DataFrame]: + cache_prefix = ( + "mt-bench_fastchatgen" if args.mt_bench_compatibility == "fastchat" else "mt-bench" + ) + + def _run_generation(model_name: str) -> pd.DataFrame: + if args.mt_bench_compatibility == "fastchat": + return generate_multiturn( + questions=questions_df, + model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + temperature_config=FASTCHAT_TEMPERATURE_CONFIG, + ) + return generate_multiturn( + questions=questions_df, + model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + + completions_a = cache_function_dataframe( + lambda: _run_generation(args.model_A), + ignore_cache=ignore_cache, + cache_name=f"{cache_prefix}_{args.model_A}_{args.n_instructions}", + ).set_index("instruction_index") + + completions_b = cache_function_dataframe( + lambda: _run_generation(args.model_B), + ignore_cache=ignore_cache, + cache_name=f"{cache_prefix}_{args.model_B}_{args.n_instructions}", + ).set_index("instruction_index") + return completions_a, completions_b + + +def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str: + name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name += f"-{args.swap_mode}" + if suffix: + name += f"-{suffix}" + return name.replace("/", "_") + + +def _save_mt_bench_results( + *, + args: CliArgs, + results: dict[str, object], + annotations_df: pd.DataFrame, + name_suffix: str | 
None = None, +) -> None: + name = _build_mt_bench_result_name(args, suffix=name_suffix) + res_folder = Path(args.result_folder) / name + res_folder.mkdir(parents=True, exist_ok=True) + + with open(res_folder / f"args-{name}.json", "w") as f: + json.dump(asdict(args), f, indent=2) + + annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False) + + with open(res_folder / f"results-{name}.json", "w") as f: + json.dump(results, f, indent=2) + + +def _run_mt_bench_fastchat( + *, + args: CliArgs, + questions_df: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + judge_chat_model, +) -> pd.Series: + from openjury.generate_and_evaluate import ( + _compute_grouped_stats, + compute_preference_stats, + print_results, + ) + + prefs, annotations, combined_metadata, num_inconsistent = ( + judge_mt_bench_pairwise_fastchat( + judge_chat_model=judge_chat_model, + judge_model=args.judge_model, + questions=questions_df, + completions_a=completions_a, + completions_b=completions_b, + model_a=args.model_A, + model_b=args.model_B, + turns_mode=args.mt_bench_turns, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + ) + + stats = compute_preference_stats(prefs) + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + "mt_bench_compatibility": args.mt_bench_compatibility, + "num_inconsistent": num_inconsistent, + **stats, + "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), + "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), + "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + print_results(results) + _save_mt_bench_results( + args=args, + results=results, + annotations_df=pd.DataFrame(annotations), + name_suffix=f"mtbench_{args.mt_bench_compatibility}", + ) + return prefs + + +def _run_mt_bench_openjury( 
+ *, + args: CliArgs, + questions_df: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + judge_chat_model, +) -> pd.Series: + from openjury.generate_and_evaluate import ( + _compute_grouped_stats, + _judge_turn, + compute_preference_stats, + print_results, + ) + + turn_1_inputs, turn_2_inputs = format_mt_bench_for_evaluation( + questions=questions_df, + completions_A=completions_a, + completions_B=completions_b, + turns_mode=args.mt_bench_turns, + truncate_input_chars=args.truncate_all_input_chars, + ) + ( + instructions_turn_1, + completions_a_turn_1, + completions_b_turn_1, + metadata_turn_1, + ) = turn_1_inputs + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + metadata_turn_2, + ) = turn_2_inputs + + score_parser = PairScore() + annotations = [] + metadata_for_annotations: list[dict[str, object]] = [] + annotations_reversed = [] + metadata_for_reversed_annotations: list[dict[str, object]] = [] + preference_parts: list[pd.Series] = [] + combined_metadata: list[dict[str, object]] = [] + + if instructions_turn_1: + ( + annotations_turn_1, + annotations_turn_1_reversed, + metadata_turn_1_for_annotations, + metadata_turn_1_for_reversed_annotations, + prefs_turn_1, + combined_metadata_turn_1, + ) = _judge_turn( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_1, + completions_A=completions_a_turn_1, + completions_B=completions_b_turn_1, + metadata=metadata_turn_1, + score_parser=score_parser, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + annotations.extend(annotations_turn_1) + annotations_reversed.extend(annotations_turn_1_reversed) + metadata_for_annotations.extend(metadata_turn_1_for_annotations) + metadata_for_reversed_annotations.extend( + metadata_turn_1_for_reversed_annotations + ) + preference_parts.append(prefs_turn_1) + combined_metadata.extend(combined_metadata_turn_1) 
+ + if instructions_turn_2: + mt_system_prompt, mt_user_prompt_template = load_judge_system_and_user_prompt( + provide_explanation=args.provide_explanation, + multi_turn=True, + ) + ( + annotations_turn_2, + annotations_turn_2_reversed, + metadata_turn_2_for_annotations, + metadata_turn_2_for_reversed_annotations, + prefs_turn_2, + combined_metadata_turn_2, + ) = _judge_turn( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_2, + completions_A=completions_a_turn_2, + completions_B=completions_b_turn_2, + metadata=metadata_turn_2, + score_parser=score_parser, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + system_prompt=mt_system_prompt, + user_prompt_template=mt_user_prompt_template, + ) + annotations.extend(annotations_turn_2) + annotations_reversed.extend(annotations_turn_2_reversed) + metadata_for_annotations.extend(metadata_turn_2_for_annotations) + metadata_for_reversed_annotations.extend( + metadata_turn_2_for_reversed_annotations + ) + preference_parts.append(prefs_turn_2) + combined_metadata.extend(combined_metadata_turn_2) + + prefs = ( + pd.concat(preference_parts).reset_index(drop=True) + if preference_parts + else pd.Series(dtype=float) + ) + stats = compute_preference_stats(prefs) + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + **stats, + "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), + "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), + "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + print_results(results) + + df = pd.DataFrame(annotations) + df["instruction_index"] = [meta["question_id"] for meta in metadata_for_annotations] + df["category"] = [meta["category"] for meta in metadata_for_annotations] + df["turn"] = [meta["turn"] 
for meta in metadata_for_annotations] + df["model_A"] = args.model_A + df["model_B"] = args.model_B + df["judge"] = args.judge_model + + if args.swap_mode == "both": + df_reversed = pd.DataFrame(annotations_reversed) + df_reversed["instruction_index"] = [ + meta["question_id"] for meta in metadata_for_reversed_annotations + ] + df_reversed["category"] = [ + meta["category"] for meta in metadata_for_reversed_annotations + ] + df_reversed["turn"] = [meta["turn"] for meta in metadata_for_reversed_annotations] + df_reversed["model_A"] = args.model_B + df_reversed["model_B"] = args.model_A + df_reversed["judge"] = args.judge_model + df = pd.concat([df, df_reversed], ignore_index=True) + + _save_mt_bench_results( + args=args, + results=results, + annotations_df=df, + ) + return prefs + + +def run_mt_bench(args: CliArgs, ignore_cache: bool): + """MT-Bench pipeline (optionally FastChat-compatible).""" + questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) + print( + f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}." 
+ ) + completions_a, completions_b = _generate_mt_bench_completions( + args=args, + questions_df=questions_df, + ignore_cache=ignore_cache, + ) + judge_chat_model = make_model( + model=args.judge_model, + max_tokens=args.max_out_tokens_judge, + temperature=0.0 if args.mt_bench_compatibility == "fastchat" else None, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + if args.mt_bench_compatibility == "fastchat": + return _run_mt_bench_fastchat( + args=args, + questions_df=questions_df, + completions_a=completions_a, + completions_b=completions_b, + judge_chat_model=judge_chat_model, + ) + return _run_mt_bench_openjury( + args=args, + questions_df=questions_df, + completions_a=completions_a, + completions_b=completions_b, + judge_chat_model=judge_chat_model, + ) diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index c0c2f20..b9e4ef6 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -2,6 +2,7 @@ import pytest import openjury.generate_and_evaluate as generate_and_evaluate +import openjury.mt_bench.pipeline as mt_bench_pipeline from openjury.generate_and_evaluate import ( main as main_generate_and_eval, CliArgs, @@ -51,6 +52,11 @@ def _load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Da "load_instructions", _load_instructions, ) + monkeypatch.setattr( + mt_bench_pipeline, + "load_instructions", + _load_instructions, + ) monkeypatch.setattr( generate_and_evaluate, "load_contexts", @@ -69,6 +75,9 @@ def _run_without_cache(fun, **_kwargs): monkeypatch.setattr( generate_and_evaluate, "cache_function_dataframe", _run_without_cache ) + monkeypatch.setattr( + mt_bench_pipeline, "cache_function_dataframe", _run_without_cache + ) @pytest.mark.parametrize( From 0fb9700200398deafa23f12d8f8c8aaf5979960d Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Tue, 17 Mar 2026 21:18:23 +0100 Subject: [PATCH 31/35] Remove stale unused entries for fastchat mode 
--- openjury/mt_bench/fastchat_compat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/openjury/mt_bench/fastchat_compat.py b/openjury/mt_bench/fastchat_compat.py index 5883391..728b0f2 100644 --- a/openjury/mt_bench/fastchat_compat.py +++ b/openjury/mt_bench/fastchat_compat.py @@ -21,11 +21,9 @@ "reasoning": 0.0, "stem": 0.1, "humanities": 0.1, - "arena-hard-200": 0.0, } -# "arena-hard-200" is a FastChat-internal category label, not OpenJury's arena-hard dataset. -FASTCHAT_NEED_REF_CATS: set[str] = {"math", "reasoning", "coding", "arena-hard-200"} +FASTCHAT_NEED_REF_CATS: set[str] = {"math", "reasoning", "coding"} FastChatVerdict = Literal["A", "B", "tie", "error"] PairwiseWinner = Literal["model_A", "model_B", "tie", "error"] From 8d73c77372d8a6d2c1e8e7887db5fbe4304a99bd Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Tue, 17 Mar 2026 23:06:40 +0100 Subject: [PATCH 32/35] move the mt-bench-101 logic from generate_and_evaluate --- README.md | 19 ++ openjury/generate.py | 8 +- openjury/generate_and_evaluate.py | 354 +++----------------- openjury/mt_bench/fastchat_compat.py | 4 +- openjury/mt_bench/pipeline.py | 484 +++++++++++++++++++++++++++ openjury/mt_bench_101/__init__.py | 2 + openjury/mt_bench_101/generate.py | 26 +- openjury/mt_bench_101/pipeline.py | 201 +++++++++++ openjury/utils.py | 47 ++- tests/test_generate_and_evaluate.py | 9 + 10 files changed, 814 insertions(+), 340 deletions(-) create mode 100644 openjury/mt_bench/pipeline.py create mode 100644 openjury/mt_bench_101/pipeline.py diff --git a/README.md b/README.md index 91903e8..d09fe21 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,25 @@ The evaluation scripts expose four different length controls with different role - `--max_out_tokens_judge`: generation token budget for the judge completion (reasoning + score output). 
- `--max_model_len`: optional vLLM context-window limit (prompt + generated tokens), applied to vLLM models; this should be greater than or equal to the two `max_out_tokens_*` values. +### Engine-Specific Configuration (`--engine_kwargs`) + +Some providers expose additional engine-level knobs (for example, vLLM allows configuring tensor parallelism or GPU memory utilization). +OpenJury lets you forward these options directly to the underlying engine via `--engine_kwargs`, which expects a JSON object. + +For instance, to run vLLM with tensor parallelism across multiple GPUs: + +```bash +python openjury/generate_and_evaluate.py \ + --dataset alpaca-eval \ + --model_A VLLM/Qwen/Qwen2.5-0.5B-Instruct \ + --model_B VLLM/Qwen/Qwen2.5-1.5B-Instruct \ + --judge_model VLLM/Qwen/Qwen3.5-27B-FP8 \ + --n_instructions 10 \ + --engine_kwargs '{"tensor_parallel_size": 2}' +``` + +While any key in `--engine_kwargs` is forwarded to the underlying engine (e.g. `vllm.LLM`, `LlamaCpp`, `ChatOpenAI`), existing dedicated flags such as `--max_model_len` and `--chat_template` have higher precedence. 
+ ## 🎨 Model Specification Models are specified using the format: `{LangChain Backend}/{Model Path}` diff --git a/openjury/generate.py b/openjury/generate.py index cda4bcc..64eb789 100644 --- a/openjury/generate.py +++ b/openjury/generate.py @@ -63,9 +63,9 @@ def generate_instructions( max_tokens: int | None = 32768, use_tqdm: bool = True, system_prompt: str | None = None, - **model_kwargs, + **engine_kwargs, ) -> pd.DataFrame: - chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs) + chat_model = make_model(model, max_tokens=max_tokens, **engine_kwargs) # TODO improve prompt to generate instructions if system_prompt is None: @@ -235,9 +235,9 @@ def generate_base( truncate_input_chars: int | None = 8192, max_tokens: int | None = 32768, use_tqdm: bool = False, - **model_kwargs, + **engine_kwargs, ) -> pd.DataFrame: - model = make_model(model, max_tokens=max_tokens, **model_kwargs) + model = make_model(model, max_tokens=max_tokens, **engine_kwargs) inputs = [ truncate(instruction, max_len=truncate_input_chars) diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index f34f395..1950c05 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -6,7 +6,7 @@ import argparse import json import os -from dataclasses import dataclass, asdict +from dataclasses import dataclass, asdict, field from datetime import datetime from functools import partial from pathlib import Path @@ -20,18 +20,15 @@ ) from openjury.generate import generate_instructions, generate_base, generate_multiturn from openjury.instruction_dataset import load_instructions -from openjury.mt_bench.common import iter_mt_bench_pairwise_rows +from openjury.mt_bench.pipeline import ( + format_mt_bench_for_evaluation, + run_mt_bench, +) from openjury.mt_bench.fastchat_compat import ( FASTCHAT_TEMPERATURE_CONFIG, judge_mt_bench_pairwise_fastchat, ) -from openjury.mt_bench_101.evaluate import ( - derive_mt_bench_101_pairwise_preferences, - 
judge_mt_bench_101_single, - summarize_mt_bench_101_absolute_scores, - summarize_mt_bench_101_pairwise, -) -from openjury.mt_bench_101.generate import generate_mt_bench_101_completions +from openjury.mt_bench_101.pipeline import run_mt_bench_101 from openjury.utils import ( cache_function_dataframe, data_root, @@ -40,8 +37,6 @@ read_df, ) -NEED_REF_CATS = {"math", "reasoning", "coding"} - def try_load_dataset_completions( dataset: str, model: str, n_instructions: int | None @@ -99,8 +94,8 @@ class CliArgs: chat_template: str | None = None mt_bench_turns: str = "both" mt_bench_compatibility: str = "openjury" - result_folder: str = "results" + engine_kwargs: dict = field(default_factory=dict) def __post_init__(self): supported_modes = ["fixed", "both"] @@ -189,7 +184,7 @@ def parse_args(cls): required=False, default=8192, help="Character-level truncation applied before tokenization: truncates each instruction " - "before model A/B generation and truncates each completion before judge evaluation.", + "before model A/B generation and truncates each completion before judge evaluation.", ) parser.add_argument( "--max_out_tokens_models", @@ -255,8 +250,27 @@ def parse_args(cls): "conservative position-bias handling, judge temperature=0, and MT-Bench category temperatures." ), ) + parser.add_argument( + "--engine_kwargs", + type=str, + required=False, + default="{}", + help=( + "JSON dict of engine-specific kwargs forwarded to the underlying engine. " + "Example for vLLM: '{\"tensor_parallel_size\": 2, \"gpu_memory_utilization\": 0.9}'." 
+ ), + ) args = parser.parse_args() + try: + engine_kwargs = ( + json.loads(args.engine_kwargs) if args.engine_kwargs else {} + ) + if not isinstance(engine_kwargs, dict): + raise ValueError("engine_kwargs must be a JSON object") + except Exception as e: + raise SystemExit(f"Failed to parse --engine_kwargs: {e}") + return cls( dataset=args.dataset, model_A=args.model_A, @@ -275,6 +289,7 @@ def parse_args(cls): mt_bench_turns=args.mt_bench_turns, mt_bench_compatibility=args.mt_bench_compatibility, result_folder=args.result_folder, + engine_kwargs=engine_kwargs, ) @@ -326,139 +341,6 @@ def print_results(results): print("=" * 60 + "\n") -def format_mt_bench_for_evaluation( - questions: pd.DataFrame, - completions_A: pd.DataFrame, - completions_B: pd.DataFrame, - turns_mode: str, - truncate_input_chars: int | None, -) -> tuple[ - tuple[list[str], list[str], list[str], list[dict[str, object]]], - tuple[list[str], list[str], list[str], list[dict[str, object]]], -]: - """Flatten MT-Bench into per-turn instruction/completion battle inputs.""" - assert turns_mode in ("both", "single", "multi") - eval_single = turns_mode in ("both", "single") - eval_multi = turns_mode in ("both", "multi") - - instructions_turn_1: list[str] = [] - completions_a_turn_1: list[str] = [] - completions_b_turn_1: list[str] = [] - metadata_turn_1: list[dict[str, object]] = [] - - instructions_turn_2: list[str] = [] - completions_a_turn_2: list[str] = [] - completions_b_turn_2: list[str] = [] - metadata_turn_2: list[dict[str, object]] = [] - - for row in iter_mt_bench_pairwise_rows( - questions=questions, - completions_a=completions_A, - completions_b=completions_B, - truncate_input_chars=truncate_input_chars, - ): - needs_ref = row.category in NEED_REF_CATS - if eval_single: - if needs_ref and row.ref_1: - instruction = ( - "[MT-Bench | Turn 1]\n" - "Use the reference answer for correctness checks.\n\n" - f"[Question]\n{row.turn_1_question}\n\n" - f"[Reference Answer]\n{row.ref_1}" - ) - else: - 
instruction = row.turn_1_question - - instructions_turn_1.append(instruction) - completions_a_turn_1.append(row.answer_a_1) - completions_b_turn_1.append(row.answer_b_1) - metadata_turn_1.append( - { - "question_id": row.question_id, - "category": row.category, - "turn": 1, - } - ) - - if eval_multi and row.turn_2_question: - instruction_parts = [ - "Please focus on which assistant provides a better answer to the second user question." - ] - if needs_ref and (row.ref_1 or row.ref_2): - instruction_parts.extend( - [ - "<|The Start of Reference Answer|>", - "### User:", - row.turn_1_question, - "### Reference answer:", - row.ref_1, - "### User:", - row.turn_2_question, - "### Reference answer:", - row.ref_2, - "<|The End of Reference Answer|>", - ] - ) - - conversation_a = _format_mt_bench_multiturn_conversation( - turn_1_question=row.turn_1_question, - turn_1_answer=row.answer_a_1, - turn_2_question=row.turn_2_question, - turn_2_answer=row.answer_a_2, - ) - conversation_b = _format_mt_bench_multiturn_conversation( - turn_1_question=row.turn_1_question, - turn_1_answer=row.answer_b_1, - turn_2_question=row.turn_2_question, - turn_2_answer=row.answer_b_2, - ) - - instructions_turn_2.append("\n\n".join(instruction_parts)) - completions_a_turn_2.append(conversation_a) - completions_b_turn_2.append(conversation_b) - metadata_turn_2.append( - { - "question_id": row.question_id, - "category": row.category, - "turn": 2, - } - ) - - return ( - ( - instructions_turn_1, - completions_a_turn_1, - completions_b_turn_1, - metadata_turn_1, - ), - ( - instructions_turn_2, - completions_a_turn_2, - completions_b_turn_2, - metadata_turn_2, - ), - ) - - -def _format_mt_bench_multiturn_conversation( - *, - turn_1_question: str, - turn_1_answer: str, - turn_2_question: str, - turn_2_answer: str, -) -> str: - return ( - "### User:\n" - f"{turn_1_question}\n\n" - "### Assistant:\n" - f"{turn_1_answer}\n\n" - "### User:\n" - f"{turn_2_question}\n\n" - "### Assistant:\n" - 
f"{turn_2_answer}" - ) - - def compute_preference_stats(prefs: pd.Series) -> dict: """Derive win/loss/tie counts and winrate from a Series of preferences. @@ -608,9 +490,9 @@ def main(args: CliArgs): # MT-Bench has its own pipeline: multi-turn generation + category-aware judging if args.dataset == "mt-bench": - return _run_mt_bench(args, ignore_cache) + return run_mt_bench(args, ignore_cache) if args.dataset == "mt-bench-101": - return _run_mt_bench_101(args, ignore_cache) + return run_mt_bench_101(args, ignore_cache) # Currrently, we run context evaluation is_fluency_task = "fluency" in args.dataset @@ -635,9 +517,25 @@ def main(args: CliArgs): # TODO currently we just support base models for fluency, we could also support instruction-tuned models gen_fun = ( - partial(generate_base, truncate_input_chars=args.truncate_all_input_chars, max_tokens=args.max_out_tokens_models, max_model_len=args.max_model_len, chat_template=args.chat_template) + partial( + generate_base, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + use_tqdm=args.use_tqdm, + **args.engine_kwargs, + ) if is_fluency_task - else partial(generate_instructions, truncate_input_chars=args.truncate_all_input_chars, max_tokens=args.max_out_tokens_models, chat_template=args.chat_template, max_model_len=args.max_model_len) + else partial( + generate_instructions, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + use_tqdm=args.use_tqdm, + **args.engine_kwargs, + ) ) dataset_completions_A = try_load_dataset_completions( args.dataset, args.model_A, n_instructions @@ -685,6 +583,7 @@ def main(args: CliArgs): max_tokens=args.max_out_tokens_judge, max_model_len=args.max_model_len, chat_template=args.chat_template, + **args.engine_kwargs, ) if is_fluency_task: system_prompt = 
"""You are a highly efficient assistant, who evaluates and selects the best large language \ @@ -819,36 +718,6 @@ def _run_generation(model_name: str) -> pd.DataFrame: return completions_a, completions_b -def _generate_mt_bench_101_completions( - args: CliArgs, - eval_items_df: pd.DataFrame, - ignore_cache: bool, -) -> tuple[pd.DataFrame, pd.DataFrame]: - def _run_generation(model_name: str) -> pd.DataFrame: - return generate_mt_bench_101_completions( - eval_items=eval_items_df, - model=model_name, - truncate_input_chars=args.truncate_all_input_chars, - max_tokens=args.max_out_tokens_models, - use_tqdm=args.use_tqdm, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - ) - - completions_a = cache_function_dataframe( - lambda: _run_generation(args.model_A), - ignore_cache=ignore_cache, - cache_name=f"mt-bench-101_{args.model_A}_{args.n_instructions}", - ).set_index("instruction_index") - - completions_b = cache_function_dataframe( - lambda: _run_generation(args.model_B), - ignore_cache=ignore_cache, - cache_name=f"mt-bench-101_{args.model_B}_{args.n_instructions}", - ).set_index("instruction_index") - return completions_a, completions_b - - def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str: name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" name += f"-{args.swap_mode}" @@ -857,13 +726,6 @@ def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str return name.replace("/", "_") -def _build_mt_bench_101_result_name(args: CliArgs, suffix: str | None = None) -> str: - name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" - if suffix: - name += f"-{suffix}" - return name.replace("/", "_") - - def _save_mt_bench_results( *, args: CliArgs, @@ -1093,124 +955,6 @@ def _run_mt_bench_openjury( return prefs -def _run_mt_bench_101(args: CliArgs, ignore_cache: bool) -> pd.Series: - """MT-Bench-101 pipeline with paper-faithful single-answer grading.""" - if 
args.mt_bench_compatibility or args.mt_bench_turns: - print( - "MT-Bench-101 is a different benchmark from original MT-Bench. " - "--mt_bench_turns and --mt_bench_compatibility have no effect for this dataset, " - ) - if args.swap_mode: - print( - "--swap_mode has no effect for mt-bench-101 since it does single answer grading before comparing the models" - ) - - eval_items_df = load_instructions( - "mt-bench-101", n_instructions=args.n_instructions - ) - print( - "Generating completions from golden context for MT-Bench-101 with " - f"{args.model_A} and {args.model_B}." - ) - completions_a, completions_b = _generate_mt_bench_101_completions( - args=args, - eval_items_df=eval_items_df, - ignore_cache=ignore_cache, - ) - - judge_chat_model = make_model( - model=args.judge_model, - max_tokens=args.max_out_tokens_judge, - temperature=0.6, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - ) - scored_a = judge_mt_bench_101_single( - judge_chat_model=judge_chat_model, - eval_items=eval_items_df, - completions=completions_a, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - ) - scored_b = judge_mt_bench_101_single( - judge_chat_model=judge_chat_model, - eval_items=eval_items_df, - completions=completions_b, - truncate_input_chars=args.truncate_all_input_chars, - use_tqdm=args.use_tqdm, - ) - - absolute_a = summarize_mt_bench_101_absolute_scores(scored_turns=scored_a) - absolute_b = summarize_mt_bench_101_absolute_scores(scored_turns=scored_b) - pairwise_turns = derive_mt_bench_101_pairwise_preferences( - scored_a=scored_a, - scored_b=scored_b, - ) - pairwise_summary = summarize_mt_bench_101_pairwise(pairwise_turns=pairwise_turns) - dialogue_pairwise = pairwise_summary["dialogue_level"] - - print(f"{args.model_A} vs {args.model_B} judged by {args.judge_model}") - print( - "MT-Bench-101 dialogue-level pairwise winrate(A): " - f"{dialogue_pairwise['winrate']:.1%}" - ) - - ann_cols = [ - "instruction_index", - 
"dialogue_uid", - "dialogue_id", - "task", - "ability", - "turn_index", - "model_completion", - "judge_completion", - "score", - ] - annotations_a = scored_a.loc[:, ann_cols].copy() - annotations_a["evaluated_model"] = args.model_A - annotations_b = scored_b.loc[:, ann_cols].copy() - annotations_b["evaluated_model"] = args.model_B - annotations_df = pd.concat([annotations_a, annotations_b], ignore_index=True) - annotations_df = annotations_df.merge( - pairwise_turns.loc[ - :, ["instruction_index", "score_A", "score_B", "preference"] - ], - on="instruction_index", - how="left", - validate="many_to_one", - ) - - results = { - "dataset": args.dataset, - "model_A": args.model_A, - "model_B": args.model_B, - "judge_model": args.judge_model, - "judge_temperature": 0.6, - "evaluation_mode": "single_answer_grading", - "num_battles": dialogue_pairwise["num_battles"], - "winrate": dialogue_pairwise["winrate"], - "num_wins": dialogue_pairwise["num_wins"], - "num_losses": dialogue_pairwise["num_losses"], - "num_ties": dialogue_pairwise["num_ties"], - "num_missing": dialogue_pairwise["num_missing"], - "per_category": dialogue_pairwise["per_task"], - "model_A_scores": absolute_a, - "model_B_scores": absolute_b, - "pairwise": pairwise_summary, - "preferences": pairwise_summary["preferences"], - "date": str(datetime.now().isoformat()), - "user": os.getenv("USER", ""), - } - - _save_mt_bench_results( - args=args, - results=results, - annotations_df=annotations_df, - result_name=_build_mt_bench_101_result_name(args, suffix="mtbench_101"), - ) - return pd.Series(pairwise_summary["preferences"]) - - def _run_mt_bench(args: CliArgs, ignore_cache: bool): """MT-Bench pipeline (optionally FastChat-compatible).""" questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) diff --git a/openjury/mt_bench/fastchat_compat.py b/openjury/mt_bench/fastchat_compat.py index 5883391..728b0f2 100644 --- a/openjury/mt_bench/fastchat_compat.py +++ 
b/openjury/mt_bench/fastchat_compat.py @@ -21,11 +21,9 @@ "reasoning": 0.0, "stem": 0.1, "humanities": 0.1, - "arena-hard-200": 0.0, } -# "arena-hard-200" is a FastChat-internal category label, not OpenJury's arena-hard dataset. -FASTCHAT_NEED_REF_CATS: set[str] = {"math", "reasoning", "coding", "arena-hard-200"} +FASTCHAT_NEED_REF_CATS: set[str] = {"math", "reasoning", "coding"} FastChatVerdict = Literal["A", "B", "tie", "error"] PairwiseWinner = Literal["model_A", "model_B", "tie", "error"] diff --git a/openjury/mt_bench/pipeline.py b/openjury/mt_bench/pipeline.py new file mode 100644 index 0000000..6c949dc --- /dev/null +++ b/openjury/mt_bench/pipeline.py @@ -0,0 +1,484 @@ +"""MT-Bench evaluation pipeline. + +Orchestrates multi-turn generation, per-turn judging (OpenJury or +FastChat-compatible), and result saving for the MT-Bench benchmark. +""" + +from __future__ import annotations + +import json +import os +from dataclasses import asdict +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING + +import pandas as pd + +from openjury.evaluate import PairScore, load_judge_system_and_user_prompt +from openjury.generate import generate_multiturn +from openjury.instruction_dataset import load_instructions +from openjury.mt_bench.common import iter_mt_bench_pairwise_rows +from openjury.mt_bench.fastchat_compat import ( + FASTCHAT_TEMPERATURE_CONFIG, + judge_mt_bench_pairwise_fastchat, +) +from openjury.utils import cache_function_dataframe, make_model + +if TYPE_CHECKING: + from openjury.generate_and_evaluate import CliArgs + +NEED_REF_CATS = {"math", "reasoning", "coding"} + + +def format_mt_bench_for_evaluation( + questions: pd.DataFrame, + completions_A: pd.DataFrame, + completions_B: pd.DataFrame, + turns_mode: str, + truncate_input_chars: int | None, +) -> tuple[ + tuple[list[str], list[str], list[str], list[dict[str, object]]], + tuple[list[str], list[str], list[str], list[dict[str, object]]], +]: + """Flatten MT-Bench into 
per-turn instruction/completion battle inputs.""" + assert turns_mode in ("both", "single", "multi") + eval_single = turns_mode in ("both", "single") + eval_multi = turns_mode in ("both", "multi") + + instructions_turn_1: list[str] = [] + completions_a_turn_1: list[str] = [] + completions_b_turn_1: list[str] = [] + metadata_turn_1: list[dict[str, object]] = [] + + instructions_turn_2: list[str] = [] + completions_a_turn_2: list[str] = [] + completions_b_turn_2: list[str] = [] + metadata_turn_2: list[dict[str, object]] = [] + + for row in iter_mt_bench_pairwise_rows( + questions=questions, + completions_a=completions_A, + completions_b=completions_B, + truncate_input_chars=truncate_input_chars, + ): + needs_ref = row.category in NEED_REF_CATS + if eval_single: + if needs_ref and row.ref_1: + instruction = ( + "[MT-Bench | Turn 1]\n" + "Use the reference answer for correctness checks.\n\n" + f"[Question]\n{row.turn_1_question}\n\n" + f"[Reference Answer]\n{row.ref_1}" + ) + else: + instruction = row.turn_1_question + + instructions_turn_1.append(instruction) + completions_a_turn_1.append(row.answer_a_1) + completions_b_turn_1.append(row.answer_b_1) + metadata_turn_1.append( + { + "question_id": row.question_id, + "category": row.category, + "turn": 1, + } + ) + + if eval_multi and row.turn_2_question: + instruction_parts = [ + "Please focus on which assistant provides a better answer to the second user question." 
+ ] + if needs_ref and (row.ref_1 or row.ref_2): + instruction_parts.extend( + [ + "<|The Start of Reference Answer|>", + "### User:", + row.turn_1_question, + "### Reference answer:", + row.ref_1, + "### User:", + row.turn_2_question, + "### Reference answer:", + row.ref_2, + "<|The End of Reference Answer|>", + ] + ) + + conversation_a = _format_mt_bench_multiturn_conversation( + turn_1_question=row.turn_1_question, + turn_1_answer=row.answer_a_1, + turn_2_question=row.turn_2_question, + turn_2_answer=row.answer_a_2, + ) + conversation_b = _format_mt_bench_multiturn_conversation( + turn_1_question=row.turn_1_question, + turn_1_answer=row.answer_b_1, + turn_2_question=row.turn_2_question, + turn_2_answer=row.answer_b_2, + ) + + instructions_turn_2.append("\n\n".join(instruction_parts)) + completions_a_turn_2.append(conversation_a) + completions_b_turn_2.append(conversation_b) + metadata_turn_2.append( + { + "question_id": row.question_id, + "category": row.category, + "turn": 2, + } + ) + + return ( + ( + instructions_turn_1, + completions_a_turn_1, + completions_b_turn_1, + metadata_turn_1, + ), + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + metadata_turn_2, + ), + ) + + +def _format_mt_bench_multiturn_conversation( + *, + turn_1_question: str, + turn_1_answer: str, + turn_2_question: str, + turn_2_answer: str, +) -> str: + return ( + "### User:\n" + f"{turn_1_question}\n\n" + "### Assistant:\n" + f"{turn_1_answer}\n\n" + "### User:\n" + f"{turn_2_question}\n\n" + "### Assistant:\n" + f"{turn_2_answer}" + ) + + +def _generate_mt_bench_completions( + args: CliArgs, + questions_df: pd.DataFrame, + ignore_cache: bool, +) -> tuple[pd.DataFrame, pd.DataFrame]: + cache_prefix = ( + "mt-bench_fastchatgen" if args.mt_bench_compatibility == "fastchat" else "mt-bench" + ) + + def _run_generation(model_name: str) -> pd.DataFrame: + if args.mt_bench_compatibility == "fastchat": + return generate_multiturn( + questions=questions_df, + 
model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + temperature_config=FASTCHAT_TEMPERATURE_CONFIG, + ) + return generate_multiturn( + questions=questions_df, + model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + + completions_a = cache_function_dataframe( + lambda: _run_generation(args.model_A), + ignore_cache=ignore_cache, + cache_name=f"{cache_prefix}_{args.model_A}_{args.n_instructions}", + ).set_index("instruction_index") + + completions_b = cache_function_dataframe( + lambda: _run_generation(args.model_B), + ignore_cache=ignore_cache, + cache_name=f"{cache_prefix}_{args.model_B}_{args.n_instructions}", + ).set_index("instruction_index") + return completions_a, completions_b + + +def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str: + name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + name += f"-{args.swap_mode}" + if suffix: + name += f"-{suffix}" + return name.replace("/", "_") + + +def _save_mt_bench_results( + *, + args: CliArgs, + results: dict[str, object], + annotations_df: pd.DataFrame, + name_suffix: str | None = None, +) -> None: + name = _build_mt_bench_result_name(args, suffix=name_suffix) + res_folder = Path(args.result_folder) / name + res_folder.mkdir(parents=True, exist_ok=True) + + with open(res_folder / f"args-{name}.json", "w") as f: + json.dump(asdict(args), f, indent=2) + + annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False) + + with open(res_folder / f"results-{name}.json", "w") as f: + json.dump(results, f, indent=2) + + +def _run_mt_bench_fastchat( + *, + args: CliArgs, + questions_df: pd.DataFrame, + completions_a: pd.DataFrame, + 
completions_b: pd.DataFrame, + judge_chat_model, +) -> pd.Series: + from openjury.generate_and_evaluate import ( + _compute_grouped_stats, + compute_preference_stats, + print_results, + ) + + prefs, annotations, combined_metadata, num_inconsistent = ( + judge_mt_bench_pairwise_fastchat( + judge_chat_model=judge_chat_model, + judge_model=args.judge_model, + questions=questions_df, + completions_a=completions_a, + completions_b=completions_b, + model_a=args.model_A, + model_b=args.model_B, + turns_mode=args.mt_bench_turns, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + ) + + stats = compute_preference_stats(prefs) + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + "mt_bench_compatibility": args.mt_bench_compatibility, + "num_inconsistent": num_inconsistent, + **stats, + "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), + "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), + "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + print_results(results) + _save_mt_bench_results( + args=args, + results=results, + annotations_df=pd.DataFrame(annotations), + name_suffix=f"mtbench_{args.mt_bench_compatibility}", + ) + return prefs + + +def _run_mt_bench_openjury( + *, + args: CliArgs, + questions_df: pd.DataFrame, + completions_a: pd.DataFrame, + completions_b: pd.DataFrame, + judge_chat_model, +) -> pd.Series: + from openjury.generate_and_evaluate import ( + _compute_grouped_stats, + _judge_turn, + compute_preference_stats, + print_results, + ) + + turn_1_inputs, turn_2_inputs = format_mt_bench_for_evaluation( + questions=questions_df, + completions_A=completions_a, + completions_B=completions_b, + turns_mode=args.mt_bench_turns, + truncate_input_chars=args.truncate_all_input_chars, + ) + ( + instructions_turn_1, + 
completions_a_turn_1, + completions_b_turn_1, + metadata_turn_1, + ) = turn_1_inputs + ( + instructions_turn_2, + completions_a_turn_2, + completions_b_turn_2, + metadata_turn_2, + ) = turn_2_inputs + + score_parser = PairScore() + annotations = [] + metadata_for_annotations: list[dict[str, object]] = [] + annotations_reversed = [] + metadata_for_reversed_annotations: list[dict[str, object]] = [] + preference_parts: list[pd.Series] = [] + combined_metadata: list[dict[str, object]] = [] + + if instructions_turn_1: + ( + annotations_turn_1, + annotations_turn_1_reversed, + metadata_turn_1_for_annotations, + metadata_turn_1_for_reversed_annotations, + prefs_turn_1, + combined_metadata_turn_1, + ) = _judge_turn( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_1, + completions_A=completions_a_turn_1, + completions_B=completions_b_turn_1, + metadata=metadata_turn_1, + score_parser=score_parser, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + annotations.extend(annotations_turn_1) + annotations_reversed.extend(annotations_turn_1_reversed) + metadata_for_annotations.extend(metadata_turn_1_for_annotations) + metadata_for_reversed_annotations.extend( + metadata_turn_1_for_reversed_annotations + ) + preference_parts.append(prefs_turn_1) + combined_metadata.extend(combined_metadata_turn_1) + + if instructions_turn_2: + mt_system_prompt, mt_user_prompt_template = load_judge_system_and_user_prompt( + provide_explanation=args.provide_explanation, + multi_turn=True, + ) + ( + annotations_turn_2, + annotations_turn_2_reversed, + metadata_turn_2_for_annotations, + metadata_turn_2_for_reversed_annotations, + prefs_turn_2, + combined_metadata_turn_2, + ) = _judge_turn( + judge_chat_model=judge_chat_model, + instructions=instructions_turn_2, + completions_A=completions_a_turn_2, + completions_B=completions_b_turn_2, + metadata=metadata_turn_2, + 
score_parser=score_parser, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + system_prompt=mt_system_prompt, + user_prompt_template=mt_user_prompt_template, + ) + annotations.extend(annotations_turn_2) + annotations_reversed.extend(annotations_turn_2_reversed) + metadata_for_annotations.extend(metadata_turn_2_for_annotations) + metadata_for_reversed_annotations.extend( + metadata_turn_2_for_reversed_annotations + ) + preference_parts.append(prefs_turn_2) + combined_metadata.extend(combined_metadata_turn_2) + + prefs = ( + pd.concat(preference_parts).reset_index(drop=True) + if preference_parts + else pd.Series(dtype=float) + ) + stats = compute_preference_stats(prefs) + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + **stats, + "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"), + "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"), + "preferences": prefs.tolist(), + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + print_results(results) + + df = pd.DataFrame(annotations) + df["instruction_index"] = [meta["question_id"] for meta in metadata_for_annotations] + df["category"] = [meta["category"] for meta in metadata_for_annotations] + df["turn"] = [meta["turn"] for meta in metadata_for_annotations] + df["model_A"] = args.model_A + df["model_B"] = args.model_B + df["judge"] = args.judge_model + + if args.swap_mode == "both": + df_reversed = pd.DataFrame(annotations_reversed) + df_reversed["instruction_index"] = [ + meta["question_id"] for meta in metadata_for_reversed_annotations + ] + df_reversed["category"] = [ + meta["category"] for meta in metadata_for_reversed_annotations + ] + df_reversed["turn"] = [meta["turn"] for meta in metadata_for_reversed_annotations] + df_reversed["model_A"] = args.model_B + 
df_reversed["model_B"] = args.model_A + df_reversed["judge"] = args.judge_model + df = pd.concat([df, df_reversed], ignore_index=True) + + _save_mt_bench_results( + args=args, + results=results, + annotations_df=df, + ) + return prefs + + +def run_mt_bench(args: CliArgs, ignore_cache: bool): + """MT-Bench pipeline (optionally FastChat-compatible).""" + questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions) + print( + f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}." + ) + completions_a, completions_b = _generate_mt_bench_completions( + args=args, + questions_df=questions_df, + ignore_cache=ignore_cache, + ) + judge_chat_model = make_model( + model=args.judge_model, + max_tokens=args.max_out_tokens_judge, + temperature=0.0 if args.mt_bench_compatibility == "fastchat" else None, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + if args.mt_bench_compatibility == "fastchat": + return _run_mt_bench_fastchat( + args=args, + questions_df=questions_df, + completions_a=completions_a, + completions_b=completions_b, + judge_chat_model=judge_chat_model, + ) + return _run_mt_bench_openjury( + args=args, + questions_df=questions_df, + completions_a=completions_a, + completions_b=completions_b, + judge_chat_model=judge_chat_model, + ) diff --git a/openjury/mt_bench_101/__init__.py b/openjury/mt_bench_101/__init__.py index abe2e9e..7fd025f 100644 --- a/openjury/mt_bench_101/__init__.py +++ b/openjury/mt_bench_101/__init__.py @@ -1,4 +1,5 @@ from openjury.mt_bench_101.generate import generate_mt_bench_101_completions +from openjury.mt_bench_101.pipeline import run_mt_bench_101 from openjury.mt_bench_101.evaluate import ( derive_mt_bench_101_pairwise_preferences, judge_mt_bench_101_single, @@ -10,6 +11,7 @@ "derive_mt_bench_101_pairwise_preferences", "generate_mt_bench_101_completions", "judge_mt_bench_101_single", + "run_mt_bench_101", "summarize_mt_bench_101_absolute_scores", 
"summarize_mt_bench_101_pairwise", ] diff --git a/openjury/mt_bench_101/generate.py b/openjury/mt_bench_101/generate.py index 8d2b1c3..ab8d75e 100644 --- a/openjury/mt_bench_101/generate.py +++ b/openjury/mt_bench_101/generate.py @@ -8,6 +8,10 @@ DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant." +def _escape_template_braces(text: str) -> str: + return text.replace("{", "{{").replace("}", "}}") + + def _build_golden_context_input( *, system_prompt: str, @@ -15,18 +19,32 @@ def _build_golden_context_input( user_message: str, truncate_input_chars: int | None, ): - messages: list[tuple[str, str]] = [("system", system_prompt)] + messages: list[tuple[str, str]] = [("system", _escape_template_braces(system_prompt))] for turn in golden_context: messages.append( - ("user", truncate(str(turn.get("user") or ""), max_len=truncate_input_chars)) + ( + "user", + _escape_template_braces( + truncate(str(turn.get("user") or ""), max_len=truncate_input_chars) + ), + ) ) messages.append( ( "assistant", - truncate(str(turn.get("bot") or ""), max_len=truncate_input_chars), + _escape_template_braces( + truncate(str(turn.get("bot") or ""), max_len=truncate_input_chars) + ), ) ) - messages.append(("user", truncate(user_message, max_len=truncate_input_chars))) + messages.append( + ( + "user", + _escape_template_braces( + truncate(user_message, max_len=truncate_input_chars) + ), + ) + ) return ChatPromptTemplate.from_messages(messages).invoke({}) diff --git a/openjury/mt_bench_101/pipeline.py b/openjury/mt_bench_101/pipeline.py new file mode 100644 index 0000000..2bacbef --- /dev/null +++ b/openjury/mt_bench_101/pipeline.py @@ -0,0 +1,201 @@ +"""MT-Bench-101 evaluation pipeline.""" + +from __future__ import annotations + +import json +import os +from dataclasses import asdict +from datetime import datetime +from pathlib import Path +from typing import TYPE_CHECKING + +import pandas as pd + +from openjury.mt_bench_101.evaluate import ( + derive_mt_bench_101_pairwise_preferences, + 
judge_mt_bench_101_single, + summarize_mt_bench_101_absolute_scores, + summarize_mt_bench_101_pairwise, +) +from openjury.mt_bench_101.generate import generate_mt_bench_101_completions +from openjury.utils import cache_function_dataframe, make_model + +if TYPE_CHECKING: + from openjury.generate_and_evaluate import CliArgs + + +def _generate_mt_bench_101_completions( + args: CliArgs, + eval_items_df: pd.DataFrame, + ignore_cache: bool, +) -> tuple[pd.DataFrame, pd.DataFrame]: + def _run_generation(model_name: str) -> pd.DataFrame: + return generate_mt_bench_101_completions( + eval_items=eval_items_df, + model=model_name, + truncate_input_chars=args.truncate_all_input_chars, + max_tokens=args.max_out_tokens_models, + use_tqdm=args.use_tqdm, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + + completions_a = cache_function_dataframe( + lambda: _run_generation(args.model_A), + ignore_cache=ignore_cache, + cache_name=f"mt-bench-101_{args.model_A}_{args.n_instructions}", + ).set_index("instruction_index") + + completions_b = cache_function_dataframe( + lambda: _run_generation(args.model_B), + ignore_cache=ignore_cache, + cache_name=f"mt-bench-101_{args.model_B}_{args.n_instructions}", + ).set_index("instruction_index") + return completions_a, completions_b + + +def _build_mt_bench_101_result_name(args: CliArgs, suffix: str | None = None) -> str: + name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}" + if suffix: + name += f"-{suffix}" + return name.replace("/", "_") + + +def _save_mt_bench_101_results( + *, + args: CliArgs, + results: dict[str, object], + annotations_df: pd.DataFrame, + name_suffix: str | None = None, +) -> None: + name = _build_mt_bench_101_result_name(args, suffix=name_suffix) + res_folder = Path(args.result_folder) / name + res_folder.mkdir(parents=True, exist_ok=True) + + with open(res_folder / f"args-{name}.json", "w") as f: + json.dump(asdict(args), f, indent=2) + + annotations_df.to_csv(res_folder / 
f"{name}-annotations.csv", index=False) + + with open(res_folder / f"results-{name}.json", "w") as f: + json.dump(results, f, indent=2) + + +def run_mt_bench_101(args: CliArgs, ignore_cache: bool) -> pd.Series: + """MT-Bench-101 pipeline with single-answer grading.""" + if args.mt_bench_compatibility or args.mt_bench_turns: + print( + "MT-Bench-101 is a different benchmark from original MT-Bench. " + "--mt_bench_turns and --mt_bench_compatibility have no effect for this dataset, " + ) + if args.swap_mode: + print( + "--swap_mode has no effect for mt-bench-101 since it does single answer grading before comparing the models" + ) + + from openjury import generate_and_evaluate as gae + + eval_items_df = gae.load_instructions( + "mt-bench-101", n_instructions=args.n_instructions + ) + print( + "Generating completions from golden context for MT-Bench-101 with " + f"{args.model_A} and {args.model_B}." + ) + completions_a, completions_b = _generate_mt_bench_101_completions( + args=args, + eval_items_df=eval_items_df, + ignore_cache=ignore_cache, + ) + + judge_chat_model = make_model( + model=args.judge_model, + max_tokens=args.max_out_tokens_judge, + temperature=0.6, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + ) + scored_a = judge_mt_bench_101_single( + judge_chat_model=judge_chat_model, + eval_items=eval_items_df, + completions=completions_a, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + scored_b = judge_mt_bench_101_single( + judge_chat_model=judge_chat_model, + eval_items=eval_items_df, + completions=completions_b, + truncate_input_chars=args.truncate_all_input_chars, + use_tqdm=args.use_tqdm, + ) + + absolute_a = summarize_mt_bench_101_absolute_scores(scored_turns=scored_a) + absolute_b = summarize_mt_bench_101_absolute_scores(scored_turns=scored_b) + pairwise_turns = derive_mt_bench_101_pairwise_preferences( + scored_a=scored_a, + scored_b=scored_b, + ) + pairwise_summary = 
summarize_mt_bench_101_pairwise(pairwise_turns=pairwise_turns) + dialogue_pairwise = pairwise_summary["dialogue_level"] + + print(f"{args.model_A} vs {args.model_B} judged by {args.judge_model}") + print( + "MT-Bench-101 dialogue-level pairwise winrate(A): " + f"{dialogue_pairwise['winrate']:.1%}" + ) + + ann_cols = [ + "instruction_index", + "dialogue_uid", + "dialogue_id", + "task", + "ability", + "turn_index", + "model_completion", + "judge_completion", + "score", + ] + annotations_a = scored_a.loc[:, ann_cols].copy() + annotations_a["evaluated_model"] = args.model_A + annotations_b = scored_b.loc[:, ann_cols].copy() + annotations_b["evaluated_model"] = args.model_B + annotations_df = pd.concat([annotations_a, annotations_b], ignore_index=True) + annotations_df = annotations_df.merge( + pairwise_turns.loc[ + :, ["instruction_index", "score_A", "score_B", "preference"] + ], + on="instruction_index", + how="left", + validate="many_to_one", + ) + + results = { + "dataset": args.dataset, + "model_A": args.model_A, + "model_B": args.model_B, + "judge_model": args.judge_model, + "judge_temperature": 0.6, + "evaluation_mode": "single_answer_grading", + "num_battles": dialogue_pairwise["num_battles"], + "winrate": dialogue_pairwise["winrate"], + "num_wins": dialogue_pairwise["num_wins"], + "num_losses": dialogue_pairwise["num_losses"], + "num_ties": dialogue_pairwise["num_ties"], + "num_missing": dialogue_pairwise["num_missing"], + "per_category": dialogue_pairwise["per_task"], + "model_A_scores": absolute_a, + "model_B_scores": absolute_b, + "pairwise": pairwise_summary, + "preferences": pairwise_summary["preferences"], + "date": str(datetime.now().isoformat()), + "user": os.getenv("USER", ""), + } + + _save_mt_bench_101_results( + args=args, + results=results, + annotations_df=annotations_df, + name_suffix="mtbench_101", + ) + return pd.Series(pairwise_summary["preferences"]) diff --git a/openjury/utils.py b/openjury/utils.py index 2079424..cf5516e 100644 --- 
a/openjury/utils.py +++ b/openjury/utils.py @@ -407,7 +407,7 @@ def make_model( model: str, max_tokens: int | None = 8192, temperature: float | None = None, - **kwargs, + **engine_kwargs, ): """Instantiate a model wrapper from a provider/model-name string. @@ -417,10 +417,18 @@ def make_model( max_tokens: Maximum tokens the model may generate. temperature: Optional generation temperature override. ``None`` keeps each provider wrapper's default temperature behavior. - **kwargs: Provider-specific options forwarded to the model wrapper. - For VLLM these include ``max_model_len``, ``chat_template``, and - any other ``vllm.LLM`` constructor arguments. + **engine_kwargs: Engine-specific options forwarded to the model wrapper. """ + # Avoid mutating the original engine_kwargs dictionary + # NOTE: this is a shallow copy since we are not modifying any + # mutable objects in the dictionary. + engine_kwargs = engine_kwargs.copy() + + # Dedicated arguments like max_tokens always win over engine_kwargs. 
+ engine_kwargs["max_tokens"] = max_tokens or 8192 + if temperature is not None: + engine_kwargs["temperature"] = temperature + model_provider = model.split("/")[0] if model_provider == "Dummy": @@ -431,39 +439,30 @@ def make_model( # Use our custom ChatVLLM wrapper which properly applies chat templates if model_provider == "VLLM": - chat_template = kwargs.pop("chat_template", None) - vllm_kwargs = {k: v for k, v in kwargs.items() if v is not None} + engine_kwargs = {k: v for k, v in engine_kwargs.items() if v is not None} + engine_kwargs["chat_template"] = engine_kwargs.get("chat_template", None) + return ChatVLLM( model=model_name, - max_tokens=max_tokens if max_tokens else 8192, - temperature=temperature if temperature is not None else 0.6, - chat_template=chat_template, - **vllm_kwargs, + **engine_kwargs, ) - - model_kwargs = {} - if max_tokens is not None: - model_kwargs["max_tokens"] = max_tokens - if temperature is not None: - model_kwargs["temperature"] = temperature - if model_provider == "OpenRouter": # Special case we need to override API url and key return ChatOpenAI( api_key=os.getenv("OPENROUTER_API_KEY"), base_url="https://openrouter.ai/api/v1", model=model_name, - **model_kwargs, + **engine_kwargs, ) elif model_provider == "LlamaCpp": - model_kwargs["model_path"] = model_name - model_kwargs.setdefault("n_ctx", 0) - return ChatLlamaCppModel(**model_kwargs) + engine_kwargs["model_path"] = model_name + engine_kwargs.setdefault("n_ctx", 0) + return ChatLlamaCppModel(**engine_kwargs) else: model_classes = [ ChatOpenAI, ] - model_kwargs["model"] = model_name + engine_kwargs["model"] = model_name try: from langchain_together.llms import Together @@ -475,13 +474,13 @@ def make_model( from langchain_openai.llms import OpenAI model_classes.append(OpenAI) - except ImportError: + except ImportError as e: print(str(e)) model_cls_dict = {model_cls.__name__: model_cls for model_cls in model_classes} assert ( model_provider in model_cls_dict ), f"{model_provider} 
not available, choose among {list(model_cls_dict.keys())}" - return model_cls_dict[model_provider](**model_kwargs) + return model_cls_dict[model_provider](**engine_kwargs) def download_all(): diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 8b92d27..0ad57eb 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -2,6 +2,7 @@ import pytest import openjury.generate_and_evaluate as generate_and_evaluate +import openjury.mt_bench.pipeline as mt_bench_pipeline from openjury.generate_and_evaluate import ( main as main_generate_and_eval, CliArgs, @@ -79,6 +80,11 @@ def _load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Da "load_instructions", _load_instructions, ) + monkeypatch.setattr( + mt_bench_pipeline, + "load_instructions", + _load_instructions, + ) monkeypatch.setattr( generate_and_evaluate, "load_contexts", @@ -97,6 +103,9 @@ def _run_without_cache(fun, **_kwargs): monkeypatch.setattr( generate_and_evaluate, "cache_function_dataframe", _run_without_cache ) + monkeypatch.setattr( + mt_bench_pipeline, "cache_function_dataframe", _run_without_cache + ) @pytest.mark.parametrize( From 6dd78fda9bce60a3a18746cbdaa60cddcbf730d0 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Wed, 18 Mar 2026 00:09:15 +0100 Subject: [PATCH 33/35] Refactor mt-bench eval helpers into shared runtime module --- openjury/eval_runtime.py | 171 ++++++++++++++++++++++++++++++ openjury/generate_and_evaluate.py | 171 +----------------------------- openjury/mt_bench/pipeline.py | 19 ++-- 3 files changed, 178 insertions(+), 183 deletions(-) create mode 100644 openjury/eval_runtime.py diff --git a/openjury/eval_runtime.py b/openjury/eval_runtime.py new file mode 100644 index 0000000..dd367e5 --- /dev/null +++ b/openjury/eval_runtime.py @@ -0,0 +1,171 @@ +"""Shared evaluation runtime helpers used by entrypoints and benchmark pipelines.""" + +from __future__ import annotations + +import pandas as 
pd + +from openjury.evaluate import annotate_battles, PairScore + + +def print_results(results): + """Print battle results in a readable format.""" + print("\n" + "=" * 60) + print("🏆 MODEL BATTLE RESULTS 🏆".center(60)) + print(f"📊 Dataset: {results['dataset']}") + print( + f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}" + ) + print(f"⚖️ Judge: {results['judge_model']}") + print("📈 Results Summary:") + print(f" Total Battles: {results['num_battles']}") + print(f" Win Rate (A): {results['winrate']:.1%}") + print(f" ✅ Wins: {results['num_wins']}") + print(f" ❌ Losses: {results['num_losses']}") + print(f" 🤝 Ties: {results['num_ties']}") + if results.get("num_missing", 0) > 0: + print(f" ❓ Missing: {results['num_missing']}") + + per_category = results.get("per_category") + if per_category: + print("\nPer-Category Breakdown:") + print( + f" {'Category':<14} | {'Win Rate(A)':>11} | {'Wins':>4} | {'Losses':>6} | {'Ties':>4}" + ) + print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}") + for cat, stats in sorted(per_category.items()): + print( + f" {cat:<14} | {stats['winrate']:>11.1%} | " + f"{stats['num_wins']:>4} | {stats['num_losses']:>6} | {stats['num_ties']:>4}" + ) + + per_turn = results.get("per_turn") + if per_turn: + print("\nPer-Turn Breakdown:") + for turn, stats in sorted(per_turn.items()): + print( + f" Turn {turn} Win Rate(A): {stats['winrate']:.1%} " + f"(W:{stats['num_wins']} L:{stats['num_losses']} T:{stats['num_ties']})" + ) + print("=" * 60 + "\n") + + +def compute_preference_stats(prefs: pd.Series) -> dict: + """Derive win/loss/tie counts and winrate from a Series of preferences.""" + num_battles = len(prefs) + num_wins = int(sum(prefs < 0.5)) + num_losses = int(sum(prefs > 0.5)) + num_ties = int(sum(prefs == 0.5)) + num_missing = num_battles - (num_wins + num_losses + num_ties) + denom = num_wins + num_losses + num_ties + winrate = float((num_wins + 0.5 * num_ties) / denom) if denom else 0.0 + return { 
+ "num_battles": num_battles, + "num_wins": num_wins, + "num_losses": num_losses, + "num_ties": num_ties, + "num_missing": num_missing, + "winrate": winrate, + } + + +def _compute_grouped_stats( + preferences: pd.Series, + metadata: list[dict[str, object]], + group_by: str, +) -> dict[object, dict[str, float | int]]: + grouped: dict[object, list[float]] = {} + for meta, pref in zip(metadata, preferences): + key = meta.get(group_by) + if key is None: + continue + grouped.setdefault(key, []).append(pref) + return { + key: compute_preference_stats(pd.Series(vals)) + for key, vals in grouped.items() + } + + +def _parse_preferences_from_annotations( + annotations: list, + score_parser: PairScore, +) -> pd.Series: + return pd.Series( + [ + score_parser.parse_model_raw(annotation.judge_completion) + for annotation in annotations + ] + ) + + +def _judge_turn( + *, + judge_chat_model, + instructions: list[str], + completions_A: list[str], + completions_B: list[str], + metadata: list[dict[str, object]], + score_parser: PairScore, + provide_explanation: bool, + swap_mode: str, + truncate_input_chars: int | None, + use_tqdm: bool, + system_prompt: str | None = None, + user_prompt_template: str | None = None, +) -> tuple[ + list, + list, + list[dict[str, object]], + list[dict[str, object]], + pd.Series, + list[dict[str, object]], +]: + if not instructions: + return [], [], [], [], pd.Series(dtype=float), [] + + annotations = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions, + completions_A=completions_A, + completions_B=completions_B, + provide_explanation=provide_explanation, + system_prompt=system_prompt, + user_prompt_template=user_prompt_template, + truncate_input_chars=truncate_input_chars, + use_tqdm=use_tqdm, + ) + preference_parts = [_parse_preferences_from_annotations(annotations, score_parser)] + + annotations_reversed: list = [] + metadata_for_reversed_annotations: list[dict[str, object]] = [] + combined_metadata = list(metadata) + + 
if swap_mode == "both": + print("Correction for judge bias towards a certain model position is set.") + print("Evaluating completions with models reversed.") + annotations_reversed = annotate_battles( + judge_chat_model=judge_chat_model, + instructions=instructions, + completions_A=completions_B, + completions_B=completions_A, + provide_explanation=provide_explanation, + system_prompt=system_prompt, + user_prompt_template=user_prompt_template, + truncate_input_chars=truncate_input_chars, + use_tqdm=use_tqdm, + ) + prefs_reversed = _parse_preferences_from_annotations( + annotations_reversed, score_parser + ) + preference_parts.append(1 - prefs_reversed) + metadata_for_reversed_annotations = list(metadata) + combined_metadata.extend(metadata) + + preferences = pd.concat(preference_parts).reset_index(drop=True) + return ( + annotations, + annotations_reversed, + list(metadata), + metadata_for_reversed_annotations, + preferences, + combined_metadata, + ) diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index dbc659e..99986ee 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -14,9 +14,9 @@ import pandas as pd from openjury.evaluate import ( - annotate_battles, PairScore, ) +from openjury.eval_runtime import _judge_turn, compute_preference_stats, print_results from openjury.generate import generate_instructions, generate_base from openjury.instruction_dataset import load_instructions from openjury.mt_bench.pipeline import ( @@ -281,175 +281,6 @@ def load_contexts(dataset: str) -> pd.Series: return pd.read_csv(path).loc[:, "instruction"] -def print_results(results): - """Print battle results in a nice formatted way""" - - print("\n" + "=" * 60) - print("🏆 MODEL BATTLE RESULTS 🏆".center(60)) - print(f"📊 Dataset: {results['dataset']}") - print( - f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}" - ) - print(f"⚖️ Judge: {results['judge_model']}") - print(f"📈 Results 
Summary:") - print(f" Total Battles: {results['num_battles']}") - print(f" Win Rate (A): {results['winrate']:.1%}") - print(f" ✅ Wins: {results['num_wins']}") - print(f" ❌ Losses: {results['num_losses']}") - print(f" 🤝 Ties: {results['num_ties']}") - if results.get("num_missing", 0) > 0: - print(f" ❓ Missing: {results['num_missing']}") - - per_category = results.get("per_category") - if per_category: - print("\nPer-Category Breakdown:") - print( - f" {'Category':<14} | {'Win Rate(A)':>11} | {'Wins':>4} | {'Losses':>6} | {'Ties':>4}" - ) - print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}") - for cat, stats in sorted(per_category.items()): - print( - f" {cat:<14} | {stats['winrate']:>11.1%} | " - f"{stats['num_wins']:>4} | {stats['num_losses']:>6} | {stats['num_ties']:>4}" - ) - - per_turn = results.get("per_turn") - if per_turn: - print("\nPer-Turn Breakdown:") - for turn, stats in sorted(per_turn.items()): - print( - f" Turn {turn} Win Rate(A): {stats['winrate']:.1%} " - f"(W:{stats['num_wins']} L:{stats['num_losses']} T:{stats['num_ties']})" - ) - print("=" * 60 + "\n") - - -def compute_preference_stats(prefs: pd.Series) -> dict: - """Derive win/loss/tie counts and winrate from a Series of preferences. - - Preference < 0.5 means model A wins, > 0.5 means model B wins, - exactly 0.5 is a tie. None/NaN values are counted as missing. 
- """ - num_battles = len(prefs) - num_wins = int(sum(prefs < 0.5)) - num_losses = int(sum(prefs > 0.5)) - num_ties = int(sum(prefs == 0.5)) - num_missing = num_battles - (num_wins + num_losses + num_ties) - denom = num_wins + num_losses + num_ties - winrate = float((num_wins + 0.5 * num_ties) / denom) if denom else 0.0 - return { - "num_battles": num_battles, - "num_wins": num_wins, - "num_losses": num_losses, - "num_ties": num_ties, - "num_missing": num_missing, - "winrate": winrate, - } - - -def _compute_grouped_stats( - preferences: pd.Series, - metadata: list[dict[str, object]], - group_by: str, -) -> dict[object, dict[str, float | int]]: - grouped: dict[object, list[float]] = {} - for meta, pref in zip(metadata, preferences): - key = meta.get(group_by) - if key is None: - continue - grouped.setdefault(key, []).append(pref) - return { - key: compute_preference_stats(pd.Series(vals)) - for key, vals in grouped.items() - } - - -def _parse_preferences_from_annotations( - annotations: list, - score_parser: PairScore, -) -> pd.Series: - return pd.Series( - [ - score_parser.parse_model_raw(annotation.judge_completion) - for annotation in annotations - ] - ) - - -def _judge_turn( - *, - judge_chat_model, - instructions: list[str], - completions_A: list[str], - completions_B: list[str], - metadata: list[dict[str, object]], - score_parser: PairScore, - provide_explanation: bool, - swap_mode: str, - truncate_input_chars: int | None, - use_tqdm: bool, - system_prompt: str | None = None, - user_prompt_template: str | None = None, -) -> tuple[ - list, - list, - list[dict[str, object]], - list[dict[str, object]], - pd.Series, - list[dict[str, object]], -]: - if not instructions: - return [], [], [], [], pd.Series(dtype=float), [] - - annotations = annotate_battles( - judge_chat_model=judge_chat_model, - instructions=instructions, - completions_A=completions_A, - completions_B=completions_B, - provide_explanation=provide_explanation, - system_prompt=system_prompt, - 
user_prompt_template=user_prompt_template, - truncate_input_chars=truncate_input_chars, - use_tqdm=use_tqdm, - ) - preference_parts = [_parse_preferences_from_annotations(annotations, score_parser)] - - annotations_reversed: list = [] - metadata_for_reversed_annotations: list[dict[str, object]] = [] - combined_metadata = list(metadata) - - if swap_mode == "both": - print("Correction for judge bias towards a certain model position is set.") - print("Evaluating completions with models reversed.") - annotations_reversed = annotate_battles( - judge_chat_model=judge_chat_model, - instructions=instructions, - completions_A=completions_B, - completions_B=completions_A, - provide_explanation=provide_explanation, - system_prompt=system_prompt, - user_prompt_template=user_prompt_template, - truncate_input_chars=truncate_input_chars, - use_tqdm=use_tqdm, - ) - prefs_reversed = _parse_preferences_from_annotations( - annotations_reversed, score_parser - ) - preference_parts.append(1 - prefs_reversed) - metadata_for_reversed_annotations = list(metadata) - combined_metadata.extend(metadata) - - preferences = pd.concat(preference_parts).reset_index(drop=True) - return ( - annotations, - annotations_reversed, - list(metadata), - metadata_for_reversed_annotations, - preferences, - combined_metadata, - ) - - def main(args: CliArgs): """ 1) take as input: diff --git a/openjury/mt_bench/pipeline.py b/openjury/mt_bench/pipeline.py index 6c949dc..fd9df82 100644 --- a/openjury/mt_bench/pipeline.py +++ b/openjury/mt_bench/pipeline.py @@ -16,6 +16,12 @@ import pandas as pd from openjury.evaluate import PairScore, load_judge_system_and_user_prompt +from openjury.eval_runtime import ( + _compute_grouped_stats, + _judge_turn, + compute_preference_stats, + print_results, +) from openjury.generate import generate_multiturn from openjury.instruction_dataset import load_instructions from openjury.mt_bench.common import iter_mt_bench_pairwise_rows @@ -245,12 +251,6 @@ def _run_mt_bench_fastchat( 
completions_b: pd.DataFrame, judge_chat_model, ) -> pd.Series: - from openjury.generate_and_evaluate import ( - _compute_grouped_stats, - compute_preference_stats, - print_results, - ) - prefs, annotations, combined_metadata, num_inconsistent = ( judge_mt_bench_pairwise_fastchat( judge_chat_model=judge_chat_model, @@ -300,13 +300,6 @@ def _run_mt_bench_openjury( completions_b: pd.DataFrame, judge_chat_model, ) -> pd.Series: - from openjury.generate_and_evaluate import ( - _compute_grouped_stats, - _judge_turn, - compute_preference_stats, - print_results, - ) - turn_1_inputs, turn_2_inputs = format_mt_bench_for_evaluation( questions=questions_df, completions_A=completions_a, From 0094eea47dc1fcc0fddbfaa21a1520693a385db9 Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Wed, 18 Mar 2026 01:13:33 +0100 Subject: [PATCH 34/35] move cli args and parsing to separate util to remove dependencies on entrypoint --- openjury/config.py | 212 ++++++++++++++++++++++++++++++ openjury/generate_and_evaluate.py | 211 +---------------------------- openjury/mt_bench/pipeline.py | 2 +- 3 files changed, 215 insertions(+), 210 deletions(-) create mode 100644 openjury/config.py diff --git a/openjury/config.py b/openjury/config.py new file mode 100644 index 0000000..80802eb --- /dev/null +++ b/openjury/config.py @@ -0,0 +1,212 @@ +"""CLI argument configuration for generation and evaluation entrypoints.""" + +import argparse +import json +from dataclasses import dataclass, field + + +@dataclass +class CliArgs: + dataset: str + model_A: str + model_B: str + judge_model: str + + n_instructions: int | None = None + provide_explanation: bool = False + swap_mode: str = "fixed" + ignore_cache: bool = False + use_tqdm: bool = False + truncate_all_input_chars: int = 8192 + max_out_tokens_models: int = 32768 + max_out_tokens_judge: int = 32768 + max_model_len: int | None = None + chat_template: str | None = None + mt_bench_turns: str = "both" + mt_bench_compatibility: str = "openjury" + 
result_folder: str = "results" + engine_kwargs: dict = field(default_factory=dict) + + def __post_init__(self): + supported_modes = ["fixed", "both"] + assert ( + self.swap_mode in supported_modes + ), f"Only {supported_modes} modes are supported but got {self.swap_mode}." + supported_mt_bench_modes = ["openjury", "fastchat"] + assert ( + self.mt_bench_compatibility in supported_mt_bench_modes + ), f"Only {supported_mt_bench_modes} are supported but got {self.mt_bench_compatibility}." + + @classmethod + def parse_args(cls): + parser = argparse.ArgumentParser( + prog="Generate completion and evaluate with a judge", + ) + parser.add_argument( + "--dataset", + help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction " + "tuning cases or `french-contexts`, `spanish-contexts` for base models.", + ) + parser.add_argument( + "--model_A", + required=True, + help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", + ) + parser.add_argument( + "--model_B", + required=True, + help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", + ) + parser.add_argument( + "--judge_model", + required=True, + help="Name of the LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, " + "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc", + ) + parser.add_argument( + "--n_instructions", + type=int, + required=False, + ) + parser.add_argument( + "--provide_explanation", + action="store_true", + help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve" + "the accuracy of the judge but enables some result interpretation.", + ) + parser.add_argument( + "--swap_mode", + type=str, + choices=["fixed", "both"], + default="fixed", + help="Model comparison order mode. 'fixed': always use model order A-B. 
'both': correct for model order " + "bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account " + "for judge position bias. Default is 'fixed'.", + ) + parser.add_argument( + "--ignore_cache", + action="store_true", + help="If specified, ignore cache of previous completions.", + ) + parser.add_argument( + "--use_tqdm", + action="store_true", + help="If specified, use tqdm, does not work with all model providers, vLLM in particular.", + ) + parser.add_argument( + "--result_folder", + type=str, + required=False, + default="results", + help="The folder to save the results. Defaults to `results`. Evaluation results will be saved in" + " `[result_folder]/[evaluation_name]`.", + ) + parser.add_argument( + "--truncate_all_input_chars", + type=int, + required=False, + default=8192, + help="Character-level truncation applied before tokenization: truncates each instruction " + "before model A/B generation and truncates each completion before judge evaluation.", + ) + parser.add_argument( + "--max_out_tokens_models", + type=int, + required=False, + default=32768, + help=( + "Generation token budget for each model A/B response. For VLLM, keep this <= " + "--max_model_len (if provided)." + ), + ) + parser.add_argument( + "--max_out_tokens_judge", + type=int, + required=False, + default=32768, + help=( + "Generation token budget for the judge response (reasoning + scores). For " + "VLLM, keep this <= --max_model_len (if provided)." + ), + ) + parser.add_argument( + "--max_model_len", + type=int, + required=False, + default=None, + help=( + "Optional total context window for VLLM models (prompt + generation). This is " + "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap " + "generated tokens. This is useful on smaller GPUs to avoid OOM." 
+ ), + ) + parser.add_argument( + "--chat_template", + type=str, + required=False, + default=None, + help="Jinja2 chat template string to use instead of the model's tokenizer template. " + "If not provided, ChatML is used as fallback for models without a chat template.", + ) + parser.add_argument( + "--mt_bench_turns", + type=str, + choices=["both", "single", "multi"], + default="both", + help="Which MT-Bench turns to evaluate. 'single': only turn 1, " + "'multi': only turn 2 (with full conversation context), " + "'both' (default): evaluate both turns.", + ) + parser.add_argument( + "--mt_bench_compatibility", + type=str, + choices=["openjury", "fastchat"], + default="openjury", + help=( + "MT-Bench evaluation/generation mode. " + "'openjury' (default): OpenJury score_A/score_B prompt + softmax preference. " + "'fastchat': use FastChat/MT-Bench pairwise prompts with [[A]]/[[B]]/[[C]] verdict parsing, " + "conservative position-bias handling, judge temperature=0, and MT-Bench category temperatures." + ), + ) + parser.add_argument( + "--engine_kwargs", + type=str, + required=False, + default="{}", + help=( + "JSON dict of engine-specific kwargs forwarded to the underlying engine. " + "Example for vLLM: '{\"tensor_parallel_size\": 2, \"gpu_memory_utilization\": 0.9}'." 
+ ), + ) + args = parser.parse_args() + + try: + engine_kwargs = ( + json.loads(args.engine_kwargs) if args.engine_kwargs else {} + ) + if not isinstance(engine_kwargs, dict): + raise ValueError("engine_kwargs must be a JSON object") + except Exception as e: + raise SystemExit(f"Failed to parse --engine_kwargs: {e}") + + return cls( + dataset=args.dataset, + model_A=args.model_A, + model_B=args.model_B, + judge_model=args.judge_model, + n_instructions=args.n_instructions, + provide_explanation=args.provide_explanation, + swap_mode=args.swap_mode, + ignore_cache=args.ignore_cache, + use_tqdm=args.use_tqdm, + truncate_all_input_chars=args.truncate_all_input_chars, + max_out_tokens_models=args.max_out_tokens_models, + max_out_tokens_judge=args.max_out_tokens_judge, + max_model_len=args.max_model_len, + chat_template=args.chat_template, + mt_bench_turns=args.mt_bench_turns, + mt_bench_compatibility=args.mt_bench_compatibility, + result_folder=args.result_folder, + engine_kwargs=engine_kwargs, + ) diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py index 99986ee..66d15af 100644 --- a/openjury/generate_and_evaluate.py +++ b/openjury/generate_and_evaluate.py @@ -3,16 +3,16 @@ and then evaluates them using a judge model. 
""" -import argparse import json import os -from dataclasses import dataclass, asdict, field +from dataclasses import asdict from datetime import datetime from functools import partial from pathlib import Path import pandas as pd +from openjury.config import CliArgs from openjury.evaluate import ( PairScore, ) @@ -69,213 +69,6 @@ def try_load_dataset_completions( ) -@dataclass -class CliArgs: - dataset: str - model_A: str - model_B: str - judge_model: str - - n_instructions: int | None = None - provide_explanation: bool = False - swap_mode: str = "fixed" - ignore_cache: bool = False - use_tqdm: bool = False - truncate_all_input_chars: int = 8192 - max_out_tokens_models: int = 32768 - max_out_tokens_judge: int = 32768 - max_model_len: int | None = None - chat_template: str | None = None - mt_bench_turns: str = "both" - mt_bench_compatibility: str = "openjury" - result_folder: str = "results" - engine_kwargs: dict = field(default_factory=dict) - - def __post_init__(self): - supported_modes = ["fixed", "both"] - assert ( - self.swap_mode in supported_modes - ), f"Only {supported_modes} modes are supported but got {self.swap_mode}." - supported_mt_bench_modes = ["openjury", "fastchat"] - assert ( - self.mt_bench_compatibility in supported_mt_bench_modes - ), f"Only {supported_mt_bench_modes} are supported but got {self.mt_bench_compatibility}." - - @classmethod - def parse_args(cls): - parser = argparse.ArgumentParser( - prog="Generate completion and evaluate with a judge", - ) - parser.add_argument( - "--dataset", - help="The dataset to use. 
For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction " - "tuning cases or `french-contexts`, `spanish-contexts` for base models.", - ) - parser.add_argument( - "--model_A", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--model_B", - required=True, - help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`", - ) - parser.add_argument( - "--judge_model", - required=True, - help="Name of the LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, " - "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc", - ) - parser.add_argument( - "--n_instructions", - type=int, - required=False, - ) - parser.add_argument( - "--provide_explanation", - action="store_true", - help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve" - "the accuracy of the judge but enables some result interpretation.", - ) - parser.add_argument( - "--swap_mode", - type=str, - choices=["fixed", "both"], - default="fixed", - help="Model comparison order mode. 'fixed': always use model order A-B. 'both': correct for model order " - "bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account " - "for judge position bias. Default is 'fixed'.", - ) - parser.add_argument( - "--ignore_cache", - action="store_true", - help="If specified, ignore cache of previous completions.", - ) - parser.add_argument( - "--use_tqdm", - action="store_true", - help="If specified, use tqdm, does not work with all model providers, vLLM in particular.", - ) - parser.add_argument( - "--result_folder", - type=str, - required=False, - default="results", - help="The folder to save the results. Defaults to `results`. 
Evaluation results will be saved in" - " `[result_folder]/[evaluation_name]`.", - ) - parser.add_argument( - "--truncate_all_input_chars", - type=int, - required=False, - default=8192, - help="Character-level truncation applied before tokenization: truncates each instruction " - "before model A/B generation and truncates each completion before judge evaluation.", - ) - parser.add_argument( - "--max_out_tokens_models", - type=int, - required=False, - default=32768, - help=( - "Generation token budget for each model A/B response. For VLLM, keep this <= " - "--max_model_len (if provided)." - ), - ) - parser.add_argument( - "--max_out_tokens_judge", - type=int, - required=False, - default=32768, - help=( - "Generation token budget for the judge response (reasoning + scores). For " - "VLLM, keep this <= --max_model_len (if provided)." - ), - ) - parser.add_argument( - "--max_model_len", - type=int, - required=False, - default=None, - help=( - "Optional total context window for VLLM models (prompt + generation). This is " - "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap " - "generated tokens. This is useful on smaller GPUs to avoid OOM." - ), - ) - parser.add_argument( - "--chat_template", - type=str, - required=False, - default=None, - help="Jinja2 chat template string to use instead of the model's tokenizer template. " - "If not provided, ChatML is used as fallback for models without a chat template.", - ) - parser.add_argument( - "--mt_bench_turns", - type=str, - choices=["both", "single", "multi"], - default="both", - help="Which MT-Bench turns to evaluate. 'single': only turn 1, " - "'multi': only turn 2 (with full conversation context), " - "'both' (default): evaluate both turns.", - ) - parser.add_argument( - "--mt_bench_compatibility", - type=str, - choices=["openjury", "fastchat"], - default="openjury", - help=( - "MT-Bench evaluation/generation mode. 
" - "'openjury' (default): OpenJury score_A/score_B prompt + softmax preference. " - "'fastchat': use FastChat/MT-Bench pairwise prompts with [[A]]/[[B]]/[[C]] verdict parsing, " - "conservative position-bias handling, judge temperature=0, and MT-Bench category temperatures." - ), - ) - parser.add_argument( - "--engine_kwargs", - type=str, - required=False, - default="{}", - help=( - "JSON dict of engine-specific kwargs forwarded to the underlying engine. " - "Example for vLLM: '{\"tensor_parallel_size\": 2, \"gpu_memory_utilization\": 0.9}'." - ), - ) - args = parser.parse_args() - - try: - engine_kwargs = ( - json.loads(args.engine_kwargs) if args.engine_kwargs else {} - ) - if not isinstance(engine_kwargs, dict): - raise ValueError("engine_kwargs must be a JSON object") - except Exception as e: - raise SystemExit(f"Failed to parse --engine_kwargs: {e}") - - return cls( - dataset=args.dataset, - model_A=args.model_A, - model_B=args.model_B, - judge_model=args.judge_model, - n_instructions=args.n_instructions, - provide_explanation=args.provide_explanation, - swap_mode=args.swap_mode, - ignore_cache=args.ignore_cache, - use_tqdm=args.use_tqdm, - truncate_all_input_chars=args.truncate_all_input_chars, - max_out_tokens_models=args.max_out_tokens_models, - max_out_tokens_judge=args.max_out_tokens_judge, - max_model_len=args.max_model_len, - chat_template=args.chat_template, - mt_bench_turns=args.mt_bench_turns, - mt_bench_compatibility=args.mt_bench_compatibility, - result_folder=args.result_folder, - engine_kwargs=engine_kwargs, - ) - - def load_contexts(dataset: str) -> pd.Series: path = data_root / "contexts" / dataset return pd.read_csv(path).loc[:, "instruction"] diff --git a/openjury/mt_bench/pipeline.py b/openjury/mt_bench/pipeline.py index fd9df82..39f9eb4 100644 --- a/openjury/mt_bench/pipeline.py +++ b/openjury/mt_bench/pipeline.py @@ -32,7 +32,7 @@ from openjury.utils import cache_function_dataframe, make_model if TYPE_CHECKING: - from 
openjury.generate_and_evaluate import CliArgs + from openjury.config import CliArgs NEED_REF_CATS = {"math", "reasoning", "coding"} From 61aaa128b1ee9a736f7e388596ac97a129de372e Mon Sep 17 00:00:00 2001 From: ErlisLushtaku Date: Wed, 18 Mar 2026 01:21:25 +0100 Subject: [PATCH 35/35] remove dependency from mt-bench-101 pipeline to entrypoint --- openjury/mt_bench_101/pipeline.py | 9 +++------ tests/test_generate_and_evaluate.py | 9 +++++++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/openjury/mt_bench_101/pipeline.py b/openjury/mt_bench_101/pipeline.py index 2bacbef..28aa2ee 100644 --- a/openjury/mt_bench_101/pipeline.py +++ b/openjury/mt_bench_101/pipeline.py @@ -11,6 +11,7 @@ import pandas as pd +from openjury.instruction_dataset import load_instructions from openjury.mt_bench_101.evaluate import ( derive_mt_bench_101_pairwise_preferences, judge_mt_bench_101_single, @@ -21,7 +22,7 @@ from openjury.utils import cache_function_dataframe, make_model if TYPE_CHECKING: - from openjury.generate_and_evaluate import CliArgs + from openjury.config import CliArgs def _generate_mt_bench_101_completions( @@ -93,11 +94,7 @@ def run_mt_bench_101(args: CliArgs, ignore_cache: bool) -> pd.Series: "--swap_mode has no effect for mt-bench-101 since it does single answer grading before comparing the models" ) - from openjury import generate_and_evaluate as gae - - eval_items_df = gae.load_instructions( - "mt-bench-101", n_instructions=args.n_instructions - ) + eval_items_df = load_instructions("mt-bench-101", n_instructions=args.n_instructions) print( "Generating completions from golden context for MT-Bench-101 with " f"{args.model_A} and {args.model_B}." 
diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py index 0ad57eb..a89671e 100644 --- a/tests/test_generate_and_evaluate.py +++ b/tests/test_generate_and_evaluate.py @@ -3,6 +3,7 @@ import openjury.generate_and_evaluate as generate_and_evaluate import openjury.mt_bench.pipeline as mt_bench_pipeline +import openjury.mt_bench_101.pipeline as mt_bench_101_pipeline from openjury.generate_and_evaluate import ( main as main_generate_and_eval, CliArgs, @@ -85,6 +86,11 @@ def _load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Da "load_instructions", _load_instructions, ) + monkeypatch.setattr( + mt_bench_101_pipeline, + "load_instructions", + _load_instructions, + ) monkeypatch.setattr( generate_and_evaluate, "load_contexts", @@ -106,6 +112,9 @@ def _run_without_cache(fun, **_kwargs): monkeypatch.setattr( mt_bench_pipeline, "cache_function_dataframe", _run_without_cache ) + monkeypatch.setattr( + mt_bench_101_pipeline, "cache_function_dataframe", _run_without_cache + ) @pytest.mark.parametrize(