diff --git a/README.md b/README.md
index 24087a0..fd4e51a 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ Compared to other libraries, here is a breakdown of features:
| **Arena-Hard-Auto** | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ |
| **Lighteval** | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ |
| **Evalchemy** | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
-| **OpenJury** | 🔜 | ✅ | ✅ | ✅ | ✅ | ✅ |
+| **OpenJury** | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
The table has been done on Oct 2025, in case some libraries implemented missing features, please open an issue
or send a PR, we will be happy to update the information.
@@ -191,10 +191,29 @@ python openjury/generate_and_evaluate.py \
This override applies to all vLLM models in the run. For remote providers (OpenAI, Together, OpenRouter), the flag is ignored since they handle templates server-side.
+### MT-Bench (Multi-Turn Evaluation)
+
+MT-Bench evaluates multi-turn conversation ability using 80 two-turn questions across 8 categories
+(writing, roleplay, reasoning, math, coding, extraction, STEM, humanities).
+It uses category-dependent judge prompts and reference answers for math/reasoning/coding.
+Questions are automatically downloaded from the [LMSYS MT-Bench HuggingFace space](https://huggingface.co/spaces/lmsys/mt-bench).
+
+```bash
+uv run python openjury/generate_and_evaluate.py \
+ --dataset mt-bench \
+ --model_A VLLM/Qwen/Qwen2.5-7B-Instruct \
+ --model_B OpenRouter/openai/gpt-4o \
+ --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
+ --n_instructions 10
+```
+
+Results include per-category and per-turn win rate breakdowns. Use `--swap_mode both` to correct for judge position bias.
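+
+To evaluate only one of the two turns, pass `--mt_bench_turns single` (turn 1 only) or `--mt_bench_turns multi`
+(turn 2 only, judged with the full conversation context). To reproduce the original FastChat/MT-Bench judging
+protocol (`[[A]]`/`[[B]]`/`[[C]]` verdict parsing, conservative position-bias handling, judge temperature 0 and
+per-category generation temperatures), add `--mt_bench_compatibility fastchat`:
+
+```bash
+uv run python openjury/generate_and_evaluate.py \
+  --dataset mt-bench \
+  --model_A VLLM/Qwen/Qwen2.5-7B-Instruct \
+  --model_B OpenRouter/openai/gpt-4o \
+  --judge_model OpenRouter/deepseek/deepseek-chat-v3.1 \
+  --mt_bench_compatibility fastchat
+```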
+
## 📊 Supported Datasets
| Dataset | Description |
|-----------------------|------------------------------------------------------------------------------------------------|
+| `mt-bench` | 80 multi-turn (2-turn) questions across 8 categories ([LMSYS MT-Bench](https://arxiv.org/abs/2306.05685)) |
| `alpaca-eval` | General instruction-following benchmark |
| `arena-hard` | More challenging evaluation suite |
| `m-arena-hard` | Translated version of Arena-Hard in 23 languages |
diff --git a/openjury/config.py b/openjury/config.py
new file mode 100644
index 0000000..80802eb
--- /dev/null
+++ b/openjury/config.py
@@ -0,0 +1,212 @@
+"""CLI argument configuration for generation and evaluation entrypoints."""
+
+import argparse
+import json
+from dataclasses import dataclass, field
+
+
+@dataclass
+class CliArgs:
+ dataset: str
+ model_A: str
+ model_B: str
+ judge_model: str
+
+ n_instructions: int | None = None
+ provide_explanation: bool = False
+ swap_mode: str = "fixed"
+ ignore_cache: bool = False
+ use_tqdm: bool = False
+ truncate_all_input_chars: int = 8192
+ max_out_tokens_models: int = 32768
+ max_out_tokens_judge: int = 32768
+ max_model_len: int | None = None
+ chat_template: str | None = None
+ mt_bench_turns: str = "both"
+ mt_bench_compatibility: str = "openjury"
+ result_folder: str = "results"
+ engine_kwargs: dict = field(default_factory=dict)
+
+ def __post_init__(self):
+ supported_modes = ["fixed", "both"]
+ assert (
+ self.swap_mode in supported_modes
+ ), f"Only {supported_modes} modes are supported but got {self.swap_mode}."
+ supported_mt_bench_modes = ["openjury", "fastchat"]
+ assert (
+ self.mt_bench_compatibility in supported_mt_bench_modes
+ ), f"Only {supported_mt_bench_modes} are supported but got {self.mt_bench_compatibility}."
+
+ @classmethod
+ def parse_args(cls):
+ parser = argparse.ArgumentParser(
+ prog="Generate completion and evaluate with a judge",
+ )
+ parser.add_argument(
+ "--dataset",
+ help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction "
+ "tuning cases or `french-contexts`, `spanish-contexts` for base models.",
+ )
+ parser.add_argument(
+ "--model_A",
+ required=True,
+ help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`",
+ )
+ parser.add_argument(
+ "--model_B",
+ required=True,
+ help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`",
+ )
+ parser.add_argument(
+ "--judge_model",
+ required=True,
+ help="Name of the LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, "
+ "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc",
+ )
+ parser.add_argument(
+ "--n_instructions",
+ type=int,
+ required=False,
+ )
+ parser.add_argument(
+ "--provide_explanation",
+ action="store_true",
+ help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve"
+ "the accuracy of the judge but enables some result interpretation.",
+ )
+ parser.add_argument(
+ "--swap_mode",
+ type=str,
+ choices=["fixed", "both"],
+ default="fixed",
+ help="Model comparison order mode. 'fixed': always use model order A-B. 'both': correct for model order "
+ "bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account "
+ "for judge position bias. Default is 'fixed'.",
+ )
+ parser.add_argument(
+ "--ignore_cache",
+ action="store_true",
+ help="If specified, ignore cache of previous completions.",
+ )
+ parser.add_argument(
+ "--use_tqdm",
+ action="store_true",
+ help="If specified, use tqdm, does not work with all model providers, vLLM in particular.",
+ )
+ parser.add_argument(
+ "--result_folder",
+ type=str,
+ required=False,
+ default="results",
+ help="The folder to save the results. Defaults to `results`. Evaluation results will be saved in"
+ " `[result_folder]/[evaluation_name]`.",
+ )
+ parser.add_argument(
+ "--truncate_all_input_chars",
+ type=int,
+ required=False,
+ default=8192,
+ help="Character-level truncation applied before tokenization: truncates each instruction "
+ "before model A/B generation and truncates each completion before judge evaluation.",
+ )
+ parser.add_argument(
+ "--max_out_tokens_models",
+ type=int,
+ required=False,
+ default=32768,
+ help=(
+ "Generation token budget for each model A/B response. For VLLM, keep this <= "
+ "--max_model_len (if provided)."
+ ),
+ )
+ parser.add_argument(
+ "--max_out_tokens_judge",
+ type=int,
+ required=False,
+ default=32768,
+ help=(
+ "Generation token budget for the judge response (reasoning + scores). For "
+ "VLLM, keep this <= --max_model_len (if provided)."
+ ),
+ )
+ parser.add_argument(
+ "--max_model_len",
+ type=int,
+ required=False,
+ default=None,
+ help=(
+ "Optional total context window for VLLM models (prompt + generation). This is "
+ "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap "
+ "generated tokens. This is useful on smaller GPUs to avoid OOM."
+ ),
+ )
+ parser.add_argument(
+ "--chat_template",
+ type=str,
+ required=False,
+ default=None,
+ help="Jinja2 chat template string to use instead of the model's tokenizer template. "
+ "If not provided, ChatML is used as fallback for models without a chat template.",
+ )
+ parser.add_argument(
+ "--mt_bench_turns",
+ type=str,
+ choices=["both", "single", "multi"],
+ default="both",
+ help="Which MT-Bench turns to evaluate. 'single': only turn 1, "
+ "'multi': only turn 2 (with full conversation context), "
+ "'both' (default): evaluate both turns.",
+ )
+ parser.add_argument(
+ "--mt_bench_compatibility",
+ type=str,
+ choices=["openjury", "fastchat"],
+ default="openjury",
+ help=(
+ "MT-Bench evaluation/generation mode. "
+ "'openjury' (default): OpenJury score_A/score_B prompt + softmax preference. "
+ "'fastchat': use FastChat/MT-Bench pairwise prompts with [[A]]/[[B]]/[[C]] verdict parsing, "
+ "conservative position-bias handling, judge temperature=0, and MT-Bench category temperatures."
+ ),
+ )
+ parser.add_argument(
+ "--engine_kwargs",
+ type=str,
+ required=False,
+ default="{}",
+ help=(
+ "JSON dict of engine-specific kwargs forwarded to the underlying engine. "
+ "Example for vLLM: '{\"tensor_parallel_size\": 2, \"gpu_memory_utilization\": 0.9}'."
+ ),
+ )
+ args = parser.parse_args()
+
+ try:
+ engine_kwargs = (
+ json.loads(args.engine_kwargs) if args.engine_kwargs else {}
+ )
+ if not isinstance(engine_kwargs, dict):
+ raise ValueError("engine_kwargs must be a JSON object")
+ except Exception as e:
+ raise SystemExit(f"Failed to parse --engine_kwargs: {e}")
+
+ return cls(
+ dataset=args.dataset,
+ model_A=args.model_A,
+ model_B=args.model_B,
+ judge_model=args.judge_model,
+ n_instructions=args.n_instructions,
+ provide_explanation=args.provide_explanation,
+ swap_mode=args.swap_mode,
+ ignore_cache=args.ignore_cache,
+ use_tqdm=args.use_tqdm,
+ truncate_all_input_chars=args.truncate_all_input_chars,
+ max_out_tokens_models=args.max_out_tokens_models,
+ max_out_tokens_judge=args.max_out_tokens_judge,
+ max_model_len=args.max_model_len,
+ chat_template=args.chat_template,
+ mt_bench_turns=args.mt_bench_turns,
+ mt_bench_compatibility=args.mt_bench_compatibility,
+ result_folder=args.result_folder,
+ engine_kwargs=engine_kwargs,
+ )
diff --git a/openjury/eval_runtime.py b/openjury/eval_runtime.py
new file mode 100644
index 0000000..dd367e5
--- /dev/null
+++ b/openjury/eval_runtime.py
@@ -0,0 +1,171 @@
+"""Shared evaluation runtime helpers used by entrypoints and benchmark pipelines."""
+
+from __future__ import annotations
+
+import pandas as pd
+
+from openjury.evaluate import annotate_battles, PairScore
+
+
+def print_results(results):
+ """Print battle results in a readable format."""
+ print("\n" + "=" * 60)
+ print("🏆 MODEL BATTLE RESULTS 🏆".center(60))
+ print(f"📊 Dataset: {results['dataset']}")
+ print(
+ f"🤖 Competitors: Model A: {results['model_A']} vs Model B: {results['model_B']}"
+ )
+ print(f"⚖️ Judge: {results['judge_model']}")
+ print("📈 Results Summary:")
+ print(f" Total Battles: {results['num_battles']}")
+ print(f" Win Rate (A): {results['winrate']:.1%}")
+ print(f" ✅ Wins: {results['num_wins']}")
+ print(f" ❌ Losses: {results['num_losses']}")
+ print(f" 🤝 Ties: {results['num_ties']}")
+ if results.get("num_missing", 0) > 0:
+ print(f" ❓ Missing: {results['num_missing']}")
+
+ per_category = results.get("per_category")
+ if per_category:
+ print("\nPer-Category Breakdown:")
+ print(
+ f" {'Category':<14} | {'Win Rate(A)':>11} | {'Wins':>4} | {'Losses':>6} | {'Ties':>4}"
+ )
+ print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}")
+ for cat, stats in sorted(per_category.items()):
+ print(
+ f" {cat:<14} | {stats['winrate']:>11.1%} | "
+ f"{stats['num_wins']:>4} | {stats['num_losses']:>6} | {stats['num_ties']:>4}"
+ )
+
+ per_turn = results.get("per_turn")
+ if per_turn:
+ print("\nPer-Turn Breakdown:")
+ for turn, stats in sorted(per_turn.items()):
+ print(
+ f" Turn {turn} Win Rate(A): {stats['winrate']:.1%} "
+ f"(W:{stats['num_wins']} L:{stats['num_losses']} T:{stats['num_ties']})"
+ )
+ print("=" * 60 + "\n")
+
+
+def compute_preference_stats(prefs: pd.Series) -> dict:
+ """Derive win/loss/tie counts and winrate from a Series of preferences."""
+ num_battles = len(prefs)
+ num_wins = int(sum(prefs < 0.5))
+ num_losses = int(sum(prefs > 0.5))
+ num_ties = int(sum(prefs == 0.5))
+ num_missing = num_battles - (num_wins + num_losses + num_ties)
+ denom = num_wins + num_losses + num_ties
+ winrate = float((num_wins + 0.5 * num_ties) / denom) if denom else 0.0
+ return {
+ "num_battles": num_battles,
+ "num_wins": num_wins,
+ "num_losses": num_losses,
+ "num_ties": num_ties,
+ "num_missing": num_missing,
+ "winrate": winrate,
+ }
+
+
+def _compute_grouped_stats(
+ preferences: pd.Series,
+ metadata: list[dict[str, object]],
+ group_by: str,
+) -> dict[object, dict[str, float | int]]:
+ grouped: dict[object, list[float]] = {}
+ for meta, pref in zip(metadata, preferences):
+ key = meta.get(group_by)
+ if key is None:
+ continue
+ grouped.setdefault(key, []).append(pref)
+ return {
+ key: compute_preference_stats(pd.Series(vals))
+ for key, vals in grouped.items()
+ }
+
+
+def _parse_preferences_from_annotations(
+ annotations: list,
+ score_parser: PairScore,
+) -> pd.Series:
+ return pd.Series(
+ [
+ score_parser.parse_model_raw(annotation.judge_completion)
+ for annotation in annotations
+ ]
+ )
+
+
+def _judge_turn(
+ *,
+ judge_chat_model,
+ instructions: list[str],
+ completions_A: list[str],
+ completions_B: list[str],
+ metadata: list[dict[str, object]],
+ score_parser: PairScore,
+ provide_explanation: bool,
+ swap_mode: str,
+ truncate_input_chars: int | None,
+ use_tqdm: bool,
+ system_prompt: str | None = None,
+ user_prompt_template: str | None = None,
+) -> tuple[
+ list,
+ list,
+ list[dict[str, object]],
+ list[dict[str, object]],
+ pd.Series,
+ list[dict[str, object]],
+]:
+ if not instructions:
+ return [], [], [], [], pd.Series(dtype=float), []
+
+ annotations = annotate_battles(
+ judge_chat_model=judge_chat_model,
+ instructions=instructions,
+ completions_A=completions_A,
+ completions_B=completions_B,
+ provide_explanation=provide_explanation,
+ system_prompt=system_prompt,
+ user_prompt_template=user_prompt_template,
+ truncate_input_chars=truncate_input_chars,
+ use_tqdm=use_tqdm,
+ )
+ preference_parts = [_parse_preferences_from_annotations(annotations, score_parser)]
+
+ annotations_reversed: list = []
+ metadata_for_reversed_annotations: list[dict[str, object]] = []
+ combined_metadata = list(metadata)
+
+ if swap_mode == "both":
+ print("Correction for judge bias towards a certain model position is set.")
+ print("Evaluating completions with models reversed.")
+ annotations_reversed = annotate_battles(
+ judge_chat_model=judge_chat_model,
+ instructions=instructions,
+ completions_A=completions_B,
+ completions_B=completions_A,
+ provide_explanation=provide_explanation,
+ system_prompt=system_prompt,
+ user_prompt_template=user_prompt_template,
+ truncate_input_chars=truncate_input_chars,
+ use_tqdm=use_tqdm,
+ )
+ prefs_reversed = _parse_preferences_from_annotations(
+ annotations_reversed, score_parser
+ )
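+        # The reversed run scores the swapped ordering (B shown as "A"), so flip
+        # each preference (p -> 1 - p) to express it on the original A/B scale.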
+ preference_parts.append(1 - prefs_reversed)
+ metadata_for_reversed_annotations = list(metadata)
+ combined_metadata.extend(metadata)
+
+ preferences = pd.concat(preference_parts).reset_index(drop=True)
+ return (
+ annotations,
+ annotations_reversed,
+ list(metadata),
+ metadata_for_reversed_annotations,
+ preferences,
+ combined_metadata,
+ )
diff --git a/openjury/evaluate.py b/openjury/evaluate.py
index 342da57..b5845ba 100644
--- a/openjury/evaluate.py
+++ b/openjury/evaluate.py
@@ -17,6 +17,7 @@
data_root,
download_hf,
do_inference,
+ truncate,
)
@@ -51,18 +52,29 @@ def get_regexp_match(self, s: str, regex: str, group_index: int = 1):
return float(m.group(group_index).strip(" "))
+_COMPLETION_LABEL_SINGLE = "Answer"
+_COMPLETION_LABEL_MULTI_TURN = "Conversation with User"
+_EXPLANATION_SUFFIX = ", first starts with an explanation of your judgement"
+_SCORE_FENCE = "\n```"
+
+
def load_judge_system_and_user_prompt(
provide_explanation: bool = True,
+ multi_turn: bool = False,
) -> tuple[str, str]:
- # Prepare judge
- with open(Path(__file__).parent / "prompts" / "system-prompt.txt", "r") as f:
- system_prompt = str(f.read())
+ prompts_dir = Path(__file__).parent / "prompts"
+
+ system_prompt = (prompts_dir / "system-prompt.txt").read_text()
- prompt_filename = (
- "prompt-with-explanation.txt" if provide_explanation else "prompt.txt"
+ user_prompt_template = (prompts_dir / "prompt.txt").read_text()
+ user_prompt_template = user_prompt_template.replace(
+ "{completion_label}",
+ _COMPLETION_LABEL_MULTI_TURN if multi_turn else _COMPLETION_LABEL_SINGLE,
+ )
+ user_prompt_template = user_prompt_template.replace(
+ "{explanation_suffix}",
+ _EXPLANATION_SUFFIX if provide_explanation else _SCORE_FENCE,
)
- with open(Path(__file__).parent / "prompts" / prompt_filename, "r") as f:
- user_prompt_template = str(f.read())
return system_prompt, user_prompt_template
@@ -287,14 +299,6 @@ def annotate_battles(
[("system", system_prompt), ("user", user_prompt_template)]
)
- def truncate(s: str, max_len: int | None = None):
- if not isinstance(s, str):
- return ""
- if max_len is not None:
- return s[:max_len]
- else:
- return s
-
inputs = prompt_template.batch(
[
{
diff --git a/openjury/generate.py b/openjury/generate.py
index 11c6508..64eb789 100644
--- a/openjury/generate.py
+++ b/openjury/generate.py
@@ -1,17 +1,59 @@
import pandas as pd
from langchain.prompts import ChatPromptTemplate
+from typing import Any
from openjury.utils import (
do_inference,
make_model,
+ truncate,
)
-def truncate(s: str, max_len: int | None = None):
- if max_len is not None:
- return s[:max_len]
- else:
- return s
+def _set_temperature_on_model(chat_model, temperature: float) -> None:
+ if hasattr(chat_model, "set_temperature"):
+ chat_model.set_temperature(temperature)
+ return
+ if hasattr(chat_model, "temperature"):
+ setattr(chat_model, "temperature", temperature)
+
+
+def _infer_grouped_by_temperature(
+ *,
+ model_spec: str,
+ provider: str,
+ max_tokens: int | None,
+ model_kwargs: dict[str, Any],
+ base_model,
+ inputs: list,
+ temperatures: list[float],
+ use_tqdm: bool,
+) -> list[str]:
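+    """Run inference in groups of inputs that share the same sampling temperature.
+
+    Local engines (VLLM, LlamaCpp) reuse `base_model` and have their temperature
+    updated in place between groups so the model is only loaded once; remote
+    providers get a fresh client per temperature via `make_model`.
+    """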
+ outputs: list[str] = [""] * len(inputs)
+ groups: dict[float, list[int]] = {}
+ for idx, temp in enumerate(temperatures):
+ groups.setdefault(float(temp), []).append(idx)
+
+ for temp in sorted(groups.keys()):
+ idxs = groups[temp]
+ group_inputs = [inputs[i] for i in idxs]
+
+ if provider in {"VLLM", "LlamaCpp"}:
+ _set_temperature_on_model(base_model, temp)
+ group_model = base_model
+ else:
+ group_model = make_model(
+ model_spec, max_tokens=max_tokens, temperature=temp, **model_kwargs
+ )
+
+ group_outs = do_inference(
+ chat_model=group_model,
+ inputs=group_inputs,
+ use_tqdm=use_tqdm,
+ )
+ for i, out in zip(idxs, group_outs):
+ outputs[i] = out
+
+ return outputs
def generate_instructions(
@@ -57,6 +99,136 @@ def generate_instructions(
return df_outputs
+def generate_multiturn(
+ questions: pd.DataFrame,
+ model: str,
+ truncate_input_chars: int | None = 8192,
+ max_tokens: int | None = 8192,
+ use_tqdm: bool = True,
+ temperature_config: dict[str, float] | None = None,
+ **model_kwargs,
+) -> pd.DataFrame:
+ """Generate two-turn completions for MT-Bench style questions.
+
+ Generates turn 1 answers first, then uses them as conversation context
+ to generate turn 2 answers.
+
+ Args:
+ questions: DataFrame with columns turn_1, turn_2, and index instruction_index.
+ model: Model specification string (e.g. "VLLM/model-name").
+ temperature_config: Optional category -> temperature mapping. When set,
+            generation runs in temperature-homogeneous groups to match
+ MT-Bench/FastChat category defaults.
+ **model_kwargs: Provider-specific options forwarded to make_model
+ (e.g. max_model_len, chat_template for VLLM).
+ Returns:
+ DataFrame with columns: instruction_index, completion_turn_1, completion_turn_2
+ """
+ provider = model.split("/")[0]
+ use_category_temperatures = temperature_config is not None
+ local_provider = provider in {"VLLM", "LlamaCpp"}
+
+    if use_category_temperatures and local_provider:
+        # The temperature is reset per category group right before each batch,
+        # so the initial value here is only a placeholder.
+        chat_model = make_model(
+            model, max_tokens=max_tokens, temperature=0.0, **model_kwargs
+        )
+    else:
+        chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs)
+
+ system_prompt = "You are a helpful assistant."
+ idxs = questions.index.tolist()
+ temperatures: list[float] = []
+ if use_category_temperatures:
+ temperatures = [
+ temperature_config.get(str(questions.loc[idx].get("category") or ""), 0.7)
+ for idx in idxs
+ ]
+
+ turn1_template = ChatPromptTemplate.from_messages(
+ [("system", system_prompt), ("user", "{user_prompt}")]
+ )
+
+ turn1_inputs = turn1_template.batch(
+ [
+ {"user_prompt": truncate(row["turn_1"], max_len=truncate_input_chars)}
+ for _, row in questions.iterrows()
+ ]
+ )
+
+ print(f"Generating turn 1 completions ({len(turn1_inputs)} questions).")
+ if use_category_temperatures:
+ completions_turn_1 = _infer_grouped_by_temperature(
+ model_spec=model,
+ provider=provider,
+ max_tokens=max_tokens,
+ model_kwargs=model_kwargs,
+ base_model=chat_model,
+ inputs=turn1_inputs,
+ temperatures=temperatures,
+ use_tqdm=use_tqdm,
+ )
+ else:
+ completions_turn_1 = do_inference(
+ chat_model=chat_model,
+ inputs=turn1_inputs,
+ use_tqdm=use_tqdm,
+ )
+
+ turn2_inputs = []
+ for (_, row), t1_answer in zip(questions.iterrows(), completions_turn_1):
+ if row["turn_2"] is None:
+ turn2_inputs.append(
+ turn1_template.invoke(
+ {"user_prompt": "No follow-up question."}
+ )
+ )
+ else:
+ multi_turn_template = ChatPromptTemplate.from_messages(
+ [
+ ("system", system_prompt),
+ ("user", "{turn_1}"),
+ ("assistant", "{turn_1_answer}"),
+ ("user", "{turn_2}"),
+ ]
+ )
+ turn2_inputs.append(
+ multi_turn_template.invoke(
+ {
+ "turn_1": truncate(row["turn_1"], max_len=truncate_input_chars),
+ "turn_1_answer": truncate(str(t1_answer), max_len=truncate_input_chars),
+ "turn_2": truncate(row["turn_2"], max_len=truncate_input_chars),
+ }
+ )
+ )
+
+ print(f"Generating turn 2 completions ({len(turn2_inputs)} questions).")
+ if use_category_temperatures:
+ completions_turn_2 = _infer_grouped_by_temperature(
+ model_spec=model,
+ provider=provider,
+ max_tokens=max_tokens,
+ model_kwargs=model_kwargs,
+ base_model=chat_model,
+ inputs=turn2_inputs,
+ temperatures=temperatures,
+ use_tqdm=use_tqdm,
+ )
+ else:
+ completions_turn_2 = do_inference(
+ chat_model=chat_model,
+ inputs=turn2_inputs,
+ use_tqdm=use_tqdm,
+ )
+
+ df_outputs = pd.DataFrame(
+ data={
+ "instruction_index": idxs,
+ "completion_turn_1": completions_turn_1,
+ "completion_turn_2": completions_turn_2,
+ },
+ )
+ return df_outputs
+
+
def generate_base(
instructions: pd.Series,
model: str,
diff --git a/openjury/generate_and_evaluate.py b/openjury/generate_and_evaluate.py
index 83cec69..b2fe1bf 100644
--- a/openjury/generate_and_evaluate.py
+++ b/openjury/generate_and_evaluate.py
@@ -3,26 +3,39 @@
and then evaluates them using a judge model.
"""
-import argparse
import json
-from dataclasses import asdict, dataclass, field
-from datetime import datetime, timezone
+import os
+from dataclasses import asdict
+from datetime import datetime
from functools import partial
from pathlib import Path
-import numpy as np
import pandas as pd
+from openjury.config import CliArgs
from openjury.evaluate import (
annotate_battles,
PairScore,
- resolve_judge_prompts,
+ load_judge_system_and_user_prompt,
)
-from openjury.generate import generate_instructions, generate_base
+from openjury.generate import generate_instructions, generate_base, generate_multiturn
from openjury.instruction_dataset import load_instructions
-from openjury.repro import write_run_metadata, _to_jsonable
-from openjury.utils import data_root, read_df, download_hf
-from openjury.utils import make_model, cache_function_dataframe, compute_pref_summary
+from openjury.mt_bench.pipeline import (
+ format_mt_bench_for_evaluation,
+ run_mt_bench,
+)
+from openjury.mt_bench.fastchat_compat import (
+ FASTCHAT_TEMPERATURE_CONFIG,
+ judge_mt_bench_pairwise_fastchat,
+)
+from openjury.mt_bench_101.pipeline import run_mt_bench_101
+from openjury.utils import (
+ cache_function_dataframe,
+ data_root,
+ download_hf,
+ make_model,
+ read_df,
+)
def try_load_dataset_completions(
@@ -62,184 +75,6 @@ def try_load_dataset_completions(
)
-@dataclass
-class CliArgs:
- dataset: str
- model_A: str
- model_B: str
- judge_model: str
-
- n_instructions: int | None = None
- provide_explanation: bool = False
- swap_mode: str = "fixed"
- ignore_cache: bool = False
- use_tqdm: bool = False
- truncate_all_input_chars: int = 8192
- max_out_tokens_models: int = 32768
- max_out_tokens_judge: int = 32768
- max_model_len: int | None = None
- chat_template: str | None = None
- result_folder: str = "results"
- engine_kwargs: dict = field(default_factory=dict)
-
- def __post_init__(self):
- supported_modes = ["fixed", "both"]
- assert (
- self.swap_mode in supported_modes
- ), f"Only {supported_modes} modes are supported but got {self.swap_mode}."
-
- @classmethod
- def parse_args(cls):
- parser = argparse.ArgumentParser(
- prog="Generate completion and evaluate with a judge",
- )
- parser.add_argument(
- "--dataset",
- help="The dataset to use. For instance `alpaca-eval`, `arena-hard`, `m-arena-hard-EU` for instruction "
- "tuning cases or `french-contexts`, `spanish-contexts` for base models.",
- )
- parser.add_argument(
- "--model_A",
- required=True,
- help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`",
- )
- parser.add_argument(
- "--model_B",
- required=True,
- help="Name of the LLM to use for a generation, must be a valid choice for `generation_provider`",
- )
- parser.add_argument(
- "--judge_model",
- required=True,
- help="Name of the LLM to use, for instance `Together/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, "
- "`VLLM/meta-llama/Meta-Llama-3-70B-Instruct-Turbo`, `LangChain/LocalPath` etc",
- )
- parser.add_argument(
- "--n_instructions",
- type=int,
- required=False,
- )
- parser.add_argument(
- "--provide_explanation",
- action="store_true",
- help="If specified, judge will provide explanation before making a judgement. Does not necessarily improve"
- "the accuracy of the judge but enables some result interpretation.",
- )
- parser.add_argument(
- "--swap_mode",
- type=str,
- choices=["fixed", "both"],
- default="fixed",
- help="Model comparison order mode. 'fixed': always use model order A-B. 'both': correct for model order "
- "bias by evaluating each instruction twice, once as A-B and once as B-A, and average. This helps account "
- "for judge position bias. Default is 'fixed'.",
- )
- parser.add_argument(
- "--ignore_cache",
- action="store_true",
- help="If specified, ignore cache of previous completions.",
- )
- parser.add_argument(
- "--use_tqdm",
- action="store_true",
- help="If specified, use tqdm, does not work with all model providers, vLLM in particular.",
- )
- parser.add_argument(
- "--result_folder",
- type=str,
- required=False,
- default="results",
- help="The folder to save the results. Defaults to `results`. Evaluation results will be saved in"
- " `[result_folder]/[evaluation_name]`.",
- )
- parser.add_argument(
- "--truncate_all_input_chars",
- type=int,
- required=False,
- default=8192,
- help="Character-level truncation applied before tokenization: truncates each instruction "
- "before model A/B generation and truncates each completion before judge evaluation.",
- )
- parser.add_argument(
- "--max_out_tokens_models",
- type=int,
- required=False,
- default=32768,
- help=(
- "Generation token budget for each model A/B response. For VLLM, keep this <= "
- "--max_model_len (if provided)."
- ),
- )
- parser.add_argument(
- "--max_out_tokens_judge",
- type=int,
- required=False,
- default=32768,
- help=(
- "Generation token budget for the judge response (reasoning + scores). For "
- "VLLM, keep this <= --max_model_len (if provided)."
- ),
- )
- parser.add_argument(
- "--max_model_len",
- type=int,
- required=False,
- default=None,
- help=(
- "Optional total context window for VLLM models (prompt + generation). This is "
- "independent from --max_out_tokens_models/--max_out_tokens_judge, which only cap "
- "generated tokens. This is useful on smaller GPUs to avoid OOM."
- ),
- )
- parser.add_argument(
- "--chat_template",
- type=str,
- required=False,
- default=None,
- help="Jinja2 chat template string to use instead of the model's tokenizer template. "
- "If not provided, ChatML is used as fallback for models without a chat template.",
- )
- parser.add_argument(
- "--engine_kwargs",
- type=str,
- required=False,
- default="{}",
- help=(
- "JSON dict of engine-specific kwargs forwarded to the underlying engine. "
- "Example for vLLM: '{\"tensor_parallel_size\": 2, \"gpu_memory_utilization\": 0.9}'."
- ),
- )
- args = parser.parse_args()
-
- try:
- engine_kwargs = (
- json.loads(args.engine_kwargs) if args.engine_kwargs else {}
- )
- if not isinstance(engine_kwargs, dict):
- raise ValueError("engine_kwargs must be a JSON object")
- except Exception as e:
- raise SystemExit(f"Failed to parse --engine_kwargs: {e}")
-
- return cls(
- dataset=args.dataset,
- model_A=args.model_A,
- model_B=args.model_B,
- judge_model=args.judge_model,
- n_instructions=args.n_instructions,
- provide_explanation=args.provide_explanation,
- swap_mode=args.swap_mode,
- ignore_cache=args.ignore_cache,
- use_tqdm=args.use_tqdm,
- truncate_all_input_chars=args.truncate_all_input_chars,
- max_out_tokens_models=args.max_out_tokens_models,
- max_out_tokens_judge=args.max_out_tokens_judge,
- max_model_len=args.max_model_len,
- chat_template=args.chat_template,
- result_folder=args.result_folder,
- engine_kwargs=engine_kwargs,
- )
-
-
def load_contexts(dataset: str) -> pd.Series:
path = data_root / "contexts" / dataset
return pd.read_csv(path).loc[:, "instruction"]
@@ -261,9 +96,159 @@ def print_results(results):
print(f" ✅ Wins: {results['num_wins']}")
print(f" ❌ Losses: {results['num_losses']}")
print(f" 🤝 Ties: {results['num_ties']}")
+ if results.get("num_missing", 0) > 0:
+ print(f" ❓ Missing: {results['num_missing']}")
+
+ per_category = results.get("per_category")
+ if per_category:
+ print("\nPer-Category Breakdown:")
+ print(
+ f" {'Category':<14} | {'Win Rate(A)':>11} | {'Wins':>4} | {'Losses':>6} | {'Ties':>4}"
+ )
+ print(f" {'-' * 14}-+-{'-' * 11}-+-{'-' * 4}-+-{'-' * 6}-+-{'-' * 4}")
+ for cat, stats in sorted(per_category.items()):
+ print(
+ f" {cat:<14} | {stats['winrate']:>11.1%} | "
+ f"{stats['num_wins']:>4} | {stats['num_losses']:>6} | {stats['num_ties']:>4}"
+ )
+
+ per_turn = results.get("per_turn")
+ if per_turn:
+ print("\nPer-Turn Breakdown:")
+ for turn, stats in sorted(per_turn.items()):
+ print(
+ f" Turn {turn} Win Rate(A): {stats['winrate']:.1%} "
+ f"(W:{stats['num_wins']} L:{stats['num_losses']} T:{stats['num_ties']})"
+ )
print("=" * 60 + "\n")
+def compute_preference_stats(prefs: pd.Series) -> dict:
+ """Derive win/loss/tie counts and winrate from a Series of preferences.
+
+ Preference < 0.5 means model A wins, > 0.5 means model B wins,
+ exactly 0.5 is a tie. None/NaN values are counted as missing.
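+
+    Example with illustrative values (one A win, one B win, one tie, one
+    unparsed judgement):
+
+        >>> compute_preference_stats(pd.Series([0.0, 1.0, 0.5, float("nan")]))["winrate"]
+        0.5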
+ """
+ num_battles = len(prefs)
+ num_wins = int(sum(prefs < 0.5))
+ num_losses = int(sum(prefs > 0.5))
+ num_ties = int(sum(prefs == 0.5))
+ num_missing = num_battles - (num_wins + num_losses + num_ties)
+ denom = num_wins + num_losses + num_ties
+ winrate = float((num_wins + 0.5 * num_ties) / denom) if denom else 0.0
+ return {
+ "num_battles": num_battles,
+ "num_wins": num_wins,
+ "num_losses": num_losses,
+ "num_ties": num_ties,
+ "num_missing": num_missing,
+ "winrate": winrate,
+ }
+
+
+def _compute_grouped_stats(
+ preferences: pd.Series,
+ metadata: list[dict[str, object]],
+ group_by: str,
+) -> dict[object, dict[str, float | int]]:
+ grouped: dict[object, list[float]] = {}
+ for meta, pref in zip(metadata, preferences):
+ key = meta.get(group_by)
+ if key is None:
+ continue
+ grouped.setdefault(key, []).append(pref)
+ return {
+ key: compute_preference_stats(pd.Series(vals))
+ for key, vals in grouped.items()
+ }
+
+
+def _parse_preferences_from_annotations(
+ annotations: list,
+ score_parser: PairScore,
+) -> pd.Series:
+ return pd.Series(
+ [
+ score_parser.parse_model_raw(annotation.judge_completion)
+ for annotation in annotations
+ ]
+ )
+
+
+def _judge_turn(
+ *,
+ judge_chat_model,
+ instructions: list[str],
+ completions_A: list[str],
+ completions_B: list[str],
+ metadata: list[dict[str, object]],
+ score_parser: PairScore,
+ provide_explanation: bool,
+ swap_mode: str,
+ truncate_input_chars: int | None,
+ use_tqdm: bool,
+ system_prompt: str | None = None,
+ user_prompt_template: str | None = None,
+) -> tuple[
+ list,
+ list,
+ list[dict[str, object]],
+ list[dict[str, object]],
+ pd.Series,
+ list[dict[str, object]],
+]:
+ if not instructions:
+ return [], [], [], [], pd.Series(dtype=float), []
+
+ annotations = annotate_battles(
+ judge_chat_model=judge_chat_model,
+ instructions=instructions,
+ completions_A=completions_A,
+ completions_B=completions_B,
+ provide_explanation=provide_explanation,
+ system_prompt=system_prompt,
+ user_prompt_template=user_prompt_template,
+ truncate_input_chars=truncate_input_chars,
+ use_tqdm=use_tqdm,
+ )
+ preference_parts = [_parse_preferences_from_annotations(annotations, score_parser)]
+
+ annotations_reversed: list = []
+ metadata_for_reversed_annotations: list[dict[str, object]] = []
+ combined_metadata = list(metadata)
+
+ if swap_mode == "both":
+ print("Correction for judge bias towards a certain model position is set.")
+ print("Evaluating completions with models reversed.")
+ annotations_reversed = annotate_battles(
+ judge_chat_model=judge_chat_model,
+ instructions=instructions,
+ completions_A=completions_B,
+ completions_B=completions_A,
+ provide_explanation=provide_explanation,
+ system_prompt=system_prompt,
+ user_prompt_template=user_prompt_template,
+ truncate_input_chars=truncate_input_chars,
+ use_tqdm=use_tqdm,
+ )
+ prefs_reversed = _parse_preferences_from_annotations(
+ annotations_reversed, score_parser
+ )
+ preference_parts.append(1 - prefs_reversed)
+ metadata_for_reversed_annotations = list(metadata)
+ combined_metadata.extend(metadata)
+
+ preferences = pd.concat(preference_parts).reset_index(drop=True)
+ return (
+ annotations,
+ annotations_reversed,
+ list(metadata),
+ metadata_for_reversed_annotations,
+ preferences,
+ combined_metadata,
+ )
+
+
def main(args: CliArgs):
"""
1) take as input:
@@ -276,7 +261,6 @@ def main(args: CliArgs):
3) create annotations
"""
- run_started_at = datetime.now(timezone.utc)
print(
f"Using dataset {args.dataset} and evaluating models {args.model_A} and {args.model_B}."
)
@@ -286,6 +270,12 @@ def main(args: CliArgs):
# set_langchain_cache()
ignore_cache = args.ignore_cache
+ # MT-Bench has its own pipeline: multi-turn generation + category-aware judging
+ if args.dataset == "mt-bench":
+ return run_mt_bench(args, ignore_cache)
+ if args.dataset == "mt-bench-101":
+ return run_mt_bench_101(args, ignore_cache)
+
# Currrently, we run context evaluation
is_fluency_task = "fluency" in args.dataset
if is_fluency_task:
@@ -377,19 +367,6 @@ def main(args: CliArgs):
chat_template=args.chat_template,
**args.engine_kwargs,
)
-
- name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}"
- name += f"-{args.swap_mode}"
- name = name.replace("/", "_")
-
- res_folder = Path(args.result_folder) / name
- res_folder.mkdir(parents=True, exist_ok=True)
-
- # save argument for results analysis
- with open(res_folder / f"args-{name}.json", "w") as f:
- json.dump(asdict(args), f, indent=2)
-
- print(f"Saving results to {res_folder}")
if is_fluency_task:
system_prompt = """You are a highly efficient assistant, who evaluates and selects the best large language \
model based on the quality of completion of a sentence. You will see a sentence to be completed and two \
@@ -398,54 +375,59 @@ def main(args: CliArgs):
the ordering or on the length of the answers."""
else:
# the default system prompt of annotate is to compare instruction tuned models.
+
system_prompt = None
+
+ instruction_subset = instructions.head(n_instructions)
+ instruction_indices = instruction_subset.index.tolist()
+ metadata = [{"instruction_index": idx} for idx in instruction_indices]
+ score_parser = PairScore()
(
- effective_judge_system_prompt,
- judge_user_prompt_template,
- ) = resolve_judge_prompts(
- provide_explanation=args.provide_explanation,
- system_prompt=system_prompt,
- )
- annotations = annotate_battles(
+ annotations,
+ annotations_reversed,
+ metadata_for_annotations,
+ metadata_for_reversed_annotations,
+ prefs,
+ _combined_metadata,
+ ) = _judge_turn(
judge_chat_model=judge_chat_model,
- instructions=instructions.head(n_instructions).tolist(),
+ instructions=instruction_subset.tolist(),
completions_A=completions_A.head(n_instructions).tolist(),
completions_B=completions_B.head(n_instructions).tolist(),
+ metadata=metadata,
+ score_parser=score_parser,
provide_explanation=args.provide_explanation,
- system_prompt=effective_judge_system_prompt,
- user_prompt_template=judge_user_prompt_template,
+ swap_mode=args.swap_mode,
truncate_input_chars=args.truncate_all_input_chars,
use_tqdm=args.use_tqdm,
+ system_prompt=system_prompt,
)
- if args.swap_mode == "both":
- print("Correction for judge bias towards a certain model position is set.")
- print(
- f"Evaluating completions with models reversed with judge {args.judge_model}."
- )
- annotations_reversed = annotate_battles(
- judge_chat_model=judge_chat_model,
- instructions=instructions.head(n_instructions).tolist(),
- completions_A=completions_B.head(n_instructions).tolist(),
- completions_B=completions_A.head(n_instructions).tolist(),
- provide_explanation=args.provide_explanation,
- system_prompt=effective_judge_system_prompt,
- user_prompt_template=judge_user_prompt_template,
- truncate_input_chars=args.truncate_all_input_chars,
- use_tqdm=args.use_tqdm,
- )
+ name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}"
+ name += f"-{args.swap_mode}"
+ name = name.replace("/", "_")
+
+ res_folder = Path(args.result_folder) / name
+ res_folder.mkdir(parents=True, exist_ok=True)
+
+ # save argument for results analysis
+ with open(res_folder / f"args-{name}.json", "w") as f:
+ json.dump(asdict(args), f, indent=2)
+ print(f"Saving results to {res_folder}")
df = pd.DataFrame(annotations)
- df["instruction_index"] = instructions.head(n_instructions).index.tolist()
+ df["instruction_index"] = [
+ meta["instruction_index"] for meta in metadata_for_annotations
+ ]
df["model_A"] = args.model_A
df["model_B"] = args.model_B
df["judge"] = args.judge_model
if args.swap_mode == "both":
df_reversed = pd.DataFrame(annotations_reversed)
- df_reversed["instruction_index"] = instructions.head(
- n_instructions
- ).index.tolist()
+ df_reversed["instruction_index"] = [
+ meta["instruction_index"] for meta in metadata_for_reversed_annotations
+ ]
df_reversed["model_A"] = args.model_B
df_reversed["model_B"] = args.model_A
df_reversed["judge"] = args.judge_model
@@ -453,64 +435,22 @@ def main(args: CliArgs):
df.to_csv(res_folder / f"{name}-annotations.csv", index=False)
- # compute preferences between A and B
- score_parser = PairScore()
- prefs = pd.Series(
- [
- score_parser.parse_model_raw(annotation.judge_completion)
- for annotation in annotations
- ]
- )
-
- if args.swap_mode == "both":
- prefs_reversed = pd.Series(
- [
- score_parser.parse_model_raw(annotation.judge_completion)
- for annotation in annotations_reversed
- ]
- )
- prefs = pd.concat([prefs, (1 - prefs_reversed)]).reset_index(drop=True)
-
- # compute and report statistics
- summary = compute_pref_summary(prefs)
-
+ stats = compute_preference_stats(prefs)
results = {
"dataset": args.dataset,
"model_A": args.model_A,
"model_B": args.model_B,
"judge_model": args.judge_model,
- **summary,
+ **stats,
"preferences": prefs.tolist(),
+ "date": str(datetime.now().isoformat()),
+ "user": os.getenv("USER", ""),
}
print(f"{args.model_A} vs {args.model_B} judged by {args.judge_model}")
print_results(results)
with open(res_folder / f"results-{name}.json", "w") as f:
- json.dump(_to_jsonable(results), f, indent=2, allow_nan=False)
-
- eval_instruction_index = instructions.head(n_instructions).index.tolist()
- eval_instructions = instructions.head(n_instructions).tolist()
- eval_completions_A = completions_A.head(n_instructions).tolist()
- eval_completions_B = completions_B.head(n_instructions).tolist()
-
- try:
- write_run_metadata(
- output_dir=res_folder,
- entrypoint="openjury.generate_and_evaluate.main",
- run=asdict(args),
- results=results,
- input_payloads={
- "instruction_index": eval_instruction_index,
- "instructions": eval_instructions,
- "completions_A": eval_completions_A,
- "completions_B": eval_completions_B,
- },
- judge_system_prompt=effective_judge_system_prompt,
- judge_user_prompt_template=judge_user_prompt_template,
- started_at_utc=run_started_at,
- )
- except OSError as e:
- print(f"Warning: failed to write run metadata: {e}")
+ json.dump(results, f, indent=2)
return prefs
diff --git a/openjury/instruction_dataset/__init__.py b/openjury/instruction_dataset/__init__.py
index ac211e2..2702084 100644
--- a/openjury/instruction_dataset/__init__.py
+++ b/openjury/instruction_dataset/__init__.py
@@ -4,7 +4,21 @@
def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame:
- if "m-arena-hard" in dataset:
+ apply_head_after_index = True
+ if dataset == "mt-bench":
+ from openjury.instruction_dataset.mt_bench import load_mt_bench
+
+ df_instructions = load_mt_bench()
+
+ elif dataset == "mt-bench-101":
+ from openjury.instruction_dataset.mt_bench_101 import load_mt_bench_101
+
+ # MT-Bench-101 is expanded into turn-level eval items in its loader.
+ # Keep n_instructions semantics as "number of dialogues to load".
+ df_instructions = load_mt_bench_101(n_dialogues=n_instructions)
+ apply_head_after_index = False
+
+ elif "m-arena-hard" in dataset:
if dataset == "m-arena-hard":
language = None
else:
@@ -59,6 +73,8 @@ def load_instructions(dataset: str, n_instructions: int | None = None) -> pd.Dat
df_instructions = df_instructions.set_index("instruction_index").sort_index()
print(f"Loaded {len(df_instructions)} instructions for {dataset}.")
+ if not apply_head_after_index:
+ return df_instructions
if n_instructions is None:
n_instructions = len(df_instructions)
return df_instructions.head(n_instructions)
diff --git a/openjury/instruction_dataset/mt_bench.py b/openjury/instruction_dataset/mt_bench.py
new file mode 100644
index 0000000..910a045
--- /dev/null
+++ b/openjury/instruction_dataset/mt_bench.py
@@ -0,0 +1,166 @@
+from pathlib import Path
+from urllib.request import urlretrieve
+import warnings
+
+import pandas as pd
+from huggingface_hub import snapshot_download
+
+from openjury.utils import data_root
+
+FASTCHAT_GPT4_REFERENCE_URL = (
+ "https://raw.githubusercontent.com/lm-sys/FastChat/main/"
+ "fastchat/llm_judge/data/mt_bench/reference_answer/gpt-4.jsonl"
+)
+
+def _download_gpt4_references(local_dir: Path) -> Path | None:
+ reference_dir = local_dir / "reference_answer"
+ reference_dir.mkdir(parents=True, exist_ok=True)
+ gpt4_reference_path = reference_dir / "gpt-4.jsonl"
+ if gpt4_reference_path.exists():
+ return gpt4_reference_path
+ try:
+ urlretrieve(FASTCHAT_GPT4_REFERENCE_URL, gpt4_reference_path)
+ except Exception as e:
+ warnings.warn(
+ "Could not download MT-Bench GPT-4 reference answers from FastChat. "
+ f"Falling back to inline references from question.jsonl: {e}",
+ RuntimeWarning,
+ )
+ return None
+ return gpt4_reference_path
+
+
+def download_mt_bench(local_dir: Path | None = None) -> tuple[Path, Path | None]:
+ """Download MT-Bench questions and GPT-4 references if missing."""
+ if local_dir is None:
+ local_dir = data_root / "mt-bench"
+ try:
+ local_dir.mkdir(parents=True, exist_ok=True)
+ except PermissionError as e:
+ raise PermissionError(
+ f"Cannot create MT-Bench cache directory at {local_dir}. "
+ "Set environment variable OPENJURY_DATA to a writable location."
+ ) from e
+
+ question_path = local_dir / "data" / "mt_bench" / "question.jsonl"
+ if not question_path.exists():
+ try:
+ snapshot_download(
+ repo_id="lmsys/mt-bench",
+ repo_type="space",
+ allow_patterns=[
+ "data/mt_bench/question.jsonl",
+ ],
+ local_dir=local_dir,
+ force_download=False,
+ )
+ except Exception as e:
+ raise RuntimeError(
+ "Failed to download MT-Bench questions from HuggingFace space "
+ "'lmsys/mt-bench'. If you're in an offline / restricted-network "
+ "environment, pre-download the space snapshot and place the "
+ f"questions file at {question_path}, or set OPENJURY_DATA to "
+ "point to that directory."
+ ) from e
+ if not question_path.exists():
+ raise FileNotFoundError(
+ "Could not locate MT-Bench questions after download. "
+ f"Expected file at {question_path}."
+ )
+
+ gpt4_reference_path = _download_gpt4_references(local_dir)
+ return question_path, gpt4_reference_path
+
+
+def load_mt_bench() -> pd.DataFrame:
+ """Load MT-Bench questions and reference answers.
+
+ Downloads MT-Bench questions from the HuggingFace LMSYS space and tries to
+ load GPT-4 references from FastChat GitHub. If GPT-4 references cannot be
+ downloaded or parsed, falls back to inline references from question.jsonl.
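+
+    Returns:
+        A DataFrame with one row per question and columns instruction_index,
+        category, turn_1, turn_2, reference_turn_1, reference_turn_2 and
+        instruction (a copy of turn_1).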
+ """
+ question_path, ref_path = download_mt_bench()
+
+ questions = pd.read_json(question_path, lines=True).to_dict(orient="records")
+
+ ref_by_id: dict[int | str, list[str]] = {}
+ use_inline_reference_fallback = ref_path is None
+ if ref_path is not None:
+ try:
+ reference_records = pd.read_json(ref_path, lines=True).to_dict(
+ orient="records"
+ )
+ for rec in reference_records:
+ qid = rec.get("question_id", rec.get("id"))
+ if qid is None:
+ continue
+ choices = rec.get("choices")
+ if not (isinstance(choices, list) and choices):
+ continue
+ first_choice = choices[0]
+ if not isinstance(first_choice, dict):
+ continue
+ turns = first_choice.get("turns")
+ if not isinstance(turns, list):
+ continue
+ ref_by_id[qid] = turns
+ try:
+ ref_by_id[int(qid)] = turns
+ except Exception:
+ pass
+ except Exception as e:
+ warnings.warn(
+ "Failed to parse GPT-4 reference answers from FastChat. "
+ f"Falling back to inline references from question.jsonl: {e}",
+ RuntimeWarning,
+ )
+ use_inline_reference_fallback = True
+
+ rows = []
+ for rec in questions:
+ qid_raw = rec.get("question_id", rec.get("id"))
+ if qid_raw is None:
+ raise ValueError(
+ f"MT-Bench question record missing question_id/id: keys={list(rec.keys())}"
+ )
+ try:
+ qid = int(qid_raw)
+ except Exception:
+ qid = qid_raw
+
+ category = rec.get("category")
+ turns = rec.get("turns")
+ if isinstance(turns, list):
+ turn_1 = turns[0] if len(turns) > 0 else None
+ turn_2 = turns[1] if len(turns) > 1 else None
+ else:
+ turn_1 = rec.get("turn_1", rec.get("instruction"))
+ turn_2 = rec.get("turn_2")
+
+ ref_turns = ref_by_id.get(qid_raw) or ref_by_id.get(qid)
+ if ref_turns is None and use_inline_reference_fallback:
+ inline_ref = rec.get("reference")
+ if isinstance(inline_ref, list):
+ ref_turns = inline_ref
+
+ ref_turn_1 = (
+ ref_turns[0] if isinstance(ref_turns, list) and len(ref_turns) > 0 else None
+ )
+ ref_turn_2 = (
+ ref_turns[1] if isinstance(ref_turns, list) and len(ref_turns) > 1 else None
+ )
+
+ rows.append(
+ {
+ "instruction_index": qid,
+ "category": category,
+ "turn_1": turn_1,
+ "turn_2": turn_2,
+ "reference_turn_1": ref_turn_1,
+ "reference_turn_2": ref_turn_2,
+ "instruction": turn_1,
+ }
+ )
+
+ return pd.DataFrame(rows)
+
diff --git a/openjury/instruction_dataset/mt_bench_101.py b/openjury/instruction_dataset/mt_bench_101.py
new file mode 100644
index 0000000..032139d
--- /dev/null
+++ b/openjury/instruction_dataset/mt_bench_101.py
@@ -0,0 +1,125 @@
+import json
+from pathlib import Path
+from urllib.request import urlretrieve
+
+import pandas as pd
+
+from openjury.utils import data_root
+
+MT_BENCH_101_DATA_URL = (
+ "https://raw.githubusercontent.com/mtbench101/mt-bench-101/main/"
+ "data/subjective/mtbench101.jsonl"
+)
+
+MT_BENCH_101_TURN2_ONLY_TASKS = {"CM", "AR", "CR", "FR", "SC", "SA"}
+MT_BENCH_101_REFERENCE_TASKS = {"MR", "GR"}
+MT_BENCH_101_TASK_TO_ABILITY = {
+ "CM": "perceptivity",
+ "AR": "perceptivity",
+ "SI": "perceptivity",
+ "TS": "perceptivity",
+ "CC": "perceptivity",
+ "CR": "adaptability",
+ "FR": "adaptability",
+ "SC": "adaptability",
+ "SA": "adaptability",
+ "MR": "adaptability",
+ "GR": "adaptability",
+ "IC": "interactivity",
+ "PI": "interactivity",
+}
+
+
+def download_mt_bench_101(local_dir: Path | None = None) -> Path:
+ """Download MT-Bench-101 JSONL dataset if missing and return its path."""
+ if local_dir is None:
+ local_dir = data_root / "mt-bench-101"
+
+ local_dir.mkdir(parents=True, exist_ok=True)
+ dataset_path = local_dir / "data" / "subjective" / "mtbench101.jsonl"
+ dataset_path.parent.mkdir(parents=True, exist_ok=True)
+ if dataset_path.exists():
+ return dataset_path
+
+ try:
+ urlretrieve(MT_BENCH_101_DATA_URL, dataset_path)
+ except Exception as exc:
+ raise RuntimeError(
+ "Failed to download MT-Bench-101 dataset from GitHub. "
+ "If running in a restricted network environment, manually place the file at "
+ f"{dataset_path} or point OPENJURY_DATA to a cache containing it."
+ ) from exc
+
+ return dataset_path
+
+
+def load_mt_bench_101(
+ n_dialogues: int | None = None,
+ local_dir: Path | None = None,
+) -> pd.DataFrame:
+ """Load MT-Bench-101 and expand dialogues into turn-level evaluation items.
+
+ The returned dataframe has one row per evaluated turn, using golden context.
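+    Columns include dialogue_id, dialogue_uid, task, ability, turn_index,
+    golden_context (the preceding user/bot turns), user_message,
+    reference_answer, requires_reference and instruction.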
+ """
+ dataset_path = download_mt_bench_101(local_dir=local_dir)
+
+ records: list[dict] = []
+ with dataset_path.open("r", encoding="utf-8") as handle:
+ for line in handle:
+ line = line.strip()
+ if line:
+ records.append(json.loads(line))
+
+ if n_dialogues is not None:
+ records = records[:n_dialogues]
+
+ rows: list[dict] = []
+ for rec in records:
+ task = rec.get("task")
+ if task not in MT_BENCH_101_TASK_TO_ABILITY:
+ raise ValueError(f"Unknown MT-Bench-101 task '{task}' in record: {rec}")
+
+ dialogue_id = rec.get("id")
+ history = rec.get("history")
+ if not isinstance(history, list):
+ raise ValueError(
+ "Invalid MT-Bench-101 record: expected list in field 'history', "
+ f"got {type(history)}"
+ )
+
+ start_turn = 2 if task in MT_BENCH_101_TURN2_ONLY_TASKS else 1
+ for turn_pos, turn in enumerate(history, start=1):
+ if turn_pos < start_turn:
+ continue
+ if not isinstance(turn, dict):
+ raise ValueError(
+ "Invalid MT-Bench-101 record: each turn in 'history' must be a dict."
+ )
+
+ user_message = str(turn.get("user") or "")
+ reference_answer = str(turn.get("bot") or "")
+ golden_context = [
+ {
+ "user": str(prev_turn.get("user") or ""),
+ "bot": str(prev_turn.get("bot") or ""),
+ }
+ for prev_turn in history[: turn_pos - 1]
+ ]
+
+ rows.append(
+ {
+ "instruction_index": len(rows),
+ "dialogue_id": dialogue_id,
+ "dialogue_uid": f"{task}:{dialogue_id}",
+ "task": task,
+ "ability": MT_BENCH_101_TASK_TO_ABILITY[task],
+ "turn_index": turn_pos,
+ "golden_context": golden_context,
+ "user_message": user_message,
+ "reference_answer": reference_answer,
+ "requires_reference": task in MT_BENCH_101_REFERENCE_TASKS,
+ "instruction": user_message,
+ }
+ )
+
+ return pd.DataFrame(rows)
diff --git a/openjury/mt_bench/__init__.py b/openjury/mt_bench/__init__.py
new file mode 100644
index 0000000..c5cdd59
--- /dev/null
+++ b/openjury/mt_bench/__init__.py
@@ -0,0 +1,5 @@
+"""MT-Bench-specific helpers.
+
+This package intentionally contains MT-Bench specific logic.
+"""
+
diff --git a/openjury/mt_bench/common.py b/openjury/mt_bench/common.py
new file mode 100644
index 0000000..8a0028e
--- /dev/null
+++ b/openjury/mt_bench/common.py
@@ -0,0 +1,67 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Iterator
+
+import pandas as pd
+
+from openjury.utils import safe_text
+
+
+@dataclass(frozen=True)
+class MTBenchPairwiseRow:
+ question_id: object
+ category: str | None
+ turn_1_question: str
+ turn_2_question: str
+ answer_a_1: str
+ answer_a_2: str
+ answer_b_1: str
+ answer_b_2: str
+ ref_1: str
+ ref_2: str
+
+
+def iter_mt_bench_pairwise_rows(
+ *,
+ questions: pd.DataFrame,
+ completions_a: pd.DataFrame,
+ completions_b: pd.DataFrame,
+ truncate_input_chars: int | None,
+) -> Iterator[MTBenchPairwiseRow]:
+ for question_id in questions.index.tolist():
+ row = questions.loc[question_id]
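+        # Fall back to the first completion row if the question id is missing
+        # from a completions index instead of raising a KeyError.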
+ comp_a_row = (
+ completions_a.loc[question_id]
+ if question_id in completions_a.index
+ else completions_a.iloc[0]
+ )
+ comp_b_row = (
+ completions_b.loc[question_id]
+ if question_id in completions_b.index
+ else completions_b.iloc[0]
+ )
+ yield MTBenchPairwiseRow(
+ question_id=question_id,
+ category=row.get("category"),
+ turn_1_question=safe_text(row.get("turn_1"), truncate_input_chars),
+ turn_2_question=safe_text(row.get("turn_2"), truncate_input_chars),
+ answer_a_1=safe_text(
+ comp_a_row.get("completion_turn_1", ""),
+ truncate_input_chars,
+ ),
+ answer_a_2=safe_text(
+ comp_a_row.get("completion_turn_2", ""),
+ truncate_input_chars,
+ ),
+ answer_b_1=safe_text(
+ comp_b_row.get("completion_turn_1", ""),
+ truncate_input_chars,
+ ),
+ answer_b_2=safe_text(
+ comp_b_row.get("completion_turn_2", ""),
+ truncate_input_chars,
+ ),
+ ref_1=safe_text(row.get("reference_turn_1"), truncate_input_chars),
+ ref_2=safe_text(row.get("reference_turn_2"), truncate_input_chars),
+ )
diff --git a/openjury/mt_bench/fastchat_compat.py b/openjury/mt_bench/fastchat_compat.py
new file mode 100644
index 0000000..728b0f2
--- /dev/null
+++ b/openjury/mt_bench/fastchat_compat.py
@@ -0,0 +1,477 @@
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Literal
+
+import pandas as pd
+from langchain.prompts import ChatPromptTemplate
+
+from openjury.mt_bench.common import iter_mt_bench_pairwise_rows
+from openjury.utils import do_inference
+
+
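+# Per-category sampling temperatures for answer generation, mirroring FastChat's
+# MT-Bench defaults.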
+FASTCHAT_TEMPERATURE_CONFIG: dict[str, float] = {
+ "writing": 0.7,
+ "roleplay": 0.7,
+ "extraction": 0.0,
+ "math": 0.0,
+ "coding": 0.0,
+ "reasoning": 0.0,
+ "stem": 0.1,
+ "humanities": 0.1,
+}
+
+FASTCHAT_NEED_REF_CATS: set[str] = {"math", "reasoning", "coding"}
+
+FastChatVerdict = Literal["A", "B", "tie", "error"]
+PairwiseWinner = Literal["model_A", "model_B", "tie", "error"]
+
+
+@dataclass(frozen=True)
+class FastChatPairwisePrompt:
+ name: str
+ system_prompt: str
+ user_prompt_template: str
+ multi_turn: bool
+ ref_based: bool
+
+
+_PROMPTS_DIR = Path(__file__).resolve().parent.parent / "prompts" / "mt_bench"
+_SYSTEM_BASE_FILE = "system-base.txt"
+_USER_SINGLE_BASE_FILE = "user-single-base.txt"
+_USER_MULTI_BASE_FILE = "user-multi-base.txt"
+_USER_SINGLE_REF_BLOCK_FILE = "user-single-reference-block.txt"
+_USER_MULTI_REF_BLOCK_FILE = "user-multi-reference-block.txt"
+
+
+def _load_prompt_text(filename: str) -> str:
+ path = _PROMPTS_DIR / filename
+ return path.read_text(encoding="utf-8")
+
+
+def _render_prompt_text(filename: str, **kwargs: str) -> str:
+ return _load_prompt_text(filename).format(**kwargs)
+
+
+def _build_system_prompt(
+ *,
+ user_subject: str,
+ task_description: str,
+ begin_instruction: str,
+ focus_line: str = "",
+) -> str:
+ focus_segment = f"{focus_line} " if focus_line else ""
+ return _render_prompt_text(
+ _SYSTEM_BASE_FILE,
+ user_subject=user_subject,
+ task_description=task_description,
+ focus_line=focus_segment,
+ begin_instruction=begin_instruction,
+ )
+
+
+def _build_user_prompt_template(*, multi_turn: bool, ref_based: bool) -> str:
+ base_filename = _USER_MULTI_BASE_FILE if multi_turn else _USER_SINGLE_BASE_FILE
+ reference_block = ""
+ if ref_based:
+ ref_block_filename = (
+ _USER_MULTI_REF_BLOCK_FILE if multi_turn else _USER_SINGLE_REF_BLOCK_FILE
+ )
+ reference_block = _load_prompt_text(ref_block_filename)
+ return _render_prompt_text(base_filename, reference_block=reference_block)
+
+
+def _load_pairwise_prompt(
+ *,
+ name: str,
+ multi_turn: bool,
+ ref_based: bool,
+ system_user_subject: str,
+ system_task_description: str,
+ system_begin_instruction: str,
+ system_focus_line: str = "",
+) -> FastChatPairwisePrompt:
+ return FastChatPairwisePrompt(
+ name=name,
+ multi_turn=multi_turn,
+ ref_based=ref_based,
+ system_prompt=_build_system_prompt(
+ user_subject=system_user_subject,
+ task_description=system_task_description,
+ begin_instruction=system_begin_instruction,
+ focus_line=system_focus_line,
+ ),
+ user_prompt_template=_build_user_prompt_template(
+ multi_turn=multi_turn,
+ ref_based=ref_based,
+ ),
+ )
+
+
+_PAIR_V2 = _load_pairwise_prompt(
+ name="pair-v2",
+ multi_turn=False,
+ ref_based=False,
+ system_user_subject="question displayed below",
+ system_task_description=(
+ "You should choose the assistant that follows the user's instructions and answers "
+ "the user's question better. Your evaluation should consider factors such as the "
+ "helpfulness, relevance, accuracy, depth, creativity, and level of detail of their "
+ "responses."
+ ),
+ system_begin_instruction="comparing the two responses and provide a short explanation",
+)
+
+_PAIR_V2_MULTI = _load_pairwise_prompt(
+ name="pair-v2-multi-turn",
+ multi_turn=True,
+ ref_based=False,
+ system_user_subject="questions",
+ system_task_description=(
+ "You should choose the assistant that follows the user's instructions and answers "
+ "the user's questions better. Your evaluation should consider factors such as the "
+ "helpfulness, relevance, accuracy, depth, creativity, and level of detail of their "
+ "responses."
+ ),
+ system_focus_line="You should focus on who provides a better answer to the second user question.",
+ system_begin_instruction=(
+ "comparing the responses of the two assistants and provide a short explanation"
+ ),
+)
+
+_PAIR_MATH_V1 = _load_pairwise_prompt(
+ name="pair-math-v1",
+ multi_turn=False,
+ ref_based=True,
+ system_user_subject="question displayed below",
+ system_task_description=(
+ "Your evaluation should consider correctness and helpfulness. You will be given a "
+ "reference answer, assistant A's answer, and assistant B's answer. Your job is to "
+ "evaluate which assistant's answer is better."
+ ),
+ system_begin_instruction=(
+ "comparing both assistants' answers with the reference answer. Identify and correct any mistakes"
+ ),
+)
+
+_PAIR_MATH_V1_MULTI = _load_pairwise_prompt(
+ name="pair-math-v1-multi-turn",
+ multi_turn=True,
+ ref_based=True,
+ system_user_subject="questions",
+ system_task_description=(
+ "Your evaluation should consider correctness and helpfulness. You will be given "
+ "reference answers, the assistant A's answers, the assistant B's answers. Your job is "
+ "to determine which assistant provides correct and helpful answers to the second user question."
+ ),
+ system_begin_instruction=(
+ "comparing both assistants' answers with the reference answers. Identify and correct any mistakes"
+ ),
+)
+
+
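+# FastChat-style judges emit a verdict token: "[[A]]", "[[B]]", or "[[C]]" (tie);
+# anything else is treated as a parsing error.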
+def _parse_fastchat_verdict(judgment: str) -> FastChatVerdict:
+ if "[[A]]" in judgment:
+ return "A"
+ if "[[B]]" in judgment:
+ return "B"
+ if "[[C]]" in judgment:
+ return "tie"
+ return "error"
+
+
+def _map_verdict_to_winner(verdict: FastChatVerdict, swapped: bool) -> PairwiseWinner:
+ if verdict == "tie":
+ return "tie"
+ if verdict == "error":
+ return "error"
+ if verdict == "A":
+ return "model_B" if swapped else "model_A"
+ if verdict == "B":
+ return "model_A" if swapped else "model_B"
+ return "error"
+
+
+def _conservative_winner(g1: PairwiseWinner, g2: PairwiseWinner) -> tuple[PairwiseWinner, bool]:
+ """Conservative position-bias handling (FastChat/MT-Bench paper).
+
+ Declare a winner only if the two orderings agree; otherwise treat as tie.
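+
+    Illustrative doctest:
+    >>> _conservative_winner("model_A", "model_A")
+    ('model_A', False)
+    >>> _conservative_winner("model_A", "model_B")
+    ('tie', True)
+    >>> _conservative_winner("error", "model_A")
+    ('error', False)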
+ """
+ if g1 == "error" or g2 == "error":
+ return "error", False
+ if g1 == g2:
+ return g1, False
+ return "tie", True
+
+
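+# Preference convention used downstream: 0.0 -> model_A wins, 1.0 -> model_B wins,
+# 0.5 -> tie, NaN -> the judge verdict could not be parsed.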
+def _winner_to_preference(winner: PairwiseWinner) -> float:
+ if winner == "model_A":
+ return 0.0
+ if winner == "model_B":
+ return 1.0
+ if winner == "tie":
+ return 0.5
+ return math.nan
+
+
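+# Reference-guided prompts (the pair-math-v1 variants) are selected for categories in
+# FASTCHAT_NEED_REF_CATS (math/reasoning/coding in FastChat); other categories use pair-v2.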
+def _select_prompt(category: str | None, multi_turn: bool) -> FastChatPairwisePrompt:
+ needs_ref = (category or "") in FASTCHAT_NEED_REF_CATS
+ if needs_ref and multi_turn:
+ return _PAIR_MATH_V1_MULTI
+ if needs_ref:
+ return _PAIR_MATH_V1
+ if multi_turn:
+ return _PAIR_V2_MULTI
+ return _PAIR_V2
+
+
+def _group_indices_by_prompt(
+ items: list[dict[str, Any]],
+) -> dict[str, list[int]]:
+ grouped: dict[str, list[int]] = {}
+ for idx, item in enumerate(items):
+ grouped.setdefault(item["prompt_name"], []).append(idx)
+ return grouped
+
+
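+# Present the same item with the A/B answers swapped so position bias can be detected
+# (used for the second judging pass when swap_mode="both").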
+def _swap_prompt_kwargs(kwargs: dict[str, str], *, multi_turn: bool) -> dict[str, str]:
+ swapped = dict(kwargs)
+ if multi_turn:
+ swapped["answer_a_1"], swapped["answer_b_1"] = swapped["answer_b_1"], swapped["answer_a_1"]
+ swapped["answer_a_2"], swapped["answer_b_2"] = swapped["answer_b_2"], swapped["answer_a_2"]
+ return swapped
+ swapped["answer_a"], swapped["answer_b"] = swapped["answer_b"], swapped["answer_a"]
+ return swapped
+
+
+def _infer_by_prompt_groups(
+ *,
+ judge_chat_model,
+ items: list[dict[str, Any]],
+ use_tqdm: bool,
+ swap_answers: bool,
+) -> list[str]:
+ """Run judge inference, grouping by prompt variant for batching."""
+ grouped_indices = _group_indices_by_prompt(items)
+
+ judgments: list[str] = [""] * len(items)
+ for prompt_name, idxs in grouped_indices.items():
+ prompt: FastChatPairwisePrompt = items[idxs[0]]["prompt"]
+ prompt_template = ChatPromptTemplate.from_messages(
+ [("system", prompt.system_prompt), ("user", prompt.user_prompt_template)]
+ )
+
+ batch_kwargs = []
+ for i in idxs:
+ kwargs = items[i]["prompt_kwargs"]
+ if swap_answers:
+ kwargs = _swap_prompt_kwargs(kwargs, multi_turn=prompt.multi_turn)
+ batch_kwargs.append(kwargs)
+
+ prompt_inputs = prompt_template.batch(batch_kwargs)
+ outs = do_inference(
+ chat_model=judge_chat_model,
+ inputs=prompt_inputs,
+ use_tqdm=use_tqdm,
+ )
+ for i, out in zip(idxs, outs):
+ judgments[i] = str(out)
+ return judgments
+
+
+def _build_fastchat_judge_items(
+ *,
+ questions: pd.DataFrame,
+ completions_a: pd.DataFrame,
+ completions_b: pd.DataFrame,
+ eval_single: bool,
+ eval_multi: bool,
+ truncate_input_chars: int | None,
+) -> list[dict[str, Any]]:
+ items: list[dict[str, Any]] = []
+ for pair_row in iter_mt_bench_pairwise_rows(
+ questions=questions,
+ completions_a=completions_a,
+ completions_b=completions_b,
+ truncate_input_chars=truncate_input_chars,
+ ):
+ category = pair_row.category
+ if eval_single:
+ prompt = _select_prompt(category, multi_turn=False)
+ kwargs: dict[str, str] = {
+ "question": pair_row.turn_1_question,
+ "answer_a": pair_row.answer_a_1,
+ "answer_b": pair_row.answer_b_1,
+ }
+ if prompt.ref_based:
+ kwargs["ref_answer_1"] = pair_row.ref_1
+ items.append(
+ {
+ "question_id": pair_row.question_id,
+ "category": category,
+ "turn": 1,
+ "prompt": prompt,
+ "prompt_name": prompt.name,
+ "prompt_kwargs": kwargs,
+ }
+ )
+
+ if eval_multi and pair_row.turn_2_question:
+ prompt = _select_prompt(category, multi_turn=True)
+ kwargs = {
+ "question_1": pair_row.turn_1_question,
+ "question_2": pair_row.turn_2_question,
+ "answer_a_1": pair_row.answer_a_1,
+ "answer_a_2": pair_row.answer_a_2,
+ "answer_b_1": pair_row.answer_b_1,
+ "answer_b_2": pair_row.answer_b_2,
+ }
+ if prompt.ref_based:
+ kwargs["ref_answer_1"] = pair_row.ref_1
+ kwargs["ref_answer_2"] = pair_row.ref_2
+ items.append(
+ {
+ "question_id": pair_row.question_id,
+ "category": category,
+ "turn": 2,
+ "prompt": prompt,
+ "prompt_name": prompt.name,
+ "prompt_kwargs": kwargs,
+ }
+ )
+ return items
+
+
+def _resolve_fastchat_item_result(
+ *,
+ item: dict[str, Any],
+ g1_raw: str,
+ g2_raw: str | None,
+ judge_model: str,
+ model_a: str,
+ model_b: str,
+) -> tuple[dict[str, Any], dict[str, object], float, bool]:
+ prompt: FastChatPairwisePrompt = item["prompt"]
+ kwargs = item["prompt_kwargs"]
+ g1_user_prompt = prompt.user_prompt_template.format(**kwargs)
+ g1_verdict = _parse_fastchat_verdict(g1_raw)
+ g1_winner = _map_verdict_to_winner(g1_verdict, swapped=False)
+
+ final_winner = g1_winner
+ inconsistent = False
+ annotation_row: dict[str, Any] = {
+ "question_id": item["question_id"],
+ "category": item["category"],
+ "turn": item["turn"],
+ "model_A": model_a,
+ "model_B": model_b,
+ "judge": judge_model,
+ "prompt_name": prompt.name,
+ "system_prompt": prompt.system_prompt,
+ "g1_user_prompt": g1_user_prompt,
+ "g1_judgment": g1_raw,
+ "g1_verdict": g1_verdict,
+ "g1_winner": g1_winner,
+ }
+
+ if g2_raw is not None:
+ g2_verdict = _parse_fastchat_verdict(g2_raw)
+ g2_winner = _map_verdict_to_winner(g2_verdict, swapped=True)
+ final_winner, inconsistent = _conservative_winner(g1_winner, g2_winner)
+ annotation_row.update(
+ {
+ "g2_user_prompt": prompt.user_prompt_template.format(
+ **_swap_prompt_kwargs(kwargs, multi_turn=prompt.multi_turn)
+ ),
+ "g2_judgment": g2_raw,
+ "g2_verdict": g2_verdict,
+ "g2_winner": g2_winner,
+ "final_winner": final_winner,
+ "inconsistent": inconsistent,
+ }
+ )
+ else:
+ annotation_row["final_winner"] = final_winner
+ annotation_row["inconsistent"] = False
+
+ preference = _winner_to_preference(final_winner)
+ annotation_row["preference"] = preference
+ metadata = {
+ "question_id": item["question_id"],
+ "category": item["category"],
+ "turn": item["turn"],
+ }
+ return annotation_row, metadata, preference, inconsistent
+
+
+def judge_mt_bench_pairwise_fastchat(
+ *,
+ judge_chat_model,
+ judge_model: str,
+ questions: pd.DataFrame,
+ completions_a: pd.DataFrame,
+ completions_b: pd.DataFrame,
+ model_a: str,
+ model_b: str,
+ turns_mode: str,
+ swap_mode: str,
+ truncate_input_chars: int | None,
+ use_tqdm: bool,
+) -> tuple[pd.Series, list[dict[str, Any]], list[dict[str, object]], int]:
+ """Pairwise MT-Bench judging compatible with FastChat's `[[A]]/[[B]]/[[C]]` format."""
+ assert turns_mode in ("both", "single", "multi")
+ assert swap_mode in ("fixed", "both")
+
+ eval_single = turns_mode in ("both", "single")
+ eval_multi = turns_mode in ("both", "multi")
+
+ items = _build_fastchat_judge_items(
+ questions=questions,
+ completions_a=completions_a,
+ completions_b=completions_b,
+ eval_single=eval_single,
+ eval_multi=eval_multi,
+ truncate_input_chars=truncate_input_chars,
+ )
+
+ g1_judgments = _infer_by_prompt_groups(
+ judge_chat_model=judge_chat_model,
+ items=items,
+ use_tqdm=use_tqdm,
+ swap_answers=False,
+ )
+
+ g2_judgments: list[str] | None = None
+ if swap_mode == "both":
+ g2_judgments = _infer_by_prompt_groups(
+ judge_chat_model=judge_chat_model,
+ items=items,
+ use_tqdm=use_tqdm,
+ swap_answers=True,
+ )
+
+ annotations: list[dict[str, Any]] = []
+ metadata: list[dict[str, object]] = []
+ prefs: list[float] = []
+ num_inconsistent = 0
+
+ for idx, item in enumerate(items):
+ g2_raw = g2_judgments[idx] if g2_judgments is not None else None
+ annotation_row, item_metadata, preference, inconsistent = _resolve_fastchat_item_result(
+ item=item,
+ g1_raw=g1_judgments[idx],
+ g2_raw=g2_raw,
+ judge_model=judge_model,
+ model_a=model_a,
+ model_b=model_b,
+ )
+ if inconsistent:
+ num_inconsistent += 1
+ annotations.append(annotation_row)
+ metadata.append(item_metadata)
+ prefs.append(preference)
+
+ return pd.Series(prefs, dtype=float), annotations, metadata, num_inconsistent
+
diff --git a/openjury/mt_bench/pipeline.py b/openjury/mt_bench/pipeline.py
new file mode 100644
index 0000000..39f9eb4
--- /dev/null
+++ b/openjury/mt_bench/pipeline.py
@@ -0,0 +1,477 @@
+"""MT-Bench evaluation pipeline.
+
+Orchestrates multi-turn generation, per-turn judging (OpenJury or
+FastChat-compatible), and result saving for the MT-Bench benchmark.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import asdict
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pandas as pd
+
+from openjury.evaluate import PairScore, load_judge_system_and_user_prompt
+from openjury.eval_runtime import (
+ _compute_grouped_stats,
+ _judge_turn,
+ compute_preference_stats,
+ print_results,
+)
+from openjury.generate import generate_multiturn
+from openjury.instruction_dataset import load_instructions
+from openjury.mt_bench.common import iter_mt_bench_pairwise_rows
+from openjury.mt_bench.fastchat_compat import (
+ FASTCHAT_TEMPERATURE_CONFIG,
+ judge_mt_bench_pairwise_fastchat,
+)
+from openjury.utils import cache_function_dataframe, make_model
+
+if TYPE_CHECKING:
+ from openjury.config import CliArgs
+
+NEED_REF_CATS = {"math", "reasoning", "coding"}
+
+
+def format_mt_bench_for_evaluation(
+ questions: pd.DataFrame,
+ completions_A: pd.DataFrame,
+ completions_B: pd.DataFrame,
+ turns_mode: str,
+ truncate_input_chars: int | None,
+) -> tuple[
+ tuple[list[str], list[str], list[str], list[dict[str, object]]],
+ tuple[list[str], list[str], list[str], list[dict[str, object]]],
+]:
+ """Flatten MT-Bench into per-turn instruction/completion battle inputs."""
+ assert turns_mode in ("both", "single", "multi")
+ eval_single = turns_mode in ("both", "single")
+ eval_multi = turns_mode in ("both", "multi")
+
+ instructions_turn_1: list[str] = []
+ completions_a_turn_1: list[str] = []
+ completions_b_turn_1: list[str] = []
+ metadata_turn_1: list[dict[str, object]] = []
+
+ instructions_turn_2: list[str] = []
+ completions_a_turn_2: list[str] = []
+ completions_b_turn_2: list[str] = []
+ metadata_turn_2: list[dict[str, object]] = []
+
+ for row in iter_mt_bench_pairwise_rows(
+ questions=questions,
+ completions_a=completions_A,
+ completions_b=completions_B,
+ truncate_input_chars=truncate_input_chars,
+ ):
+ needs_ref = row.category in NEED_REF_CATS
+ if eval_single:
+ if needs_ref and row.ref_1:
+ instruction = (
+ "[MT-Bench | Turn 1]\n"
+ "Use the reference answer for correctness checks.\n\n"
+ f"[Question]\n{row.turn_1_question}\n\n"
+ f"[Reference Answer]\n{row.ref_1}"
+ )
+ else:
+ instruction = row.turn_1_question
+
+ instructions_turn_1.append(instruction)
+ completions_a_turn_1.append(row.answer_a_1)
+ completions_b_turn_1.append(row.answer_b_1)
+ metadata_turn_1.append(
+ {
+ "question_id": row.question_id,
+ "category": row.category,
+ "turn": 1,
+ }
+ )
+
+ if eval_multi and row.turn_2_question:
+ instruction_parts = [
+ "Please focus on which assistant provides a better answer to the second user question."
+ ]
+ if needs_ref and (row.ref_1 or row.ref_2):
+ instruction_parts.extend(
+ [
+ "<|The Start of Reference Answer|>",
+ "### User:",
+ row.turn_1_question,
+ "### Reference answer:",
+ row.ref_1,
+ "### User:",
+ row.turn_2_question,
+ "### Reference answer:",
+ row.ref_2,
+ "<|The End of Reference Answer|>",
+ ]
+ )
+
+ conversation_a = _format_mt_bench_multiturn_conversation(
+ turn_1_question=row.turn_1_question,
+ turn_1_answer=row.answer_a_1,
+ turn_2_question=row.turn_2_question,
+ turn_2_answer=row.answer_a_2,
+ )
+ conversation_b = _format_mt_bench_multiturn_conversation(
+ turn_1_question=row.turn_1_question,
+ turn_1_answer=row.answer_b_1,
+ turn_2_question=row.turn_2_question,
+ turn_2_answer=row.answer_b_2,
+ )
+
+ instructions_turn_2.append("\n\n".join(instruction_parts))
+ completions_a_turn_2.append(conversation_a)
+ completions_b_turn_2.append(conversation_b)
+ metadata_turn_2.append(
+ {
+ "question_id": row.question_id,
+ "category": row.category,
+ "turn": 2,
+ }
+ )
+
+ return (
+ (
+ instructions_turn_1,
+ completions_a_turn_1,
+ completions_b_turn_1,
+ metadata_turn_1,
+ ),
+ (
+ instructions_turn_2,
+ completions_a_turn_2,
+ completions_b_turn_2,
+ metadata_turn_2,
+ ),
+ )
+
+
+def _format_mt_bench_multiturn_conversation(
+ *,
+ turn_1_question: str,
+ turn_1_answer: str,
+ turn_2_question: str,
+ turn_2_answer: str,
+) -> str:
+ return (
+ "### User:\n"
+ f"{turn_1_question}\n\n"
+ "### Assistant:\n"
+ f"{turn_1_answer}\n\n"
+ "### User:\n"
+ f"{turn_2_question}\n\n"
+ "### Assistant:\n"
+ f"{turn_2_answer}"
+ )
+
+
+def _generate_mt_bench_completions(
+ args: CliArgs,
+ questions_df: pd.DataFrame,
+ ignore_cache: bool,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+ cache_prefix = (
+ "mt-bench_fastchatgen" if args.mt_bench_compatibility == "fastchat" else "mt-bench"
+ )
+
+ def _run_generation(model_name: str) -> pd.DataFrame:
+ if args.mt_bench_compatibility == "fastchat":
+ return generate_multiturn(
+ questions=questions_df,
+ model=model_name,
+ truncate_input_chars=args.truncate_all_input_chars,
+ max_tokens=args.max_out_tokens_models,
+ use_tqdm=args.use_tqdm,
+ max_model_len=args.max_model_len,
+ chat_template=args.chat_template,
+ temperature_config=FASTCHAT_TEMPERATURE_CONFIG,
+ )
+ return generate_multiturn(
+ questions=questions_df,
+ model=model_name,
+ truncate_input_chars=args.truncate_all_input_chars,
+ max_tokens=args.max_out_tokens_models,
+ use_tqdm=args.use_tqdm,
+ max_model_len=args.max_model_len,
+ chat_template=args.chat_template,
+ )
+
+ completions_a = cache_function_dataframe(
+ lambda: _run_generation(args.model_A),
+ ignore_cache=ignore_cache,
+ cache_name=f"{cache_prefix}_{args.model_A}_{args.n_instructions}",
+ ).set_index("instruction_index")
+
+ completions_b = cache_function_dataframe(
+ lambda: _run_generation(args.model_B),
+ ignore_cache=ignore_cache,
+ cache_name=f"{cache_prefix}_{args.model_B}_{args.n_instructions}",
+ ).set_index("instruction_index")
+ return completions_a, completions_b
+
+
+def _build_mt_bench_result_name(args: CliArgs, suffix: str | None = None) -> str:
+ name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}"
+ name += f"-{args.swap_mode}"
+ if suffix:
+ name += f"-{suffix}"
+ return name.replace("/", "_")
+
+
+def _save_mt_bench_results(
+ *,
+ args: CliArgs,
+ results: dict[str, object],
+ annotations_df: pd.DataFrame,
+ name_suffix: str | None = None,
+) -> None:
+ name = _build_mt_bench_result_name(args, suffix=name_suffix)
+ res_folder = Path(args.result_folder) / name
+ res_folder.mkdir(parents=True, exist_ok=True)
+
+ with open(res_folder / f"args-{name}.json", "w") as f:
+ json.dump(asdict(args), f, indent=2)
+
+ annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False)
+
+ with open(res_folder / f"results-{name}.json", "w") as f:
+ json.dump(results, f, indent=2)
+
+
+def _run_mt_bench_fastchat(
+ *,
+ args: CliArgs,
+ questions_df: pd.DataFrame,
+ completions_a: pd.DataFrame,
+ completions_b: pd.DataFrame,
+ judge_chat_model,
+) -> pd.Series:
+ prefs, annotations, combined_metadata, num_inconsistent = (
+ judge_mt_bench_pairwise_fastchat(
+ judge_chat_model=judge_chat_model,
+ judge_model=args.judge_model,
+ questions=questions_df,
+ completions_a=completions_a,
+ completions_b=completions_b,
+ model_a=args.model_A,
+ model_b=args.model_B,
+ turns_mode=args.mt_bench_turns,
+ swap_mode=args.swap_mode,
+ truncate_input_chars=args.truncate_all_input_chars,
+ use_tqdm=args.use_tqdm,
+ )
+ )
+
+ stats = compute_preference_stats(prefs)
+ results = {
+ "dataset": args.dataset,
+ "model_A": args.model_A,
+ "model_B": args.model_B,
+ "judge_model": args.judge_model,
+ "mt_bench_compatibility": args.mt_bench_compatibility,
+ "num_inconsistent": num_inconsistent,
+ **stats,
+ "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"),
+ "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"),
+ "preferences": prefs.tolist(),
+ "date": str(datetime.now().isoformat()),
+ "user": os.getenv("USER", ""),
+ }
+ print_results(results)
+ _save_mt_bench_results(
+ args=args,
+ results=results,
+ annotations_df=pd.DataFrame(annotations),
+ name_suffix=f"mtbench_{args.mt_bench_compatibility}",
+ )
+ return prefs
+
+
+def _run_mt_bench_openjury(
+ *,
+ args: CliArgs,
+ questions_df: pd.DataFrame,
+ completions_a: pd.DataFrame,
+ completions_b: pd.DataFrame,
+ judge_chat_model,
+) -> pd.Series:
+ turn_1_inputs, turn_2_inputs = format_mt_bench_for_evaluation(
+ questions=questions_df,
+ completions_A=completions_a,
+ completions_B=completions_b,
+ turns_mode=args.mt_bench_turns,
+ truncate_input_chars=args.truncate_all_input_chars,
+ )
+ (
+ instructions_turn_1,
+ completions_a_turn_1,
+ completions_b_turn_1,
+ metadata_turn_1,
+ ) = turn_1_inputs
+ (
+ instructions_turn_2,
+ completions_a_turn_2,
+ completions_b_turn_2,
+ metadata_turn_2,
+ ) = turn_2_inputs
+
+ score_parser = PairScore()
+ annotations = []
+ metadata_for_annotations: list[dict[str, object]] = []
+ annotations_reversed = []
+ metadata_for_reversed_annotations: list[dict[str, object]] = []
+ preference_parts: list[pd.Series] = []
+ combined_metadata: list[dict[str, object]] = []
+
+ if instructions_turn_1:
+ (
+ annotations_turn_1,
+ annotations_turn_1_reversed,
+ metadata_turn_1_for_annotations,
+ metadata_turn_1_for_reversed_annotations,
+ prefs_turn_1,
+ combined_metadata_turn_1,
+ ) = _judge_turn(
+ judge_chat_model=judge_chat_model,
+ instructions=instructions_turn_1,
+ completions_A=completions_a_turn_1,
+ completions_B=completions_b_turn_1,
+ metadata=metadata_turn_1,
+ score_parser=score_parser,
+ provide_explanation=args.provide_explanation,
+ swap_mode=args.swap_mode,
+ truncate_input_chars=args.truncate_all_input_chars,
+ use_tqdm=args.use_tqdm,
+ )
+ annotations.extend(annotations_turn_1)
+ annotations_reversed.extend(annotations_turn_1_reversed)
+ metadata_for_annotations.extend(metadata_turn_1_for_annotations)
+ metadata_for_reversed_annotations.extend(
+ metadata_turn_1_for_reversed_annotations
+ )
+ preference_parts.append(prefs_turn_1)
+ combined_metadata.extend(combined_metadata_turn_1)
+
+ if instructions_turn_2:
+ mt_system_prompt, mt_user_prompt_template = load_judge_system_and_user_prompt(
+ provide_explanation=args.provide_explanation,
+ multi_turn=True,
+ )
+ (
+ annotations_turn_2,
+ annotations_turn_2_reversed,
+ metadata_turn_2_for_annotations,
+ metadata_turn_2_for_reversed_annotations,
+ prefs_turn_2,
+ combined_metadata_turn_2,
+ ) = _judge_turn(
+ judge_chat_model=judge_chat_model,
+ instructions=instructions_turn_2,
+ completions_A=completions_a_turn_2,
+ completions_B=completions_b_turn_2,
+ metadata=metadata_turn_2,
+ score_parser=score_parser,
+ provide_explanation=args.provide_explanation,
+ swap_mode=args.swap_mode,
+ truncate_input_chars=args.truncate_all_input_chars,
+ use_tqdm=args.use_tqdm,
+ system_prompt=mt_system_prompt,
+ user_prompt_template=mt_user_prompt_template,
+ )
+ annotations.extend(annotations_turn_2)
+ annotations_reversed.extend(annotations_turn_2_reversed)
+ metadata_for_annotations.extend(metadata_turn_2_for_annotations)
+ metadata_for_reversed_annotations.extend(
+ metadata_turn_2_for_reversed_annotations
+ )
+ preference_parts.append(prefs_turn_2)
+ combined_metadata.extend(combined_metadata_turn_2)
+
+ prefs = (
+ pd.concat(preference_parts).reset_index(drop=True)
+ if preference_parts
+ else pd.Series(dtype=float)
+ )
+ stats = compute_preference_stats(prefs)
+ results = {
+ "dataset": args.dataset,
+ "model_A": args.model_A,
+ "model_B": args.model_B,
+ "judge_model": args.judge_model,
+ **stats,
+ "per_category": _compute_grouped_stats(prefs, combined_metadata, "category"),
+ "per_turn": _compute_grouped_stats(prefs, combined_metadata, "turn"),
+ "preferences": prefs.tolist(),
+ "date": str(datetime.now().isoformat()),
+ "user": os.getenv("USER", ""),
+ }
+ print_results(results)
+
+ df = pd.DataFrame(annotations)
+ df["instruction_index"] = [meta["question_id"] for meta in metadata_for_annotations]
+ df["category"] = [meta["category"] for meta in metadata_for_annotations]
+ df["turn"] = [meta["turn"] for meta in metadata_for_annotations]
+ df["model_A"] = args.model_A
+ df["model_B"] = args.model_B
+ df["judge"] = args.judge_model
+
+ if args.swap_mode == "both":
+ df_reversed = pd.DataFrame(annotations_reversed)
+ df_reversed["instruction_index"] = [
+ meta["question_id"] for meta in metadata_for_reversed_annotations
+ ]
+ df_reversed["category"] = [
+ meta["category"] for meta in metadata_for_reversed_annotations
+ ]
+ df_reversed["turn"] = [meta["turn"] for meta in metadata_for_reversed_annotations]
+ df_reversed["model_A"] = args.model_B
+ df_reversed["model_B"] = args.model_A
+ df_reversed["judge"] = args.judge_model
+ df = pd.concat([df, df_reversed], ignore_index=True)
+
+ _save_mt_bench_results(
+ args=args,
+ results=results,
+ annotations_df=df,
+ )
+ return prefs
+
+
+def run_mt_bench(args: CliArgs, ignore_cache: bool):
+ """MT-Bench pipeline (optionally FastChat-compatible)."""
+ questions_df = load_instructions("mt-bench", n_instructions=args.n_instructions)
+ print(
+ f"Generating multi-turn completions for MT-Bench with {args.model_A} and {args.model_B}."
+ )
+ completions_a, completions_b = _generate_mt_bench_completions(
+ args=args,
+ questions_df=questions_df,
+ ignore_cache=ignore_cache,
+ )
+ judge_chat_model = make_model(
+ model=args.judge_model,
+ max_tokens=args.max_out_tokens_judge,
+ temperature=0.0 if args.mt_bench_compatibility == "fastchat" else None,
+ max_model_len=args.max_model_len,
+ chat_template=args.chat_template,
+ )
+ if args.mt_bench_compatibility == "fastchat":
+ return _run_mt_bench_fastchat(
+ args=args,
+ questions_df=questions_df,
+ completions_a=completions_a,
+ completions_b=completions_b,
+ judge_chat_model=judge_chat_model,
+ )
+ return _run_mt_bench_openjury(
+ args=args,
+ questions_df=questions_df,
+ completions_a=completions_a,
+ completions_b=completions_b,
+ judge_chat_model=judge_chat_model,
+ )
diff --git a/openjury/mt_bench_101/__init__.py b/openjury/mt_bench_101/__init__.py
new file mode 100644
index 0000000..7fd025f
--- /dev/null
+++ b/openjury/mt_bench_101/__init__.py
@@ -0,0 +1,17 @@
+from openjury.mt_bench_101.evaluate import (
+    derive_mt_bench_101_pairwise_preferences,
+    judge_mt_bench_101_single,
+    summarize_mt_bench_101_absolute_scores,
+    summarize_mt_bench_101_pairwise,
+)
+from openjury.mt_bench_101.generate import generate_mt_bench_101_completions
+from openjury.mt_bench_101.pipeline import run_mt_bench_101
+
+__all__ = [
+ "derive_mt_bench_101_pairwise_preferences",
+ "generate_mt_bench_101_completions",
+ "judge_mt_bench_101_single",
+ "run_mt_bench_101",
+ "summarize_mt_bench_101_absolute_scores",
+ "summarize_mt_bench_101_pairwise",
+]
diff --git a/openjury/mt_bench_101/evaluate.py b/openjury/mt_bench_101/evaluate.py
new file mode 100644
index 0000000..bfd2382
--- /dev/null
+++ b/openjury/mt_bench_101/evaluate.py
@@ -0,0 +1,283 @@
+import re
+from functools import lru_cache
+from pathlib import Path
+
+import pandas as pd
+from langchain.prompts import ChatPromptTemplate
+
+from openjury.evaluate import PairScore
+from openjury.instruction_dataset.mt_bench_101 import (
+ MT_BENCH_101_REFERENCE_TASKS,
+ MT_BENCH_101_TASK_TO_ABILITY,
+)
+from openjury.utils import do_inference, safe_text
+
+PROMPTS_DIR = Path(__file__).resolve().parent.parent / "prompts" / "mt_bench_101"
+DOUBLE_BRACKET_PATTERN = re.compile(r"\[\[(\d+)\]\]")
+
+TASK_PROMPT_FILES = {
+ "CM": "CM.txt",
+ "AR": "AR.txt",
+ "SI": "SI.txt",
+ "TS": "TS.txt",
+ "CC": "CC.txt",
+ "CR": "rephrasing.txt",
+ "FR": "rephrasing.txt",
+ "SC": "SC.txt",
+ "SA": "SA.txt",
+ "MR": "MR.txt",
+ "GR": "GR.txt",
+ "IC": "IC.txt",
+ "PI": "PI.txt",
+}
+
+
+@lru_cache(maxsize=1)
+def load_mt_bench_101_prompts() -> dict[str, object]:
+ global_system = (PROMPTS_DIR / "global_system.txt").read_text()
+ scoring_format = (PROMPTS_DIR / "scoring_format.txt").read_text()
+ task_prompts = {
+ task: (PROMPTS_DIR / prompt_file).read_text()
+ for task, prompt_file in TASK_PROMPT_FILES.items()
+ }
+ return {
+ "global_system": global_system,
+ "scoring_format": scoring_format,
+ "task_prompts": task_prompts,
+ }
+
+
+def parse_mt_bench_101_rating(judge_completion: str) -> float | None:
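+    """Extract the first in-range [[rating]] token from a judge completion.
+
+    Illustrative doctest (ratings follow the 1-10 scale used by the MT-Bench-101 prompts):
+    >>> parse_mt_bench_101_rating("Good answer overall. Rating: [[8]]")
+    8.0
+    >>> parse_mt_bench_101_rating("[[42]] is out of range; no valid rating") is None
+    True
+    """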
+ for match in DOUBLE_BRACKET_PATTERN.finditer(judge_completion):
+ score = int(match.group(1))
+ if 1 <= score <= 10:
+ return float(score)
+ return None
+
+
+def format_mt_bench_101_dialogue(
+ *,
+ golden_context: list[dict[str, str]],
+ user_message: str,
+ assistant_message: str,
+) -> str:
+ chunks: list[str] = []
+ for turn in golden_context:
+ chunks.append(
+ f"\n\n Human: {turn.get('user', '')}\n\nAssistant: {turn.get('bot', '')}"
+ )
+ chunks.append(f"\n\n Human: {user_message}\n\nAssistant: {assistant_message}")
+ return "".join(chunks)
+
+
+def judge_mt_bench_101_single(
+ *,
+ judge_chat_model,
+ eval_items: pd.DataFrame,
+ completions: pd.DataFrame,
+ truncate_input_chars: int | None = 8192,
+ use_tqdm: bool = False,
+) -> pd.DataFrame:
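+    """Grade each completion independently with the task-specific MT-Bench-101 judge prompt.
+
+    Returns one row per evaluated turn with the rendered prompts, the raw judge
+    completion, and the parsed 1-10 score (None when no valid rating is found).
+    """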
+ prompts = load_mt_bench_101_prompts()
+ completion_by_idx = (
+ completions
+ if "instruction_index" not in completions.columns
+ else completions.set_index("instruction_index")
+ )
+
+ rows: list[dict[str, object]] = []
+ for idx in eval_items.index:
+ eval_row = eval_items.loc[idx]
+ completion_row = completion_by_idx.loc[idx]
+ task = str(eval_row["task"])
+ model_response = safe_text(
+ completion_row.get("completion", ""),
+ truncate_input_chars,
+ )
+
+ dialogue = format_mt_bench_101_dialogue(
+ golden_context=eval_row.get("golden_context") or [],
+ user_message=safe_text(eval_row.get("user_message", ""), truncate_input_chars),
+ assistant_message=model_response,
+ )
+
+ user_prompt = (
+ "The dialogue need to be judged is: \n *** \n "
+ f"{dialogue} \n ***"
+ )
+ if task in MT_BENCH_101_REFERENCE_TASKS:
+ reference_answer = safe_text(
+ eval_row.get("reference_answer"),
+ truncate_input_chars,
+ )
+ user_prompt += (
+ "\n\nThe reference solution is: \n ### \n "
+ f"{reference_answer} \n ###\n\n"
+ )
+
+ system_prompt = (
+ f"{prompts['global_system']}\n\n"
+ f"{prompts['task_prompts'][task]}\n\n"
+ f"{prompts['scoring_format']}"
+ ).strip()
+
+ rows.append(
+ {
+ "instruction_index": idx,
+ "dialogue_id": eval_row["dialogue_id"],
+ "dialogue_uid": eval_row["dialogue_uid"],
+ "task": task,
+ "ability": eval_row.get("ability", MT_BENCH_101_TASK_TO_ABILITY[task]),
+ "turn_index": eval_row["turn_index"],
+ "model_completion": model_response,
+ "system_prompt": system_prompt,
+ "user_prompt": user_prompt,
+ }
+ )
+
+ prompt_template = ChatPromptTemplate.from_messages(
+ [("system", "{system_prompt}"), ("user", "{user_prompt}")]
+ )
+ inputs = prompt_template.batch(
+ [
+ {"system_prompt": row["system_prompt"], "user_prompt": row["user_prompt"]}
+ for row in rows
+ ]
+ )
+ judge_completions = do_inference(
+ chat_model=judge_chat_model,
+ inputs=inputs,
+ use_tqdm=use_tqdm,
+ )
+
+ for row, judge_completion in zip(rows, judge_completions):
+ row["judge_completion"] = judge_completion
+ row["score"] = parse_mt_bench_101_rating(judge_completion)
+
+ return pd.DataFrame(rows)
+
+
+def compute_mt_bench_101_dialogue_scores(scored_turns: pd.DataFrame) -> pd.DataFrame:
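+    """Aggregate per-turn scores to dialogues: the dialogue score is the minimum turn score."""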
+ grouped = (
+ scored_turns.groupby(["dialogue_uid", "dialogue_id", "task", "ability"], as_index=False)[
+ "score"
+ ].min()
+ )
+ grouped = grouped.rename(columns={"score": "dialogue_score"})
+ return grouped
+
+
+def summarize_mt_bench_101_absolute_scores(scored_turns: pd.DataFrame) -> dict[str, object]:
+ dialogue_scores = compute_mt_bench_101_dialogue_scores(scored_turns=scored_turns)
+ per_task_series = dialogue_scores.groupby("task")["dialogue_score"].mean().sort_index()
+ per_ability_series = (
+ dialogue_scores.groupby("ability")["dialogue_score"].mean().sort_index()
+ )
+ overall = per_task_series.mean() if len(per_task_series) else float("nan")
+
+ return {
+ "num_turns": int(len(scored_turns)),
+ "num_scored_turns": int(scored_turns["score"].notna().sum()),
+ "per_task": {
+ task: float(score)
+ for task, score in per_task_series.items()
+ if pd.notna(score)
+ },
+ "per_ability": {
+ ability: float(score)
+ for ability, score in per_ability_series.items()
+ if pd.notna(score)
+ },
+ "overall": float(overall) if pd.notna(overall) else None,
+ }
+
+
+def derive_mt_bench_101_pairwise_preferences(
+ scored_a: pd.DataFrame,
+ scored_b: pd.DataFrame,
+) -> pd.DataFrame:
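+    """Align the two models' per-turn scores and convert each score pair to a preference.
+
+    Turns where either score is missing get a None preference and are counted as
+    missing battles downstream.
+    """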
+ cols = ["instruction_index", "dialogue_uid", "dialogue_id", "task", "ability", "turn_index"]
+ merged = scored_a.loc[:, cols + ["score"]].rename(columns={"score": "score_A"}).merge(
+ scored_b.loc[:, cols + ["score"]].rename(columns={"score": "score_B"}),
+ on=cols,
+ how="inner",
+ )
+
+ scorer = PairScore()
+ preferences = []
+ for _, row in merged.iterrows():
+ score_a = row["score_A"]
+ score_b = row["score_B"]
+ if pd.isna(score_a) or pd.isna(score_b):
+ preferences.append(None)
+ continue
+ preferences.append(float(scorer.preference_from_scores(score_a, score_b)))
+ merged["preference"] = preferences
+ return merged
+
+
+def _compute_preference_stats(preferences: pd.Series) -> dict[str, float | int]:
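+    """Aggregate per-turn preferences into win/loss/tie counts.
+
+    Convention: preference 0.0 = model_A wins, 1.0 = model_B wins, 0.5 = tie,
+    NaN = missing; `winrate` is model_A's tie-adjusted winrate over non-missing battles.
+    """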
+ tie_tol = 1e-12
+ valid = preferences.dropna()
+ num_wins = int(sum(valid < 0.5 - tie_tol))
+ num_losses = int(sum(valid > 0.5 + tie_tol))
+ num_ties = int(sum((valid >= 0.5 - tie_tol) & (valid <= 0.5 + tie_tol)))
+ num_battles = len(preferences)
+ num_missing = num_battles - (num_wins + num_losses + num_ties)
+ denom = num_wins + num_losses + num_ties
+ winrate = float((num_wins + 0.5 * num_ties) / denom) if denom else 0.0
+ return {
+ "num_battles": num_battles,
+ "num_wins": num_wins,
+ "num_losses": num_losses,
+ "num_ties": num_ties,
+ "num_missing": num_missing,
+ "winrate": winrate,
+ }
+
+
+def _grouped_preference_stats(
+ pairwise_df: pd.DataFrame,
+ group_by: str,
+) -> dict[str, dict[str, float | int]]:
+ grouped: dict[str, list[float]] = {}
+ for _, row in pairwise_df.iterrows():
+ key = row[group_by]
+ grouped.setdefault(key, []).append(row["preference"])
+ return {
+ key: _compute_preference_stats(pd.Series(values))
+ for key, values in grouped.items()
+ }
+
+
+def summarize_mt_bench_101_pairwise(pairwise_turns: pd.DataFrame) -> dict[str, object]:
+ turn_preferences = pairwise_turns["preference"]
+ turn_level = {
+ **_compute_preference_stats(turn_preferences),
+ "per_task": _grouped_preference_stats(pairwise_turns, "task"),
+ "per_ability": _grouped_preference_stats(pairwise_turns, "ability"),
+ }
+
+ dialogue_scores = (
+ pairwise_turns.groupby(["dialogue_uid", "dialogue_id", "task", "ability"], as_index=False)[
+ ["score_A", "score_B"]
+ ].min()
+ )
+ scorer = PairScore()
+ dialogue_scores["preference"] = [
+ float(scorer.preference_from_scores(score_a, score_b))
+ if pd.notna(score_a) and pd.notna(score_b)
+ else None
+ for score_a, score_b in zip(dialogue_scores["score_A"], dialogue_scores["score_B"])
+ ]
+ dialogue_level = {
+ **_compute_preference_stats(dialogue_scores["preference"]),
+ "per_task": _grouped_preference_stats(dialogue_scores, "task"),
+ "per_ability": _grouped_preference_stats(dialogue_scores, "ability"),
+ }
+
+ return {
+ "turn_level": turn_level,
+ "dialogue_level": dialogue_level,
+ "preferences": [None if pd.isna(x) else float(x) for x in turn_preferences],
+ }
diff --git a/openjury/mt_bench_101/generate.py b/openjury/mt_bench_101/generate.py
new file mode 100644
index 0000000..ab8d75e
--- /dev/null
+++ b/openjury/mt_bench_101/generate.py
@@ -0,0 +1,86 @@
+from typing import Any
+
+import pandas as pd
+from langchain.prompts import ChatPromptTemplate
+
+from openjury.utils import do_inference, make_model, truncate
+
+DEFAULT_SYSTEM_PROMPT = "You are a helpful assistant."
+
+
+def _escape_template_braces(text: str) -> str:
+ return text.replace("{", "{{").replace("}", "}}")
+
+
+def _build_golden_context_input(
+ *,
+ system_prompt: str,
+ golden_context: list[dict[str, str]],
+ user_message: str,
+ truncate_input_chars: int | None,
+):
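+    """Assemble a chat prompt from the golden dialogue history plus the current user turn.
+
+    Braces are escaped because ChatPromptTemplate treats `{...}` as template variables;
+    the template is rendered immediately, with no variables, via `.invoke({})`.
+    """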
+ messages: list[tuple[str, str]] = [("system", _escape_template_braces(system_prompt))]
+ for turn in golden_context:
+ messages.append(
+ (
+ "user",
+ _escape_template_braces(
+ truncate(str(turn.get("user") or ""), max_len=truncate_input_chars)
+ ),
+ )
+ )
+ messages.append(
+ (
+ "assistant",
+ _escape_template_braces(
+ truncate(str(turn.get("bot") or ""), max_len=truncate_input_chars)
+ ),
+ )
+ )
+ messages.append(
+ (
+ "user",
+ _escape_template_braces(
+ truncate(user_message, max_len=truncate_input_chars)
+ ),
+ )
+ )
+ return ChatPromptTemplate.from_messages(messages).invoke({})
+
+
+def generate_mt_bench_101_completions(
+ eval_items: pd.DataFrame,
+ model: str,
+ truncate_input_chars: int | None = 8192,
+ max_tokens: int | None = 8192,
+ use_tqdm: bool = True,
+ system_prompt: str = DEFAULT_SYSTEM_PROMPT,
+ **model_kwargs: Any,
+) -> pd.DataFrame:
+ """Generate MT-Bench-101 responses from golden context eval items."""
+ chat_model = make_model(model, max_tokens=max_tokens, **model_kwargs)
+
+ inputs = []
+ for _, row in eval_items.iterrows():
+ inputs.append(
+ _build_golden_context_input(
+ system_prompt=system_prompt,
+ golden_context=row.get("golden_context") or [],
+ user_message=str(row.get("user_message") or ""),
+ truncate_input_chars=truncate_input_chars,
+ )
+ )
+
+ completions = do_inference(chat_model=chat_model, inputs=inputs, use_tqdm=use_tqdm)
+ idxs = eval_items.index.tolist()
+ return pd.DataFrame(
+ {
+ "instruction_index": idxs,
+ "dialogue_id": [eval_items.loc[idx, "dialogue_id"] for idx in idxs],
+ "dialogue_uid": [eval_items.loc[idx, "dialogue_uid"] for idx in idxs],
+ "task": [eval_items.loc[idx, "task"] for idx in idxs],
+ "ability": [eval_items.loc[idx, "ability"] for idx in idxs],
+ "turn_index": [eval_items.loc[idx, "turn_index"] for idx in idxs],
+ "completion": completions,
+ }
+ )
diff --git a/openjury/mt_bench_101/pipeline.py b/openjury/mt_bench_101/pipeline.py
new file mode 100644
index 0000000..28aa2ee
--- /dev/null
+++ b/openjury/mt_bench_101/pipeline.py
@@ -0,0 +1,198 @@
+"""MT-Bench-101 evaluation pipeline."""
+
+from __future__ import annotations
+
+import json
+import os
+from dataclasses import asdict
+from datetime import datetime
+from pathlib import Path
+from typing import TYPE_CHECKING
+
+import pandas as pd
+
+from openjury.instruction_dataset import load_instructions
+from openjury.mt_bench_101.evaluate import (
+ derive_mt_bench_101_pairwise_preferences,
+ judge_mt_bench_101_single,
+ summarize_mt_bench_101_absolute_scores,
+ summarize_mt_bench_101_pairwise,
+)
+from openjury.mt_bench_101.generate import generate_mt_bench_101_completions
+from openjury.utils import cache_function_dataframe, make_model
+
+if TYPE_CHECKING:
+ from openjury.config import CliArgs
+
+
+def _generate_mt_bench_101_completions(
+ args: CliArgs,
+ eval_items_df: pd.DataFrame,
+ ignore_cache: bool,
+) -> tuple[pd.DataFrame, pd.DataFrame]:
+ def _run_generation(model_name: str) -> pd.DataFrame:
+ return generate_mt_bench_101_completions(
+ eval_items=eval_items_df,
+ model=model_name,
+ truncate_input_chars=args.truncate_all_input_chars,
+ max_tokens=args.max_out_tokens_models,
+ use_tqdm=args.use_tqdm,
+ max_model_len=args.max_model_len,
+ chat_template=args.chat_template,
+ )
+
+ completions_a = cache_function_dataframe(
+ lambda: _run_generation(args.model_A),
+ ignore_cache=ignore_cache,
+ cache_name=f"mt-bench-101_{args.model_A}_{args.n_instructions}",
+ ).set_index("instruction_index")
+
+ completions_b = cache_function_dataframe(
+ lambda: _run_generation(args.model_B),
+ ignore_cache=ignore_cache,
+ cache_name=f"mt-bench-101_{args.model_B}_{args.n_instructions}",
+ ).set_index("instruction_index")
+ return completions_a, completions_b
+
+
+def _build_mt_bench_101_result_name(args: CliArgs, suffix: str | None = None) -> str:
+ name = f"{args.dataset}-{args.model_A}-{args.model_B}-{args.judge_model}"
+ if suffix:
+ name += f"-{suffix}"
+ return name.replace("/", "_")
+
+
+def _save_mt_bench_101_results(
+ *,
+ args: CliArgs,
+ results: dict[str, object],
+ annotations_df: pd.DataFrame,
+ name_suffix: str | None = None,
+) -> None:
+ name = _build_mt_bench_101_result_name(args, suffix=name_suffix)
+ res_folder = Path(args.result_folder) / name
+ res_folder.mkdir(parents=True, exist_ok=True)
+
+ with open(res_folder / f"args-{name}.json", "w") as f:
+ json.dump(asdict(args), f, indent=2)
+
+ annotations_df.to_csv(res_folder / f"{name}-annotations.csv", index=False)
+
+ with open(res_folder / f"results-{name}.json", "w") as f:
+ json.dump(results, f, indent=2)
+
+
+def run_mt_bench_101(args: CliArgs, ignore_cache: bool) -> pd.Series:
+ """MT-Bench-101 pipeline with single-answer grading."""
+    if args.mt_bench_compatibility != "openjury" or args.mt_bench_turns != "both":
+        print(
+            "MT-Bench-101 is a different benchmark from the original MT-Bench. "
+            "--mt_bench_turns and --mt_bench_compatibility have no effect for this dataset."
+        )
+    if args.swap_mode != "fixed":
+        print(
+            "--swap_mode has no effect for mt-bench-101 since it performs single-answer "
+            "grading before comparing the models."
+        )
+
+ eval_items_df = load_instructions("mt-bench-101", n_instructions=args.n_instructions)
+ print(
+ "Generating completions from golden context for MT-Bench-101 with "
+ f"{args.model_A} and {args.model_B}."
+ )
+ completions_a, completions_b = _generate_mt_bench_101_completions(
+ args=args,
+ eval_items_df=eval_items_df,
+ ignore_cache=ignore_cache,
+ )
+
+ judge_chat_model = make_model(
+ model=args.judge_model,
+ max_tokens=args.max_out_tokens_judge,
+ temperature=0.6,
+ max_model_len=args.max_model_len,
+ chat_template=args.chat_template,
+ )
+ scored_a = judge_mt_bench_101_single(
+ judge_chat_model=judge_chat_model,
+ eval_items=eval_items_df,
+ completions=completions_a,
+ truncate_input_chars=args.truncate_all_input_chars,
+ use_tqdm=args.use_tqdm,
+ )
+ scored_b = judge_mt_bench_101_single(
+ judge_chat_model=judge_chat_model,
+ eval_items=eval_items_df,
+ completions=completions_b,
+ truncate_input_chars=args.truncate_all_input_chars,
+ use_tqdm=args.use_tqdm,
+ )
+
+ absolute_a = summarize_mt_bench_101_absolute_scores(scored_turns=scored_a)
+ absolute_b = summarize_mt_bench_101_absolute_scores(scored_turns=scored_b)
+ pairwise_turns = derive_mt_bench_101_pairwise_preferences(
+ scored_a=scored_a,
+ scored_b=scored_b,
+ )
+ pairwise_summary = summarize_mt_bench_101_pairwise(pairwise_turns=pairwise_turns)
+ dialogue_pairwise = pairwise_summary["dialogue_level"]
+
+ print(f"{args.model_A} vs {args.model_B} judged by {args.judge_model}")
+ print(
+ "MT-Bench-101 dialogue-level pairwise winrate(A): "
+ f"{dialogue_pairwise['winrate']:.1%}"
+ )
+
+ ann_cols = [
+ "instruction_index",
+ "dialogue_uid",
+ "dialogue_id",
+ "task",
+ "ability",
+ "turn_index",
+ "model_completion",
+ "judge_completion",
+ "score",
+ ]
+ annotations_a = scored_a.loc[:, ann_cols].copy()
+ annotations_a["evaluated_model"] = args.model_A
+ annotations_b = scored_b.loc[:, ann_cols].copy()
+ annotations_b["evaluated_model"] = args.model_B
+ annotations_df = pd.concat([annotations_a, annotations_b], ignore_index=True)
+ annotations_df = annotations_df.merge(
+ pairwise_turns.loc[
+ :, ["instruction_index", "score_A", "score_B", "preference"]
+ ],
+ on="instruction_index",
+ how="left",
+ validate="many_to_one",
+ )
+
+ results = {
+ "dataset": args.dataset,
+ "model_A": args.model_A,
+ "model_B": args.model_B,
+ "judge_model": args.judge_model,
+ "judge_temperature": 0.6,
+ "evaluation_mode": "single_answer_grading",
+ "num_battles": dialogue_pairwise["num_battles"],
+ "winrate": dialogue_pairwise["winrate"],
+ "num_wins": dialogue_pairwise["num_wins"],
+ "num_losses": dialogue_pairwise["num_losses"],
+ "num_ties": dialogue_pairwise["num_ties"],
+ "num_missing": dialogue_pairwise["num_missing"],
+ "per_category": dialogue_pairwise["per_task"],
+ "model_A_scores": absolute_a,
+ "model_B_scores": absolute_b,
+ "pairwise": pairwise_summary,
+ "preferences": pairwise_summary["preferences"],
+ "date": str(datetime.now().isoformat()),
+ "user": os.getenv("USER", ""),
+ }
+
+ _save_mt_bench_101_results(
+ args=args,
+ results=results,
+ annotations_df=annotations_df,
+ name_suffix="mtbench_101",
+ )
+ return pd.Series(pairwise_summary["preferences"])
diff --git a/openjury/prompts/mt_bench/system-base.txt b/openjury/prompts/mt_bench/system-base.txt
new file mode 100644
index 0000000..b4aff2e
--- /dev/null
+++ b/openjury/prompts/mt_bench/system-base.txt
@@ -0,0 +1 @@
+Please act as an impartial judge and evaluate the quality of the responses provided by two AI assistants to the user {user_subject}. {task_description} {focus_line}Begin your evaluation by {begin_instruction}. Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. Do not allow the length of the responses to influence your evaluation. Do not favor certain names of the assistants. Be as objective as possible. After providing your explanation, output your final verdict by strictly following this format: "[[A]]" if assistant A is better, "[[B]]" if assistant B is better, and "[[C]]" for a tie.
diff --git a/openjury/prompts/mt_bench/user-multi-base.txt b/openjury/prompts/mt_bench/user-multi-base.txt
new file mode 100644
index 0000000..33abb79
--- /dev/null
+++ b/openjury/prompts/mt_bench/user-multi-base.txt
@@ -0,0 +1,32 @@
+{reference_block}<|The Start of Assistant A's Conversation with User|>
+
+### User:
+{{question_1}}
+
+### Assistant A:
+{{answer_a_1}}
+
+### User:
+{{question_2}}
+
+### Assistant A:
+{{answer_a_2}}
+
+<|The End of Assistant A's Conversation with User|>
+
+
+<|The Start of Assistant B's Conversation with User|>
+
+### User:
+{{question_1}}
+
+### Assistant B:
+{{answer_b_1}}
+
+### User:
+{{question_2}}
+
+### Assistant B:
+{{answer_b_2}}
+
+<|The End of Assistant B's Conversation with User|>
diff --git a/openjury/prompts/mt_bench/user-multi-reference-block.txt b/openjury/prompts/mt_bench/user-multi-reference-block.txt
new file mode 100644
index 0000000..703554d
--- /dev/null
+++ b/openjury/prompts/mt_bench/user-multi-reference-block.txt
@@ -0,0 +1,16 @@
+<|The Start of Reference Answer|>
+
+### User:
+{question_1}
+
+### Reference answer:
+{ref_answer_1}
+
+### User:
+{question_2}
+
+### Reference answer:
+{ref_answer_2}
+
+<|The End of Reference Answer|>
+
diff --git a/openjury/prompts/mt_bench/user-single-base.txt b/openjury/prompts/mt_bench/user-single-base.txt
new file mode 100644
index 0000000..ee7701c
--- /dev/null
+++ b/openjury/prompts/mt_bench/user-single-base.txt
@@ -0,0 +1,10 @@
+[User Question]
+{{question}}
+
+{reference_block}[The Start of Assistant A's Answer]
+{{answer_a}}
+[The End of Assistant A's Answer]
+
+[The Start of Assistant B's Answer]
+{{answer_b}}
+[The End of Assistant B's Answer]
diff --git a/openjury/prompts/mt_bench/user-single-reference-block.txt b/openjury/prompts/mt_bench/user-single-reference-block.txt
new file mode 100644
index 0000000..1b687d2
--- /dev/null
+++ b/openjury/prompts/mt_bench/user-single-reference-block.txt
@@ -0,0 +1,4 @@
+[The Start of Reference Answer]
+{ref_answer_1}
+[The End of Reference Answer]
+
diff --git a/openjury/prompts/mt_bench_101/AR.txt b/openjury/prompts/mt_bench_101/AR.txt
new file mode 100644
index 0000000..61e77ae
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/AR.txt
@@ -0,0 +1,13 @@
+The AI assistant's understanding of references is essential for maintaining a coherent dialogue. The following criteria should be used to evaluate its performance:
+
+1. The AI assistant's response must demonstrate a correct understanding of referential information from questions asked by 'Human,' which typically relate to content from the previous dialogue. Ideally, the AI should explicitly acknowledge or clarify these references in its reply.
+2. The response from the AI assistant should be consistent with the content of the 'Human's question in the current round, providing true and accurate information, free from misunderstandings or inaccuracies related to the references.
+
+Scoring Guidelines:
+
+- 1-3 points: The AI assistant fails to recognize or correctly interpret the referential information, leading to responses that are either inaccurate or unrelated to the previous content.
+- 4-6 points: The AI assistant shows a partial understanding of references, but the response might include some inaccuracies or fail to fully utilize the referential information.
+- 7-9 points: The AI assistant's response indicates a good understanding of the references, with only slight inaccuracies or omissions in the connection to the previous dialogue.
+- 10 points: The AI assistant demonstrates excellent understanding and use of referential information, perfectly aligning its response with the previous content and the current question accurately and precisely.
+
+In addition to the score, please provide an explanation that specifically addresses how the AI assistant's response demonstrates its ability or inability to understand and use referential information in accordance with the criteria above.
diff --git a/openjury/prompts/mt_bench_101/CC.txt b/openjury/prompts/mt_bench_101/CC.txt
new file mode 100644
index 0000000..f9d0bb6
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/CC.txt
@@ -0,0 +1,17 @@
+The AI assistant's capability to resist interference will be measured against these criteria:
+
+
+1. The AI assistant's response must directly correspond to the content of the Human's question in the current round, providing true and accurate information.
+2. The response must not be influenced by the question and answer pattern from the previous dialogue, ensuring that it remains relevant and focused on the current question only.
+
+
+Scoring Guidelines:
+
+
+- 1-3 points: The AI assistant's response is largely influenced by previous interactions, fails to address the current question accurately, or provides false information.
+- 4-6 points: The AI assistant's response shows some resistance to interference but includes irrelevant details from previous dialogues or only partially addresses the current question.
+- 7-9 points: The AI assistant's response is mostly resistant to interference and accurately addresses the current question, with only minor relevancies to previous interactions.
+- 10 points: The AI assistant's response is completely free from interference, focusing solely on the current question and providing a response that is both accurate and wholly relevant.
+
+
+Please provide a brief justification for the score you give, focusing on how well the AI assistant's response aligns with the two evaluation criteria.
diff --git a/openjury/prompts/mt_bench_101/CM.txt b/openjury/prompts/mt_bench_101/CM.txt
new file mode 100644
index 0000000..db8beb7
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/CM.txt
@@ -0,0 +1,15 @@
+The capacity of a large language model to recall and utilize previously mentioned information from earlier in the conversation is a critical indicator of its conversational memory abilities. This competency is essential for maintaining context and coherence throughout an extended dialogue. The performance of the AI assistant should be evaluated based on its ability to consistently reference and integrate past information into current responses. The evaluation criteria are as follows:
+
+1.Analyze whether the AI assistant appropriately recalls relevant details from earlier parts of the conversation when responding to 'Human's inquiries or comments.
+2.Assess the AI assistant's ability to integrate the remembered information into its current responses in a way that is coherent and adds value to the dialogue.
+3.Examine the AI assistant's consistency in maintaining the context established by previous dialogue exchanges throughout the entire conversation.
+4.Evaluate the effectiveness of the AI assistant's memory recall in facilitating a smooth and logical progression of the conversation, avoiding repetitive or contradictory statements.
+Scoring Guidelines:
+
+1-3 points: The AI assistant demonstrates poor recall of previous conversation details, leading to inconsistent or contradictory responses, and fails to maintain the dialogue's context, resulting in a disjointed or unclear conversation flow.
+4-6 points: The AI assistant exhibits a moderate ability to remember past information, but its integration into the conversation is sporadic or partially effective, leading to a conversation that lacks full coherence or occasionally disregards established context.
+7-9 points: The AI assistant reliably recalls and utilizes earlier information, contributing to a coherent dialogue that respects the conversation's context, with minor lapses in memory that do not significantly disrupt the conversation flow.
+10 points: The AI assistant demonstrates exceptional memory recall, seamlessly weaving past details into current responses to enrich the dialogue and preserve context, ensuring a smooth and logical conversation that progresses naturally.
+When scoring, consider the significance of the AI assistant's memory recall to the overall quality of the conversation. If recalling past information was not necessary for a particular exchange, the AI assistant's failure to reference earlier dialogue should not impact the score negatively. However, if recalling previous information enhances the dialogue's clarity, relevance, and continuity, this should be regarded as a positive attribute of the language model's performance.
+
+Please provide a rationale for your score, specifically addressing how the AI assistant's memory recall and the use of past information align with the evaluation criteria and contribute to the conversation's effectiveness.
diff --git a/openjury/prompts/mt_bench_101/GR.txt b/openjury/prompts/mt_bench_101/GR.txt
new file mode 100644
index 0000000..42968e9
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/GR.txt
@@ -0,0 +1,15 @@
+The AI assistant's general reasoning capabilities are crucial for accurately addressing and explaining a wide range of problems posed by 'Human'. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the cogency of its reasoning process. The evaluation criteria are as follows:
+
+1. Verify the accuracy of the AI assistant's answer against the provided reference solution in format '### reference solution ###' for the specific problem.
+2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows the principles of sound reasoning.
+3. Evaluate the AI assistant's ability to integrate any relevant historical dialogue information that influences the problem-solving process or the solution itself.
+4. Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.
+Scoring Guidelines:
+
+1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not adhere to standards of sound reasoning.
+4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps but generally follows sound reasoning principles.
+7-9 points: The AI assistant gives correct answers with a well-articulated reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.
+10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with sound reasoning principles and enhances 'Human's understanding.
+When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.
+
+Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the general reasoning process, considering the evaluation criteria and the comparison with the reference solution.
diff --git a/openjury/prompts/mt_bench_101/IC.txt b/openjury/prompts/mt_bench_101/IC.txt
new file mode 100644
index 0000000..db991a7
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/IC.txt
@@ -0,0 +1,16 @@
+The AI assistant's ability to engage in a productive dialogue is often enhanced by its use of counter-questions, particularly when dealing with incomplete or vague queries. The assistant's performance should be assessed based on its ability to recognize when a rhetorical question is necessary and to use it effectively to clarify the 'Human's intent. The evaluation criteria are as follows:
+
+1. Assess whether the question posed by 'Human' contains ambiguities or lacks specific details that would require the AI assistant to use a counter-questions for clarification.
+2. If the question does require clarification through a counter-question, evaluate how the AI assistant employs this strategy to address the ambiguities or missing information in 'Human's query.
+3. Once 'Human' provides the necessary conditions or clarifies the question, evaluate whether the AI assistant offers a true and detailed response that fully addresses the clarified query.
+
+Scoring Guidelines:
+
+- 1-3 points: The AI assistant fails to identify the need for a rhetorical question when necessary, or it employs rhetorical questions ineffectively, leading to answers that do not align with 'Human's query, or lack the detail required to fully clarify the question.
+- 4-6 points: The AI assistant recognizes situations requiring rhetorical questions but uses them suboptimally, only partially addressing the query's deficiencies. Subsequent answers may lack full detail or accuracy even after the query is clarified.
+- 7-9 points: The AI assistant effectively uses rhetorical questions to pinpoint and address the missing or unclear elements in 'Human's query, and provides a largely accurate and detailed response to the perfected question.
+- 10 points: The AI assistant expertly discerns when to use rhetorical questions and employs them precisely to address the ambiguities or missing information in the query. Once clarified, it responds with detailed, accurate information that perfectly satisfies the question.
+
+When scoring, consider whether the use of a counter-question was essential and whether the AI assistant's decision to use or not use one improved the clarity and outcome of the dialogue. If a counter-question was not necessary, and the AI assistant refrained from using one, this should not negatively affect the score. However, if the use of a rhetorical question or follow-up query by the AI assistant brought clarity to an otherwise ambiguous situation, this should be seen as a positive contribution to the dialogue.
+
+Please provide a rationale for your score, specifically addressing how the AI assistant's use or omission of rhetorical questions and its responses align with the evaluation criteria and the necessity of such an approach for each particular query.
diff --git a/openjury/prompts/mt_bench_101/MR.txt b/openjury/prompts/mt_bench_101/MR.txt
new file mode 100644
index 0000000..4315e11
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/MR.txt
@@ -0,0 +1,15 @@
+The AI assistant's mathematical reasoning capabilities are vital for accurately solving and explaining mathematical problems posed by 'Human'. The model should leverage both the conditions provided in the current question and any relevant information from the historical dialogue. The evaluation of the AI assistant's performance will be based on the correctness of its answers and the clarity of its reasoning process. The evaluation criteria are as follows:
+
+1. Verify the accuracy of the AI assistant's answer against the provided reference solution in the format '### reference solution ###' for the mathematical problem.
+2. Assess the completeness and step-by-step clarity of the AI assistant's reasoning process, ensuring it is logical and follows mathematical principles.
+3. Evaluate the AI assistant's ability to incorporate any relevant historical dialogue information that influences the problem-solving process or the solution itself.
+4. Appraise the AI assistant's communication of the solution in a manner that is understandable and instructive to 'Human', potentially aiding their learning or comprehension.
+Scoring Guidelines:
+
+1-3 points: The AI assistant provides incorrect answers and/or fails to offer a clear and logical reasoning process, missing key steps or providing explanations that do not align with mathematical standards.
+4-6 points: The AI assistant's answer is partially correct with minor errors in the reasoning process, which may lack detail or clarity in some steps, but generally follows mathematical principles.
+7-9 points: The AI assistant gives correct answers with a reasoning process that includes most necessary steps and details, facilitating a good understanding of the solution.
+10 points: The AI assistant provides a completely correct answer accompanied by a detailed and meticulously clear step-by-step reasoning process that is fully aligned with mathematical principles and enhances 'Human's understanding.
+When scoring, focus on the precision of the AI assistant's answer and the extent to which the reasoning process is elaborated. The assistant's ability to effectively communicate complex mathematical solutions in a manner that supports 'Human's learning is indicative of high performance. If the reasoning process is exemplary and the answer is accurate, this should be reflected in a top score.
+
+Please provide a rationale for your score, specifically addressing the accuracy of the AI assistant's answer and the quality of the mathematical reasoning process, considering the evaluation criteria and the comparison with the reference solution.
diff --git a/openjury/prompts/mt_bench_101/PI.txt b/openjury/prompts/mt_bench_101/PI.txt
new file mode 100644
index 0000000..8702484
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/PI.txt
@@ -0,0 +1,15 @@
+The AI assistant's interactivity, represented by its ability to proactively initiate and sustain engaging dialogues with 'Human', is a key aspect of a dynamic conversational experience. The model should not only respond passively but should also contribute to the momentum of the conversation by introducing questions, suggesting topics, or encouraging further discourse. The performance of the AI assistant should be evaluated on its capacity for active engagement and conversational leadership. The evaluation criteria are as follows:
+
+1. Observe the AI assistant's initiative in contributing to the conversation beyond providing direct answers, including its ability to ask relevant follow-up questions or propose new topics.
+2. Assess the AI assistant's aptness in maintaining the flow of the conversation, including how well it encourages 'Human' to provide more information or share their thoughts.
+3. Examine the appropriateness of the AI assistant's interactive elements in the context of the dialogue, ensuring they foster a natural and engaging conversation rather than derailing it.
+4. Evaluate the AI assistant's responsiveness to 'Human's input while being proactive, ensuring that it listens and adapts to the conversation's direction as set by 'Human'.
+Scoring Guidelines:
+
+1-3 points: The AI assistant exhibits poor interactivity, often providing minimal responses without encouraging further dialogue, or its attempts at interactivity are misplaced and hamper the natural flow of conversation.
+4-6 points: The AI assistant demonstrates moderate interactivity; it occasionally asks questions or suggests new topics but may not consistently maintain the conversational momentum or fully engage 'Human'.
+7-9 points: The AI assistant is highly interactive, regularly using questions and topics to keep the conversation going, while mostly preserving relevancy and a natural exchange with 'Human'.
+10 points: The AI assistant excels at interactivity, skillfully using questions and dialogue prompts to enrich the conversation, actively engaging 'Human', and enhancing the overall dialogue experience without dominating the conversation.
+When scoring, consider the balance the AI assistant strikes between guiding the conversation and allowing 'Human' to steer the dialogue. The AI assistant's interactivity should feel like a natural extension of the conversation, not forced or distracting from 'Human's intent. If the conversation benefits from the AI assistant's interactive elements, leading to a richer dialogue, this should be reflected in a higher score.
+
+Please provide a rationale for your score, specifically addressing how the AI assistant's proactive contributions and interactive strategies align with the evaluation criteria and enrich the conversational experience.
diff --git a/openjury/prompts/mt_bench_101/SA.txt b/openjury/prompts/mt_bench_101/SA.txt
new file mode 100644
index 0000000..03fb0c0
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/SA.txt
@@ -0,0 +1,13 @@
+We aim to specifically evaluate the ability of the large language model (AI assistant) to maintain confidence in its original response when challenged by 'Human', provided that the challenge is incorrect. The criteria for evaluation are as follows:
+
+1. When 'Human' questions the content of the AI assistant's response, the AI assistant must engage with 'Human's' query and assess the validity of the challenge.
+2. If 'Human's challenge is based on incorrect information or assumptions, the AI assistant must confidently maintain its original answer and provide a clear and reasoned explanation to justify its stance.
+
+Scoring Guidelines:
+
+- 1-3 points: The AI assistant either does not engage with 'Human's' challenge appropriately or changes its original response without proper justification, even when 'Human's' challenge is incorrect.
+- 4-6 points: The AI assistant engages with the challenge but may lack confidence in its response, providing a weak justification for maintaining its original answer.
+- 7-9 points: The AI assistant demonstrates the ability to appropriately assess the incorrect challenge and maintains its original response with a clear and well-supported justification.
+- 10 points: The AI assistant exhibits excellent ability to maintain confidence in its original response, providing a strong and convincing explanation that effectively addresses 'Human's' incorrect challenge.
+
+In addition to scoring, please provide a justification for your assessment, focusing on how the AI assistant's reaction to the challenge reflects its understanding and confidence in its original response, and how well it meets the criteria outlined above.
diff --git a/openjury/prompts/mt_bench_101/SC.txt b/openjury/prompts/mt_bench_101/SC.txt
new file mode 100644
index 0000000..1472a0d
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/SC.txt
@@ -0,0 +1,13 @@
+We aim to specifically evaluate the self-correction ability of the large language model (AI assistant) when the 'Human' identifies an error in the AI assistant's initial response. The criteria for evaluation are as follows:
+
+1. Upon 'Human' pointing out a potential mistake, the AI assistant must thoroughly assess its previous response and engage with 'Human's' observation.
+2. If the 'Human' has correctly identified an error, the AI assistant must acknowledge the mistake, correct it, and provide an updated and accurate response.
+
+Scoring Guidelines:
+
+- 1-3 points: The AI assistant fails to recognize or adequately address the error identified by 'Human,' and does not make the necessary corrections to its response.
+- 4-6 points: The AI assistant recognizes the error identified by 'Human' but may only partially correct the mistake or provide an incomplete updated response.
+- 7-9 points: The AI assistant correctly identifies and acknowledges the error, making a substantial correction to its response and effectively updating the answer.
+- 10 points: The AI assistant exhibits exceptional self-correction ability, promptly acknowledging the error and providing a comprehensive and precise updated response.
+
+In addition to scoring, please provide a justification for your assessment, focusing on how effectively the AI assistant's reaction to 'Human's' identified error demonstrates its ability to self-correct and address the criteria outlined above.
diff --git a/openjury/prompts/mt_bench_101/SI.txt b/openjury/prompts/mt_bench_101/SI.txt
new file mode 100644
index 0000000..61ffe78
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/SI.txt
@@ -0,0 +1,13 @@
+We aim to specifically evaluate the command-following ability of the large language model (AI assistant). The criteria for evaluation are as follows:
+
+1. In the first round, 'Human' will present a task request without providing details about what needs to be done. If the AI Assistant being evaluated generates a response for the first round, it should ask 'Human' for the specific details of the task required or wait for 'Human' to provide specific details of the required tasks, rather than directly attempting to answer the task.
+2. Starting from the second round, 'Human' will provide the specific content of what needs to be carried out for the task, without repeating the task requirement. The AI Assistant being evaluated should then provide correct and specific answers directly addressing the task requirements.
+
+Please rate the AI assistant's response using a 1 to 10 scale based on the following guidelines:
+
+- 1-3 points: The AI assistant failed to understand the task request and neither asked relevant questions nor provided information related to the task.
+- 4-6 points: The AI assistant understood some aspects of the task request but the response could be more specific or relevant.
+- 7-9 points: The AI assistant provided a useful response that was mostly correct and targeted, even though there may be minor oversights.
+- 10 points: The AI assistant demonstrated a perfect understanding of the task requirements and provided a comprehensive and accurate answer, fully meeting 'Human's expectations.
+
+Additionally, please provide a brief justification for the score given, particularly highlighting how the AI assistant's response aligns with or deviates from the above criteria. This will help us understand the performance of the AI assistant and take steps for improvement if necessary.
diff --git a/openjury/prompts/mt_bench_101/TS.txt b/openjury/prompts/mt_bench_101/TS.txt
new file mode 100644
index 0000000..2bbc354
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/TS.txt
@@ -0,0 +1,15 @@
+The AI assistant's ability to handle shifts in conversation topics is crucial for maintaining relevance and adaptability during a dialogue. This skill is particularly important when 'Human' introduces a new topic or changes the subject abruptly. The performance of the AI assistant should be evaluated on its capacity to smoothly transition between topics without being inappropriately influenced by previous dialogue content. The evaluation criteria are as follows:
+
+1. Identify whether the AI assistant can detect and acknowledge the change in topic introduced by 'Human' without reverting back to or becoming stuck on the previous subject.
+2. Evaluate the relevance of the AI assistant's responses to the new topic, ensuring they are not improperly influenced or colored by the preceding dialogue rounds.
+3. Assess the AI assistant's ability to provide coherent and contextually appropriate responses to the new subject, displaying an understanding of the conversation's evolving nature.
+4. Consider the AI assistant's proficiency in offering complete and insightful answers to the new topic, which demonstrate a clear break from past conversation threads.
+Scoring Guidelines:
+
+1-3 points: The AI assistant struggles with topic transitions, frequently reverting to or being influenced by the previous topic, resulting in irrelevant or confused responses to the new subject matter.
+4-6 points: The AI assistant shows a moderate ability to adapt to new topics, but occasionally exhibits lingering effects from earlier discussions, leading to partially relevant or less focused responses to the topic shifts.
+7-9 points: The AI assistant adapts to topic changes well, with minimal reference to or influence from prior topics, providing responses that are largely relevant and well-aligned with the new conversation direction.
+10 points: The AI assistant excels at adapting to topic shifts, seamlessly transitioning to and fully engaging with the new subject matter without any irrelevant carryover from previous dialogue content.
+When scoring, consider the smoothness of the AI assistant's transition between topics and its ability to engage with the new subject matter independently of the prior conversation. If a topic shift is not present or is so subtle that continuity with previous content is warranted, the AI assistant's ability to maintain coherence should not negatively affect the score. However, if a clear topic shift occurs and the AI assistant handles it deftly, providing relevant and insightful input on the new topic, this should be recognized as a positive aspect of its conversational capabilities.
+
+Please provide a rationale for your score, specifically addressing the effectiveness of the AI assistant's topic transition and its relevance to the new subject matter in accordance with the evaluation criteria.
diff --git a/openjury/prompts/mt_bench_101/global_system.txt b/openjury/prompts/mt_bench_101/global_system.txt
new file mode 100644
index 0000000..ca94e9f
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/global_system.txt
@@ -0,0 +1,5 @@
+Please act as an impartial judge following these instructions: In the following conversations, the response of the "assistant" in the last round of conversations is the output of the large language model (AI assistant) that needs to be evaluated.
+
+Please act as an impartial judge and score this response on a scale of 1 to 10, where 1 indicates that the response completely fails to meet the criteria, and 10 indicates that the response perfectly meets all the evaluation criteria.
+
+Note that only the response of the "assistant" in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated; the previous conversations are the ground truth history which do NOT need to be evaluated.
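
The per-task criteria files in this patch (e.g. MR, SA, SI, rephrasing) are meant to be combined with this global system prompt and with the output-format block added below. A rough, hypothetical sketch of how such an assembly could look — the helper name, the per-task file naming, and the concatenation order are assumptions inferred from the files in this patch, not openjury's actual implementation:

```python
from pathlib import Path

# Hypothetical helper; openjury's real prompt assembly may differ.
PROMPT_DIR = Path("openjury/prompts/mt_bench_101")


def build_judge_system_prompt(task: str) -> str:
    """Concatenate global instructions, per-task criteria, and the scoring format."""
    parts = [
        PROMPT_DIR / "global_system.txt",
        PROMPT_DIR / f"{task}.txt",  # e.g. "MR", "SA", "SI"
        PROMPT_DIR / "scoring_format.txt",
    ]
    return "\n\n".join(p.read_text(encoding="utf-8").strip() for p in parts)
```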
diff --git a/openjury/prompts/mt_bench_101/rephrasing.txt b/openjury/prompts/mt_bench_101/rephrasing.txt
new file mode 100644
index 0000000..9bd5e9c
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/rephrasing.txt
@@ -0,0 +1,13 @@
+We aim to specifically evaluate the paraphrasing ability of the large language model (AI assistant). The criteria for evaluation are as follows:
+
+1. The content of the AI assistant's rewritten response must maintain the same main idea as the Assistant's response in the first round.
+2. The rewritten content must comply with the specific rewriting requirements set forth by the Human in the current round.
+
+Scoring Guidelines:
+
+- 1-3 points: The rewritten response significantly deviates from the original main idea or fails to meet the rewriting requirements.
+- 4-6 points: The rewritten response captures the original main idea but only partially meets the rewriting requirements or lacks fluency/coherence.
+- 7-9 points: The rewritten response maintains the original main idea and satisfies most of the rewriting requirements with minor discrepancies or stylistic issues.
+- 10 points: The rewritten response perfectly preserves the original main idea and fulfills all of the rewriting requirements set by Human, exhibiting a seamless and natural integration of the required changes.
+
+Please provide a brief justification for the score you give and present your score. Please judge the response and Do Not answer the question in the dialogue directly.
diff --git a/openjury/prompts/mt_bench_101/scoring_format.txt b/openjury/prompts/mt_bench_101/scoring_format.txt
new file mode 100644
index 0000000..b28f9eb
--- /dev/null
+++ b/openjury/prompts/mt_bench_101/scoring_format.txt
@@ -0,0 +1,9 @@
+Note that only the response of the "assistant" in the LAST ROUND of conversations is the output of the large language model (the AI assistant) that needs to be evaluated.
+
+You must provide your explanation. After providing your explanation, show the score by strictly following this format: "Rating: [[score]]", for example "Rating: [[6]]".
+
+The DIALOGUE needs to be judged in this format:
+
+***
+DIALOGUE
+***
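
The tests below exercise a `parse_mt_bench_101_rating` helper that extracts the `Rating: [[score]]` verdict requested by this file. A minimal, hypothetical sketch of such a parser (the real helper in `openjury/mt_bench_101/evaluate.py` may be implemented differently):

```python
import re


def parse_rating(judge_output: str) -> float | None:
    """Extract a 1-10 rating written as 'Rating: [[7]]' from judge text.

    Returns None if no double-bracketed integer in the 1-10 range is present.
    """
    # Find every [[N]] token and keep the last one in range, treating it as
    # the judge's final verdict.
    for token in reversed(re.findall(r"\[\[(\d+)\]\]", judge_output)):
        score = float(token)
        if 1 <= score <= 10:
            return score
    return None
```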
diff --git a/openjury/prompts/prompt-with-explanation.txt b/openjury/prompts/prompt-with-explanation.txt
deleted file mode 100644
index 6600f51..0000000
--- a/openjury/prompts/prompt-with-explanation.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-<|User Prompt|>
-{user_prompt}
-
-<|The Start of Assistant A's Answer|>
-{completion_A}
-<|The End of Assistant A's Answer|>
-
-<|The Start of Assistant B's Answer|>
-{completion_B}
-<|The End of Assistant B's Answer|>
-
-# Your output
-
-## Format description
-Your output should follow this format:
-```
-score_A:
-score_B:
-```
-
-## Your output, do not repeat the input above, first starts with an explanation of your judgement
diff --git a/openjury/prompts/prompt.txt b/openjury/prompts/prompt.txt
index 21d2e48..38021e6 100644
--- a/openjury/prompts/prompt.txt
+++ b/openjury/prompts/prompt.txt
@@ -1,13 +1,13 @@
<|User Prompt|>
{user_prompt}

-<|The Start of Assistant A's Answer|>
+<|The Start of Assistant A's {completion_label}|>
{completion_A}
-<|The End of Assistant A's Answer|>
+<|The End of Assistant A's {completion_label}|>

-<|The Start of Assistant B's Answer|>
+<|The Start of Assistant B's {completion_label}|>
{completion_B}
-<|The End of Assistant B's Answer|>
+<|The End of Assistant B's {completion_label}|>

# Your output

@@ -18,5 +18,4 @@ score_A:
```
-## Your output, do not repeat the input above
-```
+## Your output, do not repeat the input above{explanation_suffix}
diff --git a/openjury/utils.py b/openjury/utils.py
index 27db11d..93563e7 100644
--- a/openjury/utils.py
+++ b/openjury/utils.py
@@ -8,7 +8,6 @@
from huggingface_hub import snapshot_download
import pandas as pd
from tqdm.asyncio import tqdm
-from langchain_community.llms import LlamaCpp
from langchain_openai import ChatOpenAI
from langchain_community.cache import SQLiteCache
from langchain_core.globals import set_llm_cache
@@ -43,6 +42,23 @@ def read_df(filename: Path, **pandas_kwargs) -> pd.DataFrame:
return pd.read_parquet(filename, **pandas_kwargs)
+def truncate(s: str, max_len: int | None = None) -> str:
+ if not isinstance(s, str):
+ return ""
+ if max_len is not None:
+ return s[:max_len]
+ return s
+
+
+def safe_text(value: object, truncate_chars: int | None) -> str:
+ if value is None:
+ return ""
+ is_missing = pd.isna(value)
+ if isinstance(is_missing, bool) and is_missing:
+ return ""
+ return truncate(str(value), max_len=truncate_chars)
+
+
def compute_pref_summary(prefs: pd.Series) -> dict[str, float | int]:
"""Compute win/loss/tie stats for preference series (0=A, 0.5=tie, 1=B)."""
prefs = pd.Series(prefs, dtype="float64")
@@ -156,7 +172,141 @@ async def ainvoke(self, input, **invoke_kwargs):
return self.message
-class ChatVLLM:
+class BaseLocalModel:
+ """Shared prompt conversion and invoke helpers for local model wrappers."""
+
+ def _to_messages(self, input_item) -> list[dict]:
+ """Convert LangChain prompt input to OpenAI-style messages."""
+ role_map = {"human": "user", "ai": "assistant", "system": "system"}
+
+ if hasattr(input_item, "to_messages"):
+ lc_messages = input_item.to_messages()
+ return [
+ {"role": role_map.get(msg.type, msg.type), "content": msg.content}
+ for msg in lc_messages
+ ]
+ elif (
+ isinstance(input_item, list)
+ and input_item
+ and isinstance(input_item[0], tuple)
+ ):
+ return [
+ {"role": role if role != "human" else "user", "content": content}
+ for role, content in input_item
+ ]
+ elif (
+ isinstance(input_item, list)
+ and input_item
+ and isinstance(input_item[0], dict)
+ ):
+ return input_item
+ elif isinstance(input_item, str):
+ return [{"role": "user", "content": input_item}]
+ else:
+ raise ValueError(f"Unsupported input type: {type(input_item)}")
+
+ def _to_raw_text(self, input_item) -> str:
+ """Extract raw text from an input item for text-completion mode."""
+ if isinstance(input_item, str):
+ return input_item
+ if hasattr(input_item, "to_string"):
+ return input_item.to_string()
+ if (
+ isinstance(input_item, list)
+ and input_item
+ and isinstance(input_item[0], dict)
+ ):
+ return "\n".join(msg["content"] for msg in input_item)
+ raise ValueError(f"Cannot extract raw text from: {type(input_item)}")
+
+ def invoke(self, input_item, **invoke_kwargs) -> str:
+ return self.batch([input_item], **invoke_kwargs)[0]
+
+    async def ainvoke(self, input_item, **invoke_kwargs):
+        import asyncio  # local import, as in the previous ChatVLLM.ainvoke
+        loop = asyncio.get_event_loop()
+        return await loop.run_in_executor(
+            None, lambda: self.invoke(input_item, **invoke_kwargs)
+        )
+
+
+class ChatLlamaCppModel(BaseLocalModel):
+ """LlamaCpp wrapper that auto-detects and applies the GGUF chat template.
+
+ Mirrors the ChatVLLM pattern but for local GGUF models via llama-cpp-python.
+
+ Chat template handling:
+ - If the GGUF file embeds a chat template (typical for instruct models),
+ uses ``create_chat_completion()`` which applies the template and
+ handles EOS tokens correctly.
+ - If no template is found (base/pretrained models), falls back to
+ ``create_completion()`` (text mode) and emits a warning.
+
+ Unlike langchain's ``ChatLlamaCpp``, this wrapper explicitly calls
+ ``Llama.reset()`` between conversations to clear stale KV-cache state.
+
+ Sampling defaults:
+ - ``temperature=None`` means do not pass temperature explicitly and keep
+ llama-cpp's backend default behavior.
+ """
+
+ def __init__(
+ self,
+ model_path: str,
+ max_tokens: int = 1024,
+ n_ctx: int = 0,
+ temperature: float | None = None,
+ **kwargs,
+ ):
+ from llama_cpp import Llama
+
+ self.model_path = model_path
+ self.max_tokens = max_tokens
+ self.temperature = temperature
+ self.llama = Llama(
+ model_path=model_path,
+ n_ctx=n_ctx,
+ verbose=True,
+ **kwargs,
+ )
+
+ chat_template = self.llama.metadata.get("tokenizer.chat_template")
+ if chat_template:
+ self._use_generate = False
+ print(f"ChatLlamaCppModel: using GGUF chat template for '{model_path}'")
+ else:
+ self._use_generate = True
+ warnings.warn(
+ f"Model '{model_path}' does not embed a chat template. "
+ f"Falling back to text-completion mode (no chat formatting). "
+ f"Override with --chat_template if this model needs one.",
+ )
+
+ def batch(self, inputs: list, **kwargs) -> list[str]:
+ """Process a batch of inputs, resetting KV cache between conversations."""
+ results = []
+ for inp in inputs:
+ self.llama.reset()
+ if self._use_generate:
+ text = self._to_raw_text(inp)
+ create_kwargs = {"prompt": text, "max_tokens": self.max_tokens}
+ if self.temperature is not None:
+ create_kwargs["temperature"] = self.temperature
+ response = self.llama.create_completion(**create_kwargs)
+ results.append(response["choices"][0]["text"])
+ else:
+ messages = self._to_messages(inp)
+ create_kwargs = {"messages": messages, "max_tokens": self.max_tokens}
+ if self.temperature is not None:
+ create_kwargs["temperature"] = self.temperature
+ response = self.llama.create_chat_completion(**create_kwargs)
+ results.append(response["choices"][0]["message"]["content"])
+ return results
+
+ def set_temperature(self, temperature: float | None) -> None:
+ self.temperature = None if temperature is None else float(temperature)
+
+
+class ChatVLLM(BaseLocalModel):
"""VLLM wrapper that auto-detects whether to use chat() or generate().
Chat template handling:
@@ -169,9 +319,21 @@ class ChatVLLM:
falls back to ``llm.generate()`` and emits a warning. This avoids the
``ValueError`` raised by ``transformers >= v4.44`` which removed the
default chat template.
+
+ Sampling defaults:
+ - Uses ``temperature=0.6`` and ``top_p=0.95`` unless explicitly
+ overridden.
"""
- def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None = None, **vllm_kwargs):
+ def __init__(
+ self,
+ model: str,
+ max_tokens: int = 8192,
+ temperature: float = 0.6,
+ top_p: float = 0.95,
+ chat_template: str | None = None,
+ **vllm_kwargs,
+ ):
from vllm import LLM, SamplingParams
self.model_path = model
@@ -183,6 +345,7 @@ def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None
if max_model_len is not None:
try:
from transformers import AutoConfig
+
config = AutoConfig.from_pretrained(model, trust_remote_code=True)
model_max_pos = getattr(config, "max_position_embeddings", None)
if model_max_pos is not None and max_model_len > model_max_pos:
@@ -200,10 +363,13 @@ def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None
)
self.llm = LLM(model=model, trust_remote_code=True, **vllm_kwargs)
- self.sampling_params = SamplingParams(
+ self._SamplingParams = SamplingParams
+ self._temperature = temperature
+ self._top_p = top_p
+ self.sampling_params = self._SamplingParams(
max_tokens=max_tokens,
- temperature=0.6,
- top_p=0.95,
+ temperature=self._temperature,
+ top_p=self._top_p,
)
# Resolve chat template:
@@ -229,52 +395,13 @@ def __init__(self, model: str, max_tokens: int = 8192, chat_template: str | None
self._use_generate = False
print(f"ChatVLLM: using tokenizer's chat template for '{model}'")
- def _to_messages(self, input_item) -> list[dict]:
- """Convert LangChain prompt input to OpenAI-style messages."""
- # Map LangChain message types to OpenAI roles
- role_map = {"human": "user", "ai": "assistant", "system": "system"}
-
- # Handle ChatPromptValue from LangChain
- if hasattr(input_item, "to_messages"):
- lc_messages = input_item.to_messages()
- return [
- {"role": role_map.get(msg.type, msg.type), "content": msg.content}
- for msg in lc_messages
- ]
- # Handle list of tuples like [("system", "..."), ("user", "...")]
- elif (
- isinstance(input_item, list)
- and input_item
- and isinstance(input_item[0], tuple)
- ):
- return [
- {"role": role if role != "human" else "user", "content": content}
- for role, content in input_item
- ]
- # Handle already formatted messages
- elif (
- isinstance(input_item, list)
- and input_item
- and isinstance(input_item[0], dict)
- ):
- return input_item
- # Handle plain string (wrap as user message)
- elif isinstance(input_item, str):
- return [{"role": "user", "content": input_item}]
- else:
- raise ValueError(f"Unsupported input type: {type(input_item)}")
-
- def _to_raw_text(self, input_item) -> str:
- """Extract raw text from an input item for use with llm.generate()."""
- if isinstance(input_item, str):
- return input_item
- # ChatPromptValue from LangChain
- if hasattr(input_item, "to_string"):
- return input_item.to_string()
- # List of dicts (messages) - concatenate contents
- if isinstance(input_item, list) and input_item and isinstance(input_item[0], dict):
- return "\n".join(msg["content"] for msg in input_item)
- raise ValueError(f"Cannot extract raw text from: {type(input_item)}")
+ def set_temperature(self, temperature: float) -> None:
+ self._temperature = float(temperature)
+ self.sampling_params = self._SamplingParams(
+ max_tokens=self.max_tokens,
+ temperature=self._temperature,
+ top_p=self._top_p,
+ )
def batch(self, inputs: list, **invoke_kwargs) -> list[str]:
"""Process a batch of inputs using vllm.LLM.chat() or llm.generate().
@@ -295,28 +422,21 @@ def batch(self, inputs: list, **invoke_kwargs) -> list[str]:
)
return [out.outputs[0].text for out in outputs]
- def invoke(self, input_item, **invoke_kwargs) -> str:
- """Process a single input."""
- results = self.batch([input_item], **invoke_kwargs)
- return results[0]
-
- async def ainvoke(self, input_item, **invoke_kwargs):
- """Async version - runs sync version in executor for compatibility."""
- import asyncio
- loop = asyncio.get_event_loop()
- return await loop.run_in_executor(
- None, lambda: self.invoke(input_item, **invoke_kwargs)
- )
-
-
-def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):
+def make_model(
+ model: str,
+ max_tokens: int | None = 8192,
+ temperature: float | None = None,
+ **engine_kwargs,
+):
"""Instantiate a model wrapper from a provider/model-name string.
Args:
model: Format ``{Provider}/{model_path}``, e.g.
``VLLM/meta-llama/Llama-3.3-70B-Instruct``.
max_tokens: Maximum tokens the model may generate.
+ temperature: Optional generation temperature override. ``None`` keeps
+ each provider wrapper's default temperature behavior.
**engine_kwargs: Engine-specific options forwarded to the model wrapper.
"""
# Avoid mutating the original engine_kwargs dictionary
@@ -326,6 +446,8 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):
# Dedicated arguments like max_tokens always win over engine_kwargs.
engine_kwargs["max_tokens"] = max_tokens or 8192
+ if temperature is not None:
+ engine_kwargs["temperature"] = temperature
model_provider = model.split("/")[0]
@@ -344,7 +466,6 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):
model=model_name,
**engine_kwargs,
)
-
if model_provider == "OpenRouter":
# Special case we need to override API url and key
return ChatOpenAI(
@@ -353,15 +474,15 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):
model=model_name,
**engine_kwargs,
)
+ elif model_provider == "LlamaCpp":
+ engine_kwargs["model_path"] = model_name
+ engine_kwargs.setdefault("n_ctx", 0)
+ return ChatLlamaCppModel(**engine_kwargs)
else:
model_classes = [
- LlamaCpp,
ChatOpenAI,
]
- if model_provider == "LlamaCpp":
- engine_kwargs["model_path"] = model_name
- else:
- engine_kwargs["model"] = model_name
+ engine_kwargs["model"] = model_name
try:
from langchain_together.llms import Together
@@ -383,6 +504,9 @@ def make_model(model: str, max_tokens: int | None = 8192, **engine_kwargs):
def download_all():
+ from openjury.instruction_dataset.mt_bench import download_mt_bench
+ from openjury.instruction_dataset.mt_bench_101 import download_mt_bench_101
+
print(f"Downloading all dataset in {data_root}")
for dataset in ["alpaca-eval", "arena-hard", "m-arena-hard"]:
local_path_tables = data_root / "tables"
@@ -396,6 +520,9 @@ def download_all():
force_download=False,
)
+ download_mt_bench()
+ download_mt_bench_101()
+
class Timeblock:
"""Timer context manager"""
diff --git a/tests/test_generate_and_evaluate.py b/tests/test_generate_and_evaluate.py
index 7fa07f2..a89671e 100644
--- a/tests/test_generate_and_evaluate.py
+++ b/tests/test_generate_and_evaluate.py
@@ -2,6 +2,8 @@
import pytest
import openjury.generate_and_evaluate as generate_and_evaluate
+import openjury.mt_bench.pipeline as mt_bench_pipeline
+import openjury.mt_bench_101.pipeline as mt_bench_101_pipeline
from openjury.generate_and_evaluate import (
main as main_generate_and_eval,
CliArgs,
@@ -10,26 +12,89 @@
@pytest.fixture(autouse=True)
def mock_external_data_and_cache(monkeypatch):
- instructions = pd.DataFrame(
+ single_turn_instructions = pd.DataFrame(
{
"instruction": [f"Synthetic instruction {i}" for i in range(20)],
},
index=pd.Index(range(20), name="instruction_index"),
)
+ # Mix of general and NEED_REF_CATS categories to exercise both code paths.
+ categories = ["writing", "math", "reasoning", "coding", "roleplay",
+ "writing", "math", "reasoning", "coding", "roleplay",
+ "writing", "math", "reasoning", "coding", "roleplay",
+ "writing", "math", "reasoning", "coding", "roleplay"]
+ ref_turn_1 = [
+ f"Reference answer turn 1 for q{i}" if cat in ("math", "reasoning", "coding") else None
+ for i, cat in enumerate(categories)
+ ]
+ ref_turn_2 = [
+ f"Reference answer turn 2 for q{i}" if cat in ("math", "reasoning", "coding") else None
+ for i, cat in enumerate(categories)
+ ]
+ mt_bench_questions = pd.DataFrame(
+ {
+ "category": categories,
+ "turn_1": [f"Synthetic MT-Bench turn 1 question {i}" for i in range(20)],
+ "turn_2": [f"Synthetic MT-Bench turn 2 follow-up {i}" for i in range(20)],
+ "reference_turn_1": ref_turn_1,
+ "reference_turn_2": ref_turn_2,
+ },
+ index=pd.Index(range(20), name="instruction_index"),
+ )
+ mt_bench_questions["instruction"] = mt_bench_questions["turn_1"]
+
+ mt_bench_101_eval_items = pd.DataFrame(
+ {
+ "dialogue_id": [0, 0, 1],
+ "dialogue_uid": ["CM:0", "CM:0", "MR:1"],
+ "task": ["CM", "CM", "MR"],
+ "ability": ["perceptivity", "perceptivity", "adaptability"],
+ "turn_index": [2, 3, 1],
+ "golden_context": [
+ [{"user": "CM user 1", "bot": "CM bot 1"}],
+ [
+ {"user": "CM user 1", "bot": "CM bot 1"},
+ {"user": "CM user 2", "bot": "CM bot 2"},
+ ],
+ [],
+ ],
+ "user_message": ["CM user 2", "CM user 3", "MR user 1"],
+ "reference_answer": ["CM ref 2", "CM ref 3", "MR ref 1"],
+ "requires_reference": [False, False, True],
+ "instruction": ["CM user 2", "CM user 3", "MR user 1"],
+ },
+ index=pd.Index(range(3), name="instruction_index"),
+ )
+
+ def _load_instructions(dataset: str, n_instructions: int | None = None) -> pd.DataFrame:
+ if dataset == "mt-bench":
+ df = mt_bench_questions
+ elif dataset == "mt-bench-101":
+ df = mt_bench_101_eval_items
+ else:
+ df = single_turn_instructions
+ return df.head(n_instructions) if n_instructions is not None else df
+
monkeypatch.setattr(
generate_and_evaluate,
"load_instructions",
- lambda dataset, n_instructions=None: (
- instructions.head(n_instructions)
- if n_instructions is not None
- else instructions
- ),
+ _load_instructions,
+ )
+ monkeypatch.setattr(
+ mt_bench_pipeline,
+ "load_instructions",
+ _load_instructions,
+ )
+ monkeypatch.setattr(
+ mt_bench_101_pipeline,
+ "load_instructions",
+ _load_instructions,
)
monkeypatch.setattr(
generate_and_evaluate,
"load_contexts",
- lambda dataset: instructions.loc[:, "instruction"],
+ lambda dataset: single_turn_instructions.loc[:, "instruction"],
)
monkeypatch.setattr(
@@ -44,6 +109,12 @@ def _run_without_cache(fun, **_kwargs):
monkeypatch.setattr(
generate_and_evaluate, "cache_function_dataframe", _run_without_cache
)
+ monkeypatch.setattr(
+ mt_bench_pipeline, "cache_function_dataframe", _run_without_cache
+ )
+ monkeypatch.setattr(
+ mt_bench_101_pipeline, "cache_function_dataframe", _run_without_cache
+ )
@pytest.mark.parametrize(
@@ -86,4 +157,233 @@ def test_generate_and_evaluate_correct_order_bias(tmp_path):
)
avg_pref = sum(prefs) / len(prefs)
- assert avg_pref == 0.5
+ assert avg_pref == pytest.approx(0.5)
+
+
+def test_main_non_mt_bench_reuses_judge_turn(monkeypatch, tmp_path):
+ captured = {"calls": 0, "kwargs": None}
+
+ def _judge_turn_stub(**kwargs):
+ captured["calls"] += 1
+ captured["kwargs"] = kwargs
+ return (
+ [{"judge_completion": "score A: 0 score B: 10"}],
+ [],
+ [{"instruction_index": 0}],
+ [],
+ pd.Series([1.0]),
+ [{"instruction_index": 0}],
+ )
+
+ monkeypatch.setattr(
+ generate_and_evaluate,
+ "_judge_turn",
+ _judge_turn_stub,
+ )
+
+ prefs = main_generate_and_eval(
+ CliArgs(
+ dataset="alpaca-eval",
+ model_A="Dummy/no answer",
+ model_B="Dummy/open is better than close isnt'it",
+ judge_model="Dummy/score A: 0 score B: 10",
+ n_instructions=1,
+ result_folder=str(tmp_path),
+ )
+ )
+
+ assert captured["calls"] == 1
+ assert captured["kwargs"]["swap_mode"] == "fixed"
+ assert captured["kwargs"]["metadata"] == [{"instruction_index": 0}]
+ assert prefs.tolist() == [1.0]
+
+
+def test_format_mt_bench_turn_2_uses_conversation_blocks():
+ questions = pd.DataFrame(
+ {
+ "category": ["math", "writing"],
+ "turn_1": ["Math question turn 1", "Writing question turn 1"],
+ "turn_2": ["Math question turn 2", "Writing question turn 2"],
+ "reference_turn_1": ["Math reference turn 1", None],
+ "reference_turn_2": ["Math reference turn 2", None],
+ },
+ index=pd.Index([0, 1], name="instruction_index"),
+ )
+ completions_a = pd.DataFrame(
+ {
+ "completion_turn_1": ["A1 math", "A1 writing"],
+ "completion_turn_2": ["A2 math", "A2 writing"],
+ },
+ index=pd.Index([0, 1], name="instruction_index"),
+ )
+ completions_b = pd.DataFrame(
+ {
+ "completion_turn_1": ["B1 math", "B1 writing"],
+ "completion_turn_2": ["B2 math", "B2 writing"],
+ },
+ index=pd.Index([0, 1], name="instruction_index"),
+ )
+
+ turn_1_inputs, turn_2_inputs = generate_and_evaluate.format_mt_bench_for_evaluation(
+ questions=questions,
+ completions_A=completions_a,
+ completions_B=completions_b,
+ turns_mode="both",
+ truncate_input_chars=8192,
+ )
+ (
+ instructions_turn_1,
+ _completions_a_turn_1,
+ _completions_b_turn_1,
+ _metadata_turn_1,
+ ) = turn_1_inputs
+ (
+ instructions_turn_2,
+ completions_a_turn_2,
+ completions_b_turn_2,
+ _metadata_turn_2,
+ ) = turn_2_inputs
+
+ assert "Please focus on which assistant provides a better answer to the second user question." in instructions_turn_2[0]
+ assert "<|The Start of Reference Answer|>" in instructions_turn_2[0]
+ assert "Math reference turn 1" in instructions_turn_2[0]
+ assert "Math reference turn 2" in instructions_turn_2[0]
+ assert "<|The Start of Reference Answer|>" not in instructions_turn_2[1]
+
+ assert "### User:\nMath question turn 1" in completions_a_turn_2[0]
+ assert "### Assistant:\nA1 math" in completions_a_turn_2[0]
+ assert "### User:\nMath question turn 2" in completions_a_turn_2[0]
+ assert "### Assistant:\nA2 math" in completions_a_turn_2[0]
+
+ assert "### User:\nMath question turn 1" in completions_b_turn_2[0]
+ assert "### Assistant:\nB1 math" in completions_b_turn_2[0]
+ assert "### User:\nMath question turn 2" in completions_b_turn_2[0]
+ assert "### Assistant:\nB2 math" in completions_b_turn_2[0]
+
+ assert instructions_turn_1[1] == "Writing question turn 1"
+ assert "[MT-Bench | Turn 1]" in instructions_turn_1[0]
+
+
+def test_mt_bench_pairwise(tmp_path):
+ """Test MT-Bench pipeline through score-based parsing."""
+ prefs = main_generate_and_eval(
+ CliArgs(
+ dataset="mt-bench",
+ model_A="Dummy/answer for turn 1 and turn 2",
+ model_B="Dummy/another answer",
+ judge_model="Dummy/score A: 10 score B: 0",
+ n_instructions=5,
+ result_folder=str(tmp_path),
+ )
+ )
+
+ assert all(p < 0.5 for p in prefs)
+ assert len(prefs) == 10 # two turns per question
+
+
+def test_mt_bench_swap_mode(tmp_path):
+ """Test that MT-Bench swap mode doubles the annotations and corrects bias."""
+ prefs = main_generate_and_eval(
+ CliArgs(
+ dataset="mt-bench",
+ model_A="Dummy/answer A",
+ model_B="Dummy/answer B",
+ judge_model="Dummy/score A: 10 score B: 0",
+ n_instructions=3,
+ swap_mode="both",
+ result_folder=str(tmp_path),
+ )
+ )
+
+ assert len(prefs) == 12 # (3 questions * 2 turns) * 2 swap directions
+ assert float(sum(prefs) / len(prefs)) == pytest.approx(0.5)
+
+
+def test_mt_bench_single_turn_only(tmp_path):
+ """Test MT-Bench single-turn-only evaluation (--mt_bench_turns single)."""
+ prefs = main_generate_and_eval(
+ CliArgs(
+ dataset="mt-bench",
+ model_A="Dummy/answer A",
+ model_B="Dummy/answer B",
+ judge_model="Dummy/score A: 10 score B: 0",
+ n_instructions=5,
+ mt_bench_turns="single",
+ result_folder=str(tmp_path),
+ )
+ )
+
+ assert all(p < 0.5 for p in prefs)
+ assert len(prefs) == 5 # one annotation per question, turn 1 only
+
+
+def test_mt_bench_multi_turn_only(tmp_path):
+ """Test MT-Bench multi-turn-only evaluation (--mt_bench_turns multi)."""
+ prefs = main_generate_and_eval(
+ CliArgs(
+ dataset="mt-bench",
+ model_A="Dummy/answer A",
+ model_B="Dummy/answer B",
+ judge_model="Dummy/score A: 0 score B: 10",
+ n_instructions=5,
+ mt_bench_turns="multi",
+ result_folder=str(tmp_path),
+ )
+ )
+
+ assert all(p > 0.5 for p in prefs)
+ assert len(prefs) == 5 # one annotation per question, turn 2 only
+
+
+def test_mt_bench_fastchat_fixed_verdicts(tmp_path):
+ """FastChat-compatible MT-Bench judging uses [[A]]/[[B]]/[[C]] parsing."""
+ prefs = main_generate_and_eval(
+ CliArgs(
+ dataset="mt-bench",
+ model_A="Dummy/answer A",
+ model_B="Dummy/answer B",
+ judge_model="Dummy/[[A]]",
+ n_instructions=5,
+ mt_bench_compatibility="fastchat",
+ result_folder=str(tmp_path),
+ )
+ )
+
+ assert len(prefs) == 10 # two turns per question
+ assert all(p < 0.5 for p in prefs)
+
+
+def test_mt_bench_fastchat_conservative_swap_mode(tmp_path):
+ """FastChat-compatible swap_mode='both' is conservative (tie if inconsistent)."""
+ prefs = main_generate_and_eval(
+ CliArgs(
+ dataset="mt-bench",
+ model_A="Dummy/answer A",
+ model_B="Dummy/answer B",
+ judge_model="Dummy/[[A]]", # position-A biased judge
+ n_instructions=3,
+ swap_mode="both",
+ mt_bench_compatibility="fastchat",
+ result_folder=str(tmp_path),
+ )
+ )
+
+ # Conservative swap runs both orders, but returns one resolved verdict per match.
+ assert len(prefs) == 6 # 3 questions * 2 turns
+ assert all(p == pytest.approx(0.5) for p in prefs)
+
+
+def test_mt_bench_101_pipeline(tmp_path):
+ prefs = main_generate_and_eval(
+ CliArgs(
+ dataset="mt-bench-101",
+ model_A="Dummy/model-a-response",
+ model_B="Dummy/model-b-response",
+ judge_model="Dummy/Explanation.\nRating: [[10]]",
+ result_folder=str(tmp_path),
+ )
+ )
+
+ # Both models receive the same constant judge score, so all pairwise prefs are ties.
+ assert len(prefs) == 3
+ assert all(float(pref) == pytest.approx(0.5) for pref in prefs)
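
The two FastChat-compatibility tests above assume `[[A]]`/`[[B]]`/`[[C]]` verdict parsing and, with `swap_mode="both"`, a conservative rule that only keeps a verdict when both position orders agree. A hypothetical sketch of that rule, using the 0 = A wins / 0.5 = tie / 1 = B wins convention from `compute_pref_summary` (the actual resolution code may differ):

```python
def resolve_swapped_verdicts(verdict_ab: str, verdict_ba: str) -> float:
    """Combine verdicts from both presentation orders into one preference."""

    def to_pref(verdict: str, swapped: bool) -> float:
        # In the swapped order, "[[A]]" refers to model B and vice versa.
        if "[[A]]" in verdict:
            return 1.0 if swapped else 0.0
        if "[[B]]" in verdict:
            return 0.0 if swapped else 1.0
        return 0.5  # "[[C]]" or unparseable output counts as a tie

    pref_ab = to_pref(verdict_ab, swapped=False)
    pref_ba = to_pref(verdict_ba, swapped=True)
    # Conservative: keep the verdict only if both orders agree, otherwise tie.
    return pref_ab if pref_ab == pref_ba else 0.5


# A position-A-biased judge answers "[[A]]" in both orders -> inconsistent -> tie.
assert resolve_swapped_verdicts("[[A]]", "[[A]]") == 0.5
```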
diff --git a/tests/test_mt_bench_101.py b/tests/test_mt_bench_101.py
new file mode 100644
index 0000000..b51d8a7
--- /dev/null
+++ b/tests/test_mt_bench_101.py
@@ -0,0 +1,139 @@
+import json
+
+import pandas as pd
+import pytest
+
+import openjury.instruction_dataset.mt_bench_101 as mt_bench_101_dataset
+from openjury.mt_bench_101.evaluate import (
+ derive_mt_bench_101_pairwise_preferences,
+ judge_mt_bench_101_single,
+ parse_mt_bench_101_rating,
+ summarize_mt_bench_101_absolute_scores,
+ summarize_mt_bench_101_pairwise,
+)
+from openjury.utils import DummyModel
+
+
+def test_load_mt_bench_101_turn_expansion(tmp_path, monkeypatch):
+ dataset_path = tmp_path / "mtbench101.jsonl"
+ records = [
+ {
+ "task": "CM",
+ "id": 1,
+ "history": [
+ {"user": "u1", "bot": "b1"},
+ {"user": "u2", "bot": "b2"},
+ ],
+ },
+ {
+ "task": "PI",
+ "id": 2,
+ "history": [
+ {"user": "x1", "bot": "y1"},
+ {"user": "x2", "bot": "y2"},
+ ],
+ },
+ ]
+ dataset_path.write_text(
+ "\n".join(json.dumps(record) for record in records) + "\n",
+ encoding="utf-8",
+ )
+ monkeypatch.setattr(
+ mt_bench_101_dataset,
+ "download_mt_bench_101",
+ lambda local_dir=None: dataset_path,
+ )
+
+ eval_items = mt_bench_101_dataset.load_mt_bench_101()
+
+ # CM starts at turn 2 (1 row), PI starts at turn 1 (2 rows) => total 3.
+ assert len(eval_items) == 3
+ cm_rows = eval_items[eval_items["task"] == "CM"]
+ assert cm_rows.iloc[0]["turn_index"] == 2
+ assert len(cm_rows.iloc[0]["golden_context"]) == 1
+
+
+def test_parse_mt_bench_101_rating():
+ assert parse_mt_bench_101_rating("Reasoning...\nRating: [[7]]") == pytest.approx(7.0)
+ assert parse_mt_bench_101_rating("rating: [[10]]") == pytest.approx(10.0)
+ assert parse_mt_bench_101_rating("I would rate this [[6]] overall.") == pytest.approx(6.0)
+ assert (
+ parse_mt_bench_101_rating("See section [3] for details...\nRating: [[6]]")
+ == pytest.approx(6.0)
+ )
+ assert parse_mt_bench_101_rating("Rating: [[0]]") is None
+ assert parse_mt_bench_101_rating("Rating: [[11]]") is None
+ assert parse_mt_bench_101_rating("Rating: [6]") is None
+ assert parse_mt_bench_101_rating("No rating present.") is None
+
+
+def test_judge_mt_bench_101_includes_reference_block_for_mr():
+ eval_items = pd.DataFrame(
+ {
+ "instruction_index": [0],
+ "dialogue_id": [1],
+ "dialogue_uid": ["MR:1"],
+ "task": ["MR"],
+ "ability": ["adaptability"],
+ "turn_index": [2],
+ "golden_context": [[{"user": "q1", "bot": "a1"}]],
+ "user_message": ["q2"],
+ "reference_answer": ["ref answer"],
+ }
+ ).set_index("instruction_index")
+ completions = pd.DataFrame(
+ {"instruction_index": [0], "completion": ["model answer"]}
+ )
+
+ scored = judge_mt_bench_101_single(
+ judge_chat_model=DummyModel("Dummy/reasoning\nRating: [[8]]"),
+ eval_items=eval_items,
+ completions=completions,
+ use_tqdm=False,
+ )
+
+ user_prompt = scored.iloc[0]["user_prompt"]
+ assert scored.iloc[0]["score"] == pytest.approx(8.0)
+ assert "The dialogue need to be judged is:" in user_prompt
+ assert "The reference solution is:" in user_prompt
+ assert " Human: q1" in user_prompt
+ assert "Assistant: model answer" in user_prompt
+ assert user_prompt.find("***") < user_prompt.find("The reference solution is:")
+ assert "strictly following this format" in scored.iloc[0]["system_prompt"]
+
+
+def test_mt_bench_101_aggregation_and_pairwise():
+ scored_a = pd.DataFrame(
+ {
+ "instruction_index": [0, 1, 2],
+ "dialogue_uid": ["PI:1", "PI:1", "PI:2"],
+ "dialogue_id": [1, 1, 2],
+ "task": ["PI", "PI", "PI"],
+ "ability": ["interactivity", "interactivity", "interactivity"],
+ "turn_index": [1, 2, 1],
+ "score": [9.0, 2.0, 4.0],
+ }
+ )
+ scored_b = pd.DataFrame(
+ {
+ "instruction_index": [0, 1, 2],
+ "dialogue_uid": ["PI:1", "PI:1", "PI:2"],
+ "dialogue_id": [1, 1, 2],
+ "task": ["PI", "PI", "PI"],
+ "ability": ["interactivity", "interactivity", "interactivity"],
+ "turn_index": [1, 2, 1],
+ "score": [8.0, 1.0, 6.0],
+ }
+ )
+
+ absolute_a = summarize_mt_bench_101_absolute_scores(scored_turns=scored_a)
+ assert absolute_a["per_task"]["PI"] == pytest.approx(3.0)
+ assert absolute_a["overall"] == pytest.approx(3.0)
+
+ pairwise_turns = derive_mt_bench_101_pairwise_preferences(scored_a, scored_b)
+ summary = summarize_mt_bench_101_pairwise(pairwise_turns=pairwise_turns)
+
+ assert summary["turn_level"]["num_battles"] == 3
+ assert summary["dialogue_level"]["num_battles"] == 2
+ # dialogue-level uses min scores per dialogue: one A win and one A loss
+ assert summary["dialogue_level"]["winrate"] == pytest.approx(0.5)
diff --git a/tests/test_mt_bench_downloads.py b/tests/test_mt_bench_downloads.py
new file mode 100644
index 0000000..a41996d
--- /dev/null
+++ b/tests/test_mt_bench_downloads.py
@@ -0,0 +1,73 @@
+from pathlib import Path
+
+import openjury.instruction_dataset.mt_bench as mt_bench
+import openjury.instruction_dataset.mt_bench_101 as mt_bench_101
+import openjury.utils as utils
+
+
+def test_download_mt_bench_skips_question_download_if_cached(tmp_path, monkeypatch):
+ question_path = tmp_path / "data" / "mt_bench" / "question.jsonl"
+ question_path.parent.mkdir(parents=True, exist_ok=True)
+ question_path.write_text('{"question_id": 1, "turns": ["Q1"]}\n')
+
+ reference_path = tmp_path / "reference_answer" / "gpt-4.jsonl"
+ reference_path.parent.mkdir(parents=True, exist_ok=True)
+ reference_path.write_text('{"question_id": 1, "choices": [{"turns": ["A1"]}]}\n')
+
+ calls = {"snapshot_download": 0}
+
+ def _snapshot_download_stub(**_kwargs):
+ calls["snapshot_download"] += 1
+
+ monkeypatch.setattr(mt_bench, "snapshot_download", _snapshot_download_stub)
+ monkeypatch.setattr(
+ mt_bench,
+ "_download_gpt4_references",
+ lambda _local_dir: reference_path,
+ )
+
+ downloaded_question_path, downloaded_reference_path = mt_bench.download_mt_bench(
+ local_dir=tmp_path
+ )
+
+ assert downloaded_question_path == question_path
+ assert downloaded_reference_path == reference_path
+ assert calls["snapshot_download"] == 0
+
+
+def test_download_all_includes_mt_bench(tmp_path, monkeypatch):
+ hf_datasets = []
+ calls = {"contexts": 0, "mt_bench": 0, "mt_bench_101": 0}
+
+ monkeypatch.setattr(utils, "data_root", tmp_path)
+ monkeypatch.setattr(
+ utils,
+ "download_hf",
+ lambda name, local_path: hf_datasets.append((name, local_path)),
+ )
+
+ def _contexts_snapshot_stub(**_kwargs):
+ calls["contexts"] += 1
+
+ monkeypatch.setattr(utils, "snapshot_download", _contexts_snapshot_stub)
+ monkeypatch.setattr(
+ mt_bench,
+ "download_mt_bench",
+ lambda: calls.__setitem__("mt_bench", calls["mt_bench"] + 1),
+ )
+ monkeypatch.setattr(
+ mt_bench_101,
+ "download_mt_bench_101",
+ lambda: calls.__setitem__("mt_bench_101", calls["mt_bench_101"] + 1),
+ )
+
+ utils.download_all()
+
+ assert [name for name, _ in hf_datasets] == [
+ "alpaca-eval",
+ "arena-hard",
+ "m-arena-hard",
+ ]
+ assert calls["contexts"] == 1
+ assert calls["mt_bench"] == 1
+ assert calls["mt_bench_101"] == 1