"""Default LLM judge for Eval Protocol. Inspired by Arena-Hard-Auto.
"""
5- from collections .abc import Awaitable , Callable
6- import os
7- from datetime import datetime
8- from typing import List , Dict , Any , Optional
9- from typing_extensions import cast
105from tqdm import tqdm
6+ from typing import Optional
117
12- import pytest
13-
14- from eval_protocol .models import EvaluateResult , EvaluationRow , MetricResult
15- from eval_protocol .pytest import evaluation_test
16- from eval_protocol .pytest .default_single_turn_rollout_process import SingleTurnRolloutProcessor
8+ from eval_protocol .models import EvaluationRow
9+ from eval_protocol .adapters .base import BaseAdapter
1710from eval_protocol .quickstart .utils import (
18- split_multi_turn_rows ,
1911 JUDGE_CONFIGS ,
2012 calculate_bootstrap_scores ,
2113 run_judgment_async ,
2214)
2315import asyncio
2416from openai import AsyncOpenAI
25- from eval_protocol .adapters .langfuse import create_langfuse_adapter
26-
27- adapter = create_langfuse_adapter ()
28-
29-
30- @pytest .mark .asyncio
31- @evaluation_test (
32- input_rows = [
33- adapter .get_evaluation_rows (
34- to_timestamp = datetime (2025 , 9 , 12 , 0 , 11 , 18 ),
35- limit = 711 ,
36- sample_size = 50 ,
37- sleep_between_gets = 3.0 ,
38- max_retries = 5 ,
39- )
40- ],
41- completion_params = [
42- {"model" : "gpt-4.1" },
43- {
44- "max_tokens" : 131000 ,
45- "extra_body" : {"reasoning_effort" : "medium" },
46- "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-120b" ,
47- },
48- {
49- "max_tokens" : 131000 ,
50- "extra_body" : {"reasoning_effort" : "low" },
51- "model" : "fireworks_ai/accounts/fireworks/models/gpt-oss-20b" ,
52- },
53- ],
54- rollout_processor = SingleTurnRolloutProcessor (),
55- preprocess_fn = split_multi_turn_rows ,
56- max_concurrent_rollouts = 64 ,
57- mode = "all" ,
58- )
59- async def test_llm_judge (rows : list [EvaluationRow ]) -> list [EvaluationRow ]:
60- return await aha_judge (rows )
6117
6218
63- async def aha_judge (rows : list [EvaluationRow ], judge_name : str = "gemini-2.5-pro" ) -> list [EvaluationRow ]:
19+ async def aha_judge (
20+ rows : list [EvaluationRow ], judge_name : str = "gemini-2.5-pro" , adapter : Optional [BaseAdapter ] = None
21+ ) -> list [EvaluationRow ]:
6422 """
6523 LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
6624
@@ -73,6 +31,8 @@ async def aha_judge(rows: list[EvaluationRow], judge_name: str = "gemini-2.5-pro
7331
7432 Args:
7533 rows: List of EvaluationRow objects with messages, ground_truth, and tools
34+ judge_name: Name of the judge configuration to use
35+ adapter: Optional adapter to push scores back to (if provided)
7636
7737 Returns:
7838 Same rows with updated evaluation_result containing scores and judgments
@@ -133,7 +93,8 @@ async def run_judgment(row):
13393 if row .evaluation_result :
13494 row .evaluation_result .score = mean_score
13595
136- # Optional, push scores back to Langfuse. Note that one score per model will be pushed back onto same trace.
137- adapter .push_scores (rows , model_name , mean_score )
96+ # Push scores back to adapter if provided. Note that one score per model will be pushed back onto same trace.
97+ if adapter :
98+ adapter .push_scores (rows , model_name , mean_score )
13899
139100 return rows
0 commit comments