3131Example output: "My final verdict is tie: [[A=B]]"."""
3232
3333
34- # Judge configurations, feel free to add your own!
34+ # Judge model configurations for Arena-Hard-Auto style evaluation
35+ # Each config specifies the model, parameters, and concurrency limits for LLM judges
3536JUDGE_CONFIGS = {
3637 "gpt-4.1" : {
3738 "model" : "gpt-4.1" ,
4950 },
5051}
5152
52- # Label to score mapping for Arena-Hard-Auto judgments
53+ # Mapping from Arena-Hard-Auto judgment labels to numerical scores
54+ # Stronger preferences (>> or <<) get weighted more heavily (3x) than slight preferences
5355LABEL_TO_SCORE = {
5456 "A>B" : [1 ],
5557 "A>>B" : [1 ] * 3 ,
@@ -200,7 +202,20 @@ def fetch_langfuse_traces_as_evaluation_rows(
200202 hours_back : Optional [int ] = None ,
201203 include_tool_calls : bool = True ,
202204) -> List [EvaluationRow ]:
203- """Fetch Langfuse traces as evaluation rows."""
205+ """
206+ Fetch Langfuse traces and convert them to EvaluationRow objects.
207+
208+ Args:
209+ limit: Maximum number of traces to fetch
210+ tags: Filter traces by tags
211+ user_id: Filter traces by user ID
212+ session_id: Filter traces by session ID
213+ hours_back: Only fetch traces from the last N hours
214+ include_tool_calls: Whether to include tool calls in messages
215+
216+ Returns:
217+ List of EvaluationRow objects converted from Langfuse traces
218+ """
204219 try :
205220 from eval_protocol .adapters .langfuse import create_langfuse_adapter
206221
@@ -220,10 +235,17 @@ def fetch_langfuse_traces_as_evaluation_rows(
220235
221236def calculate_bootstrap_scores (judgments : List [Dict [str , Any ]]) -> tuple [float , float , float ]:
222237 """
223- Calculate bootstrap scores from judgments.
238+ Calculate bootstrap confidence intervals for Arena-Hard-Auto style judgments.
239+
240+ Converts judgment labels (A>B, A>>B, etc.) to numerical scores, performs bootstrap
241+ sampling to estimate score distribution, and returns mean with 90% confidence interval.
242+
243+ Args:
244+ judgments: List of judgment dicts, each containing "games" with two rounds of scores
224245
225246 Returns:
226- tuple: (mean_score, lower_score, upper_score)
247+ tuple: (mean_score, lower_5th_percentile, upper_95th_percentile)
248+ Returns (0.0, 0.0, 0.0) if no valid scores found
227249 """
228250 # Extract scores from judgments
229251 scores_data = []
@@ -255,15 +277,22 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float,
255277
256278def run_judgment (row : EvaluationRow , model_name : str , judge_name : str ) -> Optional [Dict [str , Any ]]:
257279 """
258- Run pairwise judgment for a single evaluation row.
280+ Run Arena-Hard-Auto style pairwise judgment for a single evaluation row.
281+
282+ Performs two rounds of judgment (A vs B, B vs A) to reduce position bias:
283+ - Round 1: ground_truth (original) vs messages[-1] (new model response)
284+ - Round 2: messages[-1] (new model response) vs ground_truth (original)
285+
286+ Updates the row's evaluation_result with judgment details and returns results
287+ for aggregation across the dataset.
259288
260289 Args:
261- row: EvaluationRow to judge
262- model_name: Name of the model being evaluated
263- judge_name: Name of the judge config to use
290+ row: EvaluationRow containing messages, ground_truth, and tools
291+ model_name: Name of the model being evaluated (for result tracking)
292+ judge_name: Key from JUDGE_CONFIGS to use for judgment
264293
265294 Returns:
266- Dict containing model name and games results , or None if failed
295+ Dict with "model" and "games" keys, or None if row has no messages
267296 """
268297 if not row .messages :
269298 return None
@@ -313,7 +342,21 @@ def run_judgment(row: EvaluationRow, model_name: str, judge_name: str) -> Option
313342
314343
315344def push_scores_to_langfuse (rows : List [EvaluationRow ], model_name : str , mean_score : float ) -> None :
316- """Push scores back to Langfuse traces."""
345+ """
346+ Push evaluation scores back to Langfuse traces for tracking and analysis.
347+
348+ Creates a score entry in Langfuse for each unique trace_id found in the evaluation
349+ rows' session data. This allows you to see evaluation results directly in the
350+ Langfuse UI alongside the original traces.
351+
352+ Args:
353+ rows: List of EvaluationRow objects with session_data containing trace IDs
354+ model_name: Name of the model (used as the score name in Langfuse)
355+ mean_score: The calculated mean score to push to Langfuse
356+
357+ Note:
358+ Silently handles errors if Langfuse is unavailable or if rows lack session data
359+ """
317360 try :
318361 from eval_protocol .adapters .langfuse import create_langfuse_adapter
319362
0 commit comments