Skip to content

Commit 25d6c12

Browse files
committed
update docstrings
1 parent 579db3c commit 25d6c12

File tree

2 files changed

+67
-16
lines changed

2 files changed

+67
-16
lines changed

eval_protocol/quickstart/llm_judge.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,12 +44,20 @@
4444
)
4545
async def test_llm_judge(rows: list[EvaluationRow]) -> list[EvaluationRow]:
4646
"""
47-
Simplified LLM Judge for Arena-Hard-Auto pairwise comparisons.
47+
LLM Judge evaluation using Arena-Hard-Auto style pairwise comparisons.
4848
49-
Each row contains:
50-
- messages[:-1]: Question/prompt (conversation context)
51-
- messages[-1]: Model B's answer (comparison model response)
52-
- ground_truth: Model A's answer (original assistant response)
49+
Compares model responses against ground truth using an LLM judge. For each row:
50+
1. Extracts the question from messages[:-1]
51+
2. Compares messages[-1] (new model response) vs ground_truth (baseline response)
52+
3. Runs two judgment rounds (A vs B, B vs A) to reduce position bias
53+
4. Calculates bootstrap scores across all comparisons
54+
5. Updates evaluation_result with final scores and confidence intervals
55+
56+
Args:
57+
rows: List of EvaluationRow objects with messages, ground_truth, and tools
58+
59+
Returns:
60+
Same rows with updated evaluation_result containing scores and judgments
5361
"""
5462

5563
judge_name = "gemini-2.5-pro" # Edit to which judge you'd like to use. Configs are in utils.py.

eval_protocol/quickstart/utils.py

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@
3131
Example output: "My final verdict is tie: [[A=B]]"."""
3232

3333

34-
# Judge configurations, feel free to add your own!
34+
# Judge model configurations for Arena-Hard-Auto style evaluation
35+
# Each config specifies the model, parameters, and concurrency limits for LLM judges
3536
JUDGE_CONFIGS = {
3637
"gpt-4.1": {
3738
"model": "gpt-4.1",
@@ -49,7 +50,8 @@
4950
},
5051
}
5152

52-
# Label to score mapping for Arena-Hard-Auto judgments
53+
# Mapping from Arena-Hard-Auto judgment labels to numerical scores
54+
# Stronger preferences (>> or <<) get weighted more heavily (3x) than slight preferences
5355
LABEL_TO_SCORE = {
5456
"A>B": [1],
5557
"A>>B": [1] * 3,
@@ -200,7 +202,20 @@ def fetch_langfuse_traces_as_evaluation_rows(
200202
hours_back: Optional[int] = None,
201203
include_tool_calls: bool = True,
202204
) -> List[EvaluationRow]:
203-
"""Fetch Langfuse traces as evaluation rows."""
205+
"""
206+
Fetch Langfuse traces and convert them to EvaluationRow objects.
207+
208+
Args:
209+
limit: Maximum number of traces to fetch
210+
tags: Filter traces by tags
211+
user_id: Filter traces by user ID
212+
session_id: Filter traces by session ID
213+
hours_back: Only fetch traces from the last N hours
214+
include_tool_calls: Whether to include tool calls in messages
215+
216+
Returns:
217+
List of EvaluationRow objects converted from Langfuse traces
218+
"""
204219
try:
205220
from eval_protocol.adapters.langfuse import create_langfuse_adapter
206221

@@ -220,10 +235,17 @@ def fetch_langfuse_traces_as_evaluation_rows(
220235

221236
def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float, float, float]:
222237
"""
223-
Calculate bootstrap scores from judgments.
238+
Calculate bootstrap confidence intervals for Arena-Hard-Auto style judgments.
239+
240+
Converts judgment labels (A>B, A>>B, etc.) to numerical scores, performs bootstrap
241+
sampling to estimate score distribution, and returns mean with 90% confidence interval.
242+
243+
Args:
244+
judgments: List of judgment dicts, each containing "games" with two rounds of scores
224245
225246
Returns:
226-
tuple: (mean_score, lower_score, upper_score)
247+
tuple: (mean_score, lower_5th_percentile, upper_95th_percentile)
248+
Returns (0.0, 0.0, 0.0) if no valid scores found
227249
"""
228250
# Extract scores from judgments
229251
scores_data = []
@@ -255,15 +277,22 @@ def calculate_bootstrap_scores(judgments: List[Dict[str, Any]]) -> tuple[float,
255277

256278
def run_judgment(row: EvaluationRow, model_name: str, judge_name: str) -> Optional[Dict[str, Any]]:
257279
"""
258-
Run pairwise judgment for a single evaluation row.
280+
Run Arena-Hard-Auto style pairwise judgment for a single evaluation row.
281+
282+
Performs two rounds of judgment (A vs B, B vs A) to reduce position bias:
283+
- Round 1: ground_truth (original) vs messages[-1] (new model response)
284+
- Round 2: messages[-1] (new model response) vs ground_truth (original)
285+
286+
Updates the row's evaluation_result with judgment details and returns results
287+
for aggregation across the dataset.
259288
260289
Args:
261-
row: EvaluationRow to judge
262-
model_name: Name of the model being evaluated
263-
judge_name: Name of the judge config to use
290+
row: EvaluationRow containing messages, ground_truth, and tools
291+
model_name: Name of the model being evaluated (for result tracking)
292+
judge_name: Key from JUDGE_CONFIGS to use for judgment
264293
265294
Returns:
266-
Dict containing model name and games results, or None if failed
295+
Dict with "model" and "games" keys, or None if row has no messages
267296
"""
268297
if not row.messages:
269298
return None
@@ -313,7 +342,21 @@ def run_judgment(row: EvaluationRow, model_name: str, judge_name: str) -> Option
313342

314343

315344
def push_scores_to_langfuse(rows: List[EvaluationRow], model_name: str, mean_score: float) -> None:
316-
"""Push scores back to Langfuse traces."""
345+
"""
346+
Push evaluation scores back to Langfuse traces for tracking and analysis.
347+
348+
Creates a score entry in Langfuse for each unique trace_id found in the evaluation
349+
rows' session data. This allows you to see evaluation results directly in the
350+
Langfuse UI alongside the original traces.
351+
352+
Args:
353+
rows: List of EvaluationRow objects with session_data containing trace IDs
354+
model_name: Name of the model (used as the score name in Langfuse)
355+
mean_score: The calculated mean score to push to Langfuse
356+
357+
Note:
358+
Silently handles errors if Langfuse is unavailable or if rows lack session data
359+
"""
317360
try:
318361
from eval_protocol.adapters.langfuse import create_langfuse_adapter
319362

0 commit comments

Comments (0)