Skip to content

Commit 2e225b7

Browse files
authored
Ground truth is now a JSON-serializable type (#159)
1 parent 3dbbfed commit 2e225b7

File tree

6 files changed

+10
-7
lines changed

6 files changed

+10
-7
lines changed

eval_protocol/benchmarks/test_aime25.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
9999

100100
extracted_text = _extract_boxed_text(content_str)
101101
extracted_int = _normalize_to_int_or_none(extracted_text)
102-
gt_int = _normalize_to_int_or_none(row.ground_truth or "")
102+
gt_int = _normalize_to_int_or_none(str(row.ground_truth))
103103

104104
is_valid = extracted_int is not None and gt_int is not None
105105
score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0

eval_protocol/benchmarks/test_livebench_data_analysis.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ def _extract_gt(row: EvaluationRow) -> Dict[str, Any]:
407407
if row.ground_truth is None:
408408
return {"ground_truth": None, "release": None}
409409
try:
410-
payload = json.loads(row.ground_truth)
410+
payload = json.loads(str(row.ground_truth))
411411
if isinstance(payload, dict):
412412
return payload
413413
except Exception:

eval_protocol/models.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
from enum import Enum
44
from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union
55

6+
JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]
7+
68
from openai.types import CompletionUsage
79
from openai.types.chat.chat_completion_message import (
810
FunctionCall,
@@ -598,8 +600,8 @@ class EvaluationRow(BaseModel):
598600
)
599601

600602
# Ground truth reference (moved from EvaluateResult to top level)
601-
ground_truth: Optional[str] = Field(
602-
default=None, description="Optional ground truth reference for this evaluation."
603+
ground_truth: Optional[JSONType] = Field(
604+
default=None, description="JSON-serializable ground truth reference for this evaluation."
603605
)
604606

605607
# Unified evaluation result

tests/pytest/test_apps_coding.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
4747
# Use evaluate_apps_solution directly
4848
result = evaluate_apps_solution(
4949
messages=row.messages,
50-
ground_truth=row.ground_truth,
50+
ground_truth=str(row.ground_truth),
5151
)
5252

5353
# Set the evaluation result on the row

tests/pytest/test_markdown_highlighting.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,12 +42,13 @@ def test_markdown_highlighting_evaluation(row: EvaluationRow) -> EvaluationRow:
4242
"""
4343

4444
assistant_response = row.messages[-1].content
45+
assistant_response = str(assistant_response or "")
4546

4647
if not assistant_response:
4748
row.evaluation_result = EvaluateResult(score=0.0, reason="❌ No assistant response found")
4849
return row
4950

50-
required_highlights = int(row.ground_truth)
51+
required_highlights = int(str(row.ground_truth))
5152

5253
# Check if the response contains the required number of formatted sections
5354
# e.g. **bold** or *italic*

tests/pytest/test_pytest_function_calling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu
2727
)
2828
async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow:
2929
"""Run pointwise evaluation on sample dataset using pytest interface."""
30-
ground_truth = json.loads(row.ground_truth)
30+
ground_truth = json.loads(str(row.ground_truth))
3131
result = exact_tool_match_reward(row.messages, ground_truth)
3232
row.evaluation_result = result
3333
print(result)

0 commit comments

Comments
 (0)