eval_protocol/benchmarks/test_aime25.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -99,7 +99,7 @@ def test_aime25_pointwise(row: EvaluationRow) -> EvaluationRow:
 
     extracted_text = _extract_boxed_text(content_str)
     extracted_int = _normalize_to_int_or_none(extracted_text)
-    gt_int = _normalize_to_int_or_none(row.ground_truth or "")
+    gt_int = _normalize_to_int_or_none(str(row.ground_truth))
 
     is_valid = extracted_int is not None and gt_int is not None
     score = 1.0 if (is_valid and extracted_int == gt_int) else 0.0
```
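Context for the `str()` coercion: with `ground_truth` widened to `JSONType` (see the `models.py` change below), an AIME answer may now arrive as a bare int, which the old `row.ground_truth or ""` expression would pass to the normalizer unconverted. A minimal sketch, using a hypothetical stand-in for `_normalize_to_int_or_none` (the real helper is not shown in this diff):

```python
from typing import Optional

def normalize_to_int_or_none(text: str) -> Optional[int]:
    # Hypothetical stand-in for the private helper used above.
    try:
        return int(text.strip())
    except ValueError:
        return None

assert normalize_to_int_or_none(str(42)) == 42       # int ground truth now parses
assert normalize_to_int_or_none(str(None)) is None   # str(None) == "None" -> no parse
```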
eval_protocol/benchmarks/test_livebench_data_analysis.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -407,7 +407,7 @@ def _extract_gt(row: EvaluationRow) -> Dict[str, Any]:
     if row.ground_truth is None:
         return {"ground_truth": None, "release": None}
     try:
-        payload = json.loads(row.ground_truth)
+        payload = json.loads(str(row.ground_truth))
         if isinstance(payload, dict):
             return payload
     except Exception:
```
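One caveat with this pattern: `json.loads(str(...))` round-trips only when `ground_truth` is stored as a JSON string. If it is already a dict (now permitted by `JSONType`), `str()` produces a Python repr with single quotes, which `json.loads` rejects; the surrounding `try`/`except` then absorbs the error and falls through. A small illustration:

```python
import json

print(json.loads(str('{"release": "v1"}')))  # OK: {'release': 'v1'}

try:
    json.loads(str({"release": "v1"}))       # repr uses single quotes
except json.JSONDecodeError:
    print("rejected; handled by the except branch in _extract_gt")
```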
eval_protocol/models.py (6 changes: 4 additions & 2 deletions)

```diff
@@ -3,6 +3,8 @@
 from enum import Enum
 from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union
 
+JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]
+
 from openai.types import CompletionUsage
 from openai.types.chat.chat_completion_message import (
     FunctionCall,
@@ -598,8 +600,8 @@ class EvaluationRow(BaseModel):
     )
 
     # Ground truth reference (moved from EvaluateResult to top level)
-    ground_truth: Optional[str] = Field(
-        default=None, description="Optional ground truth reference for this evaluation."
+    ground_truth: Optional[JSONType] = Field(
+        default=None, description="JSON-serializable ground truth reference for this evaluation."
     )
 
     # Unified evaluation result
```
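This is the core of the change: `ground_truth` is widened from `Optional[str]` to the new `JSONType` alias, and every string-based consumer in the surrounding diffs coerces at the call site. A minimal sketch of what the field now accepts (the alias is copied verbatim from the addition above):

```python
from typing import Any, Dict, List, Union

JSONType = Union[Dict[str, Any], List[Any], str, int, float, bool, None]

# Any JSON-serializable value is now a valid ground truth:
examples: List[JSONType] = [
    "42",                              # plain string, as before
    42,                                # bare number
    ["a", "b"],                        # list
    {"answer": 42, "release": "v1"},   # structured payload
]
```

Note that `Optional[JSONType]` is redundant, since `JSONType` already includes `None`; harmless, but it could be tightened to plain `JSONType`.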
tests/pytest/test_apps_coding.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -47,7 +47,7 @@ def test_apps_code_evaluation(row: EvaluationRow) -> EvaluationRow:
     # Use evaluate_apps_solution directly
     result = evaluate_apps_solution(
         messages=row.messages,
-        ground_truth=row.ground_truth,
+        ground_truth=str(row.ground_truth),
     )
 
     # Set the evaluation result on the row
```
tests/pytest/test_markdown_highlighting.py (3 changes: 2 additions & 1 deletion)

```diff
@@ -42,12 +42,13 @@ def test_markdown_highlighting_evaluation(row: EvaluationRow) -> EvaluationRow:
     """
 
     assistant_response = row.messages[-1].content
+    assistant_response = str(assistant_response or "")
 
     if not assistant_response:
         row.evaluation_result = EvaluateResult(score=0.0, reason="❌ No assistant response found")
         return row
 
-    required_highlights = int(row.ground_truth)
+    required_highlights = int(str(row.ground_truth))
 
     # Check if the response contains the required number of formatted sections
     # e.g. **bold** or *italic*
```
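The `int(str(...))` double conversion accepts either an int or a numeric-string ground truth; note, though, that a `None` ground truth now raises `ValueError` (because `int("None")` fails) where the old code raised `TypeError`. A quick check:

```python
assert int(str(3)) == 3      # int ground truth
assert int(str("3")) == 3    # numeric-string ground truth

try:
    int(str(None))           # "None" is not a valid integer literal
except ValueError:
    pass
```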
tests/pytest/test_pytest_function_calling.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -27,7 +27,7 @@ def function_calling_to_evaluation_row(rows: List[Dict[str, Any]]) -> List[Evalu
 )
 async def test_pytest_function_calling(row: EvaluationRow) -> EvaluationRow:
     """Run pointwise evaluation on sample dataset using pytest interface."""
-    ground_truth = json.loads(row.ground_truth)
+    ground_truth = json.loads(str(row.ground_truth))
     result = exact_tool_match_reward(row.messages, ground_truth)
     row.evaluation_result = result
     print(result)
```