From ecf2c1d948e8f0e48eb2050744a98c340ddbaa7b Mon Sep 17 00:00:00 2001
From: Yinghan Ma
Date: Wed, 27 Aug 2025 13:45:16 -0700
Subject: [PATCH 1/3] allow extra field

---
 eval_protocol/models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index f930c717..a5d52acd 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -496,6 +496,8 @@ class EvaluationRow(BaseModel):
     supporting both row-wise batch evaluation and trajectory-based RL evaluation.
     """
 
+    model_config = ConfigDict(extra="allow")
+
     # Core OpenAI ChatCompletion compatible conversation data
     messages: List[Message] = Field(description="List of messages in the conversation. Also known as a trajectory.")
 

From aac07bc379571005680c665c3866a049f4ff843b Mon Sep 17 00:00:00 2001
From: Yinghan Ma
Date: Wed, 27 Aug 2025 14:05:17 -0700
Subject: [PATCH 2/3] add ut

---
 tests/test_models.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/test_models.py b/tests/test_models.py
index 3e1f7706..63c6ad0b 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from typing import Dict
 
 import pytest
@@ -660,3 +661,35 @@ def test_stable_hash_across_subprocess():
 
     assert isinstance(child_hash, int)
     assert parent_hash == child_hash
+
+def test_evaluation_row_extra_fields():
+    example = {
+        "messages": [
+            {"role": "user", "content": "What is the capital of France?"},
+            {"role": "assistant", "content": "The capital of France is Paris."},
+        ],
+        "ground_truth": "Paris",
+        "evaluation_result": {"score": 1.0, "reason": "Correct"},
+        "input_metadata": {"model": "gpt-4"},
+        "eval": {"score": 0.5},
+        "eval_details": {
+            "score": 0.5,
+            "reason": "Correct",
+            "is_score_valid": True,
+            "metrics": {
+                "accuracy": {
+                    "score": 1.0,
+                    "reason": "Correct",
+                    "is_score_valid": True,
+                },
+            },
+        },
+        "extra_fields": {
+            "test": "test",
+        },
+    }
+    row = EvaluationRow(**example)
+    dictionary = json.loads(row.model_dump_json())
+    assert "eval" in dictionary
+    assert "accuracy" in dictionary["eval_details"]["metrics"]
+    assert "test" in dictionary["extra_fields"]
\ No newline at end of file

From 9ba3978c02b2a9eb74dde7ef793028f04c7b82a5 Mon Sep 17 00:00:00 2001
From: Yinghan Ma
Date: Wed, 27 Aug 2025 14:15:05 -0700
Subject: [PATCH 3/3] format

---
 tests/test_models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 63c6ad0b..0b373519 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -662,6 +662,7 @@ def test_stable_hash_across_subprocess():
     assert isinstance(child_hash, int)
     assert parent_hash == child_hash
 
+
 def test_evaluation_row_extra_fields():
     example = {
         "messages": [
@@ -692,4 +693,4 @@ def test_evaluation_row_extra_fields():
     dictionary = json.loads(row.model_dump_json())
     assert "eval" in dictionary
     assert "accuracy" in dictionary["eval_details"]["metrics"]
-    assert "test" in dictionary["extra_fields"]
\ No newline at end of file
+    assert "test" in dictionary["extra_fields"]