diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index f930c717..a5d52acd 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -496,6 +496,8 @@ class EvaluationRow(BaseModel):
     supporting both row-wise batch evaluation and trajectory-based RL evaluation.
     """
 
+    model_config = ConfigDict(extra="allow")
+
     # Core OpenAI ChatCompletion compatible conversation data
     messages: List[Message] = Field(description="List of messages in the conversation. Also known as a trajectory.")
 
diff --git a/tests/test_models.py b/tests/test_models.py
index 3e1f7706..0b373519 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -660,3 +660,36 @@ def test_stable_hash_across_subprocess():
 
     assert isinstance(child_hash, int)
     assert parent_hash == child_hash
+
+
+def test_evaluation_row_extra_fields():
+    example = {
+        "messages": [
+            {"role": "user", "content": "What is the capital of France?"},
+            {"role": "assistant", "content": "The capital of France is Paris."},
+        ],
+        "ground_truth": "Paris",
+        "evaluation_result": {"score": 1.0, "reason": "Correct"},
+        "input_metadata": {"model": "gpt-4"},
+        "eval": {"score": 0.5},
+        "eval_details": {
+            "score": 0.5,
+            "reason": "Correct",
+            "is_score_valid": True,
+            "metrics": {
+                "accuracy": {
+                    "score": 1.0,
+                    "reason": "Correct",
+                    "is_score_valid": True,
+                },
+            },
+        },
+        "extra_fields": {
+            "test": "test",
+        },
+    }
+    row = EvaluationRow(**example)
+    dictionary = json.loads(row.model_dump_json())
+    assert "eval" in dictionary
+    assert "accuracy" in dictionary["eval_details"]["metrics"]
+    assert "test" in dictionary["extra_fields"]