From ecf2c1d948e8f0e48eb2050744a98c340ddbaa7b Mon Sep 17 00:00:00 2001
From: Yinghan Ma <yinghan.ma@fireworks.ai>
Date: Wed, 27 Aug 2025 13:45:16 -0700
Subject: [PATCH 1/3] allow extra fields on EvaluationRow

---
 eval_protocol/models.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index f930c717..a5d52acd 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -496,6 +496,8 @@ class EvaluationRow(BaseModel):
     supporting both row-wise batch evaluation and trajectory-based RL evaluation.
     """
 
+    model_config = ConfigDict(extra="allow")
+
     # Core OpenAI ChatCompletion compatible conversation data
     messages: List[Message] = Field(description="List of messages in the conversation. Also known as a trajectory.")
 

From aac07bc379571005680c665c3866a049f4ff843b Mon Sep 17 00:00:00 2001
From: Yinghan Ma <yinghan.ma@fireworks.ai>
Date: Wed, 27 Aug 2025 14:05:17 -0700
Subject: [PATCH 2/3] add unit test for EvaluationRow extra fields

---
 tests/test_models.py | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/tests/test_models.py b/tests/test_models.py
index 3e1f7706..63c6ad0b 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -1,4 +1,5 @@
 import json
+import logging
 from typing import Dict
 
 import pytest
@@ -660,3 +661,35 @@ def test_stable_hash_across_subprocess():
 
     assert isinstance(child_hash, int)
     assert parent_hash == child_hash
+
+def test_evaluation_row_extra_fields():
+    example = {
+        "messages": [
+            {"role": "user", "content": "What is the capital of France?"},
+            {"role": "assistant", "content": "The capital of France is Paris."},
+        ],
+        "ground_truth": "Paris",
+        "evaluation_result": {"score": 1.0, "reason": "Correct"},
+        "input_metadata": {"model": "gpt-4"},
+        "eval": {"score": 0.5},
+        "eval_details": {
+            "score": 0.5,
+            "reason": "Correct",
+            "is_score_valid": True,
+            "metrics": {
+                "accuracy": {
+                    "score": 1.0,
+                    "reason": "Correct",
+                    "is_score_valid": True,
+                },
+            },
+        },
+        "extra_fields": {
+            "test": "test",
+        },
+    }
+    row = EvaluationRow(**example)
+    dictionary = json.loads(row.model_dump_json())
+    assert "eval" in dictionary
+    assert "accuracy" in dictionary["eval_details"]["metrics"]
+    assert "test" in dictionary["extra_fields"]
\ No newline at end of file

From 9ba3978c02b2a9eb74dde7ef793028f04c7b82a5 Mon Sep 17 00:00:00 2001
From: Yinghan Ma <yinghan.ma@fireworks.ai>
Date: Wed, 27 Aug 2025 14:15:05 -0700
Subject: [PATCH 3/3] format tests/test_models.py (blank line, newline at EOF)

---
 tests/test_models.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_models.py b/tests/test_models.py
index 63c6ad0b..0b373519 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -662,6 +662,7 @@ def test_stable_hash_across_subprocess():
     assert isinstance(child_hash, int)
     assert parent_hash == child_hash
 
+
 def test_evaluation_row_extra_fields():
     example = {
         "messages": [
@@ -692,4 +693,4 @@ def test_evaluation_row_extra_fields():
     dictionary = json.loads(row.model_dump_json())
     assert "eval" in dictionary
     assert "accuracy" in dictionary["eval_details"]["metrics"]
-    assert "test" in dictionary["extra_fields"]
\ No newline at end of file
+    assert "test" in dictionary["extra_fields"]