Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,6 +496,8 @@ class EvaluationRow(BaseModel):
supporting both row-wise batch evaluation and trajectory-based RL evaluation.
"""

model_config = ConfigDict(extra="allow")

# Core OpenAI ChatCompletion compatible conversation data
messages: List[Message] = Field(description="List of messages in the conversation. Also known as a trajectory.")

Expand Down
34 changes: 34 additions & 0 deletions tests/test_models.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import logging
from typing import Dict

import pytest
Expand Down Expand Up @@ -660,3 +661,36 @@ def test_stable_hash_across_subprocess():

assert isinstance(child_hash, int)
assert parent_hash == child_hash


def test_evaluation_row_extra_fields():
    """EvaluationRow (with ``extra="allow"``) must accept unknown top-level
    fields and preserve them through a JSON serialization round-trip."""
    # Shared shape for a single per-metric evaluation entry.
    accuracy_metric = {
        "score": 1.0,
        "reason": "Correct",
        "is_score_valid": True,
    }
    payload = {
        "messages": [
            {"role": "user", "content": "What is the capital of France?"},
            {"role": "assistant", "content": "The capital of France is Paris."},
        ],
        "ground_truth": "Paris",
        "evaluation_result": {"score": 1.0, "reason": "Correct"},
        "input_metadata": {"model": "gpt-4"},
        # The keys below are not declared on the model; extra="allow"
        # should keep them rather than raise a validation error.
        "eval": {"score": 0.5},
        "eval_details": {
            "score": 0.5,
            "reason": "Correct",
            "is_score_valid": True,
            "metrics": {"accuracy": accuracy_metric},
        },
        "extra_fields": {"test": "test"},
    }

    serialized = json.loads(EvaluationRow(**payload).model_dump_json())

    # The extra fields survive both validation and serialization.
    assert "eval" in serialized
    assert "accuracy" in serialized["eval_details"]["metrics"]
    assert "test" in serialized["extra_fields"]
Loading