Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion eval_protocol/pytest/tracing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,14 @@ def update_row_with_remote_trace(
row.messages = remote_row.messages
row.tools = remote_row.tools
row.input_metadata.session_data = remote_row.input_metadata.session_data
row.input_metadata.dataset_info = remote_row.input_metadata.dataset_info
remote_info = remote_row.input_metadata.dataset_info or {}
if row.input_metadata.dataset_info is None:
row.input_metadata.dataset_info = dict(remote_info)
else:
for k, v in remote_info.items():
if k not in row.input_metadata.dataset_info:
row.input_metadata.dataset_info[k] = v

row.execution_metadata = remote_row.execution_metadata
return None
else:
Expand Down
20 changes: 20 additions & 0 deletions tests/remote_server/test_remote_fireworks.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,16 @@ def fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader:


def rows() -> List[EvaluationRow]:
    """Build the local input rows for the remote rollout test.

    One row is created with a single user message and a pre-populated
    ``input_metadata.dataset_info`` so the test can check that this locally
    supplied metadata survives remote traces.

    Returns:
        A list of three entries that all refer to the same row object.
    """
    prompt_row = EvaluationRow(
        messages=[Message(role="user", content="What is the capital of France?")]
    )

    seed_info = {
        "requirements": ["Answer with the capital city of France."],
        "total_requirements": 1,
        "original_prompt": "What is the capital of France?",
    }
    # Attach a shallow copy so the row does not hold the local literal itself.
    prompt_row.input_metadata.dataset_info = seed_info.copy()

    # Same object repeated three times (not three independent rows).
    return [prompt_row] * 3


Expand Down Expand Up @@ -127,6 +136,17 @@ async def test_remote_rollout_and_fetch_fireworks(row: EvaluationRow) -> Evaluat
assert row.execution_metadata.rollout_id in ROLLOUT_IDS, (
f"Row rollout_id {row.execution_metadata.rollout_id} should be in tracked rollout_ids: {ROLLOUT_IDS}"
)
assert row.input_metadata.completion_params["model"] == "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"
assert row.input_metadata.completion_params["temperature"] == 0.5, "Row should have temperature at top level"

assert row.input_metadata.row_id is not None

assert row.input_metadata.dataset_info is not None
assert row.input_metadata.dataset_info["requirements"] == ["Answer with the capital city of France."]
assert row.input_metadata.dataset_info["total_requirements"] == 1
assert row.input_metadata.dataset_info["original_prompt"] == "What is the capital of France?"

assert "data_loader_type" in row.input_metadata.dataset_info
assert "data_loader_num_rows" in row.input_metadata.dataset_info

return row