From 73505bd772bb4351e010273e499a9329af625062 Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Tue, 25 Nov 2025 10:01:38 -0800
Subject: [PATCH 1/2] fix remote dataset info

---
Review notes (this region is ignored when the patch is applied):
  * Previously the local row's dataset_info was overwritten with the remote
    row's value. Now the two are merged: if the local value is None it becomes
    a shallow copy of the remote dict (or an empty dict when the remote value
    is None/empty); otherwise only keys missing locally are copied from the
    remote dict, so local values win on conflicts.
  * The merge mutates the existing local dict in place.

 eval_protocol/pytest/tracing_utils.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/eval_protocol/pytest/tracing_utils.py b/eval_protocol/pytest/tracing_utils.py
index 0eea0c2e..6b0add61 100644
--- a/eval_protocol/pytest/tracing_utils.py
+++ b/eval_protocol/pytest/tracing_utils.py
@@ -171,7 +171,14 @@ def update_row_with_remote_trace(
         row.messages = remote_row.messages
         row.tools = remote_row.tools
         row.input_metadata.session_data = remote_row.input_metadata.session_data
-        row.input_metadata.dataset_info = remote_row.input_metadata.dataset_info
+        remote_info = remote_row.input_metadata.dataset_info or {}
+        if row.input_metadata.dataset_info is None:
+            row.input_metadata.dataset_info = dict(remote_info)
+        else:
+            for k, v in remote_info.items():
+                if k not in row.input_metadata.dataset_info:
+                    row.input_metadata.dataset_info[k] = v
+
         row.execution_metadata = remote_row.execution_metadata
         return None
     else:

From b9a87e72110d53063fefac201973d8ca2c50a6a1 Mon Sep 17 00:00:00 2001
From: Derek Xu
Date: Tue, 25 Nov 2025 10:13:00 -0800
Subject: [PATCH 2/2] add test

---
Review notes (this region is ignored when the patch is applied):
  * rows() now seeds dataset_info with three keys; the test asserts those
    three values are unchanged after the rollout and that the keys
    "data_loader_type" and "data_loader_num_rows" are also present.
    Where those two keys are produced is not visible in this patch.
  * NOTE(review): rows() returns the same EvaluationRow object three times.
    Because PATCH 1/2 merges into the existing dataset_info dict in place,
    the three entries would share one dict unless rows are copied somewhere
    before use -- confirm against the caller.

 tests/remote_server/test_remote_fireworks.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/remote_server/test_remote_fireworks.py b/tests/remote_server/test_remote_fireworks.py
index 4ef67f8e..db5fdb49 100644
--- a/tests/remote_server/test_remote_fireworks.py
+++ b/tests/remote_server/test_remote_fireworks.py
@@ -94,7 +94,16 @@ def fireworks_output_data_loader(config: DataLoaderConfig) -> DynamicDataLoader:
 
 
 def rows() -> List[EvaluationRow]:
+    """Generate local rows with rich input_metadata to verify it survives remote traces."""
+    base_dataset_info = {
+        "requirements": ["Answer with the capital city of France."],
+        "total_requirements": 1,
+        "original_prompt": "What is the capital of France?",
+    }
+
     row = EvaluationRow(messages=[Message(role="user", content="What is the capital of France?")])
+    row.input_metadata.dataset_info = dict(base_dataset_info)
+
     return [row, row, row]
 
 
@@ -127,6 +136,17 @@ async def test_remote_rollout_and_fetch_fireworks(row: EvaluationRow) -> Evaluat
     assert row.execution_metadata.rollout_id in ROLLOUT_IDS, (
         f"Row rollout_id {row.execution_metadata.rollout_id} should be in tracked rollout_ids: {ROLLOUT_IDS}"
     )
+
     assert row.input_metadata.completion_params["model"] == "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"
     assert row.input_metadata.completion_params["temperature"] == 0.5, "Row should have temperature at top level"
+    assert row.input_metadata.row_id is not None
+
+    assert row.input_metadata.dataset_info is not None
+    assert row.input_metadata.dataset_info["requirements"] == ["Answer with the capital city of France."]
+    assert row.input_metadata.dataset_info["total_requirements"] == 1
+    assert row.input_metadata.dataset_info["original_prompt"] == "What is the capital of France?"
+
+    assert "data_loader_type" in row.input_metadata.dataset_info
+    assert "data_loader_num_rows" in row.input_metadata.dataset_info
+
     return row