From 70b70cb378910b0add7d0a206fecc56ceb5b8379 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Mon, 24 Nov 2025 22:16:04 -0800 Subject: [PATCH 1/6] add --- eval_protocol/data_loader/models.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/eval_protocol/data_loader/models.py b/eval_protocol/data_loader/models.py index 0179272e..a2c0b8af 100644 --- a/eval_protocol/data_loader/models.py +++ b/eval_protocol/data_loader/models.py @@ -111,7 +111,7 @@ def _process_variant(self, result: DataLoaderResult) -> DataLoaderResult: def _apply_metadata(self, result: DataLoaderResult, original_count: int, processed_count: int) -> None: """Apply metadata to all rows in the result.""" - for row in result.rows: + for idx, row in enumerate(result.rows): if row.input_metadata.dataset_info is None: row.input_metadata.dataset_info = {} @@ -126,3 +126,4 @@ def _apply_metadata(self, result: DataLoaderResult, original_count: int, process # Apply row counts row.input_metadata.dataset_info["data_loader_num_rows"] = original_count row.input_metadata.dataset_info["data_loader_num_rows_after_preprocessing"] = processed_count + row.input_metadata.dataset_info["data_loader_row_idx"] = idx From 6769bf533348cb7302813de6e46c3a3095746080 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Mon, 24 Nov 2025 22:40:43 -0800 Subject: [PATCH 2/6] respect original line number from the source dataloader --- eval_protocol/pytest/tracing_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eval_protocol/pytest/tracing_utils.py b/eval_protocol/pytest/tracing_utils.py index 0382ba40..0eea0c2e 100644 --- a/eval_protocol/pytest/tracing_utils.py +++ b/eval_protocol/pytest/tracing_utils.py @@ -171,6 +171,7 @@ def update_row_with_remote_trace( row.messages = remote_row.messages row.tools = remote_row.tools row.input_metadata.session_data = remote_row.input_metadata.session_data + row.input_metadata.dataset_info = remote_row.input_metadata.dataset_info row.execution_metadata = remote_row.execution_metadata return None else: From 916e8982c1ef3149e5b5479648c49caf5f598f7a Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Mon, 24 Nov 2025 22:49:52 -0800 Subject: [PATCH 3/6] add ut --- .../test_data_loader_stable_row_id.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 tests/data_loader/test_data_loader_stable_row_id.py diff --git a/tests/data_loader/test_data_loader_stable_row_id.py b/tests/data_loader/test_data_loader_stable_row_id.py new file mode 100644 index 00000000..d9e7d80e --- /dev/null +++ b/tests/data_loader/test_data_loader_stable_row_id.py @@ -0,0 +1,22 @@ +from eval_protocol.data_loader.dynamic_data_loader import DynamicDataLoader +from eval_protocol.models import EvaluationRow, Message, EvaluateResult +from eval_protocol.pytest import evaluation_test +from typing import List + +def generator() -> list[EvaluationRow]: + return [EvaluationRow(messages=[Message(role="user", content="What is 2 + 2?")])] * 2 + +@evaluation_test( + data_loaders=DynamicDataLoader( + generators=[generator], + ), + mode="all", +) +def test_data_loader_stable_row_id(rows: List[EvaluationRow]) -> List[EvaluationRow]: + """Test that the row id is stable even when the data loader is called multiple times.""" + row_ids = set() + for row in rows: + row_ids.add(row.input_metadata.row_id) + row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") + assert len(row_ids) == 2 + return rows \ No newline at end of file From 312503433b623925118eba95496598e1157a691c Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Mon, 24 Nov 2025 22:58:13 -0800 Subject: [PATCH 4/6] add --- tests/data_loader/test_data_loader_stable_row_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data_loader/test_data_loader_stable_row_id.py b/tests/data_loader/test_data_loader_stable_row_id.py index d9e7d80e..0e48f195 100644 --- a/tests/data_loader/test_data_loader_stable_row_id.py +++ b/tests/data_loader/test_data_loader_stable_row_id.py @@ -12,7 +12,7 @@ def generator() -> list[EvaluationRow]: ), mode="all", ) -def test_data_loader_stable_row_id(rows: List[EvaluationRow]) -> List[EvaluationRow]: +def test_data_loader_stable_row_id_with_same_content(rows: List[EvaluationRow]) -> List[EvaluationRow]: """Test that the row id is stable even when the data loader is called multiple times.""" row_ids = set() for row in rows: From ecb05e0db75d97ab2c28192242fb47678f5ac55a Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Mon, 24 Nov 2025 23:06:38 -0800 Subject: [PATCH 5/6] add --- tests/data_loader/test_data_loader_stable_row_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data_loader/test_data_loader_stable_row_id.py b/tests/data_loader/test_data_loader_stable_row_id.py index 0e48f195..46b0d933 100644 --- a/tests/data_loader/test_data_loader_stable_row_id.py +++ b/tests/data_loader/test_data_loader_stable_row_id.py @@ -19,4 +19,4 @@ def test_data_loader_stable_row_id_with_same_content(rows: List[EvaluationRow]) row_ids.add(row.input_metadata.row_id) row.evaluation_result = EvaluateResult(score=0.0, reason="Dummy evaluation result") assert len(row_ids) == 2 - return rows \ No newline at end of file + return rows From 93ebb15cf00027f548f3aef1c85d6a2c5ae0205e Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Mon, 24 Nov 2025 23:39:58 -0800 Subject: [PATCH 6/6] fix --- tests/data_loader/test_data_loader_stable_row_id.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data_loader/test_data_loader_stable_row_id.py b/tests/data_loader/test_data_loader_stable_row_id.py index 46b0d933..d9aaab96 100644 --- a/tests/data_loader/test_data_loader_stable_row_id.py +++ b/tests/data_loader/test_data_loader_stable_row_id.py @@ -4,7 +4,7 @@ from typing import List def generator() -> list[EvaluationRow]: - return [EvaluationRow(messages=[Message(role="user", content="What is 2 + 2?")])] * 2 + return [EvaluationRow(messages=[Message(role="user", content="What is 2 + 2?")]) for _ in range(2)] @evaluation_test( data_loaders=DynamicDataLoader(