eval-protocol · dphuang2 · Sep 24, 2025 · Sep 24, 2025
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -284,8 +284,13 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo
                     to specially handle data_loaders here so we don't double
                     apply preprocess_fn.
                     """
-                    if preprocess_fn and not data_loaders:
-                        data = preprocess_fn(data)
+                    if preprocess_fn:
+                        if not data_loaders:
+                            data = preprocess_fn(data)
+                        else:
+                            raise ValueError(
+                                "preprocess_fn should not be used with data_loaders. Pass preprocess_fn to data_loaders instead."
+                            )
 
                     for row in data:
                         # generate a stable row_id for each row

diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py
@@ -54,9 +54,9 @@ def braintrust_data_generator():
 @evaluation_test(
     data_loaders=DynamicDataLoader(
         generators=[braintrust_data_generator],
+        preprocess_fn=multi_turn_assistant_to_ground_truth,
     ),
     rollout_processor=SingleTurnRolloutProcessor(),
-    preprocess_fn=multi_turn_assistant_to_ground_truth,
     max_concurrent_evaluations=2,
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:

diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py
@@ -51,9 +51,9 @@ def langfuse_data_generator():
 @evaluation_test(
     data_loaders=DynamicDataLoader(
         generators=[langfuse_data_generator],
+        preprocess_fn=multi_turn_assistant_to_ground_truth,
     ),
     rollout_processor=SingleTurnRolloutProcessor(),
-    preprocess_fn=multi_turn_assistant_to_ground_truth,
     max_concurrent_evaluations=2,
 )
 async def test_llm_judge(row: EvaluationRow) -> EvaluationRow:

diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py
@@ -68,6 +68,6 @@ def langsmith_data_generator() -> List[EvaluationRow]:
 @evaluation_test(
     data_loaders=DynamicDataLoader(
         generators=[langsmith_data_generator],
+        preprocess_fn=multi_turn_assistant_to_ground_truth,
     ),
     rollout_processor=SingleTurnRolloutProcessor(),
-    preprocess_fn=multi_turn_assistant_to_ground_truth,

diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py
@@ -57,9 +57,9 @@ def openai_responses_data_generator():
 @evaluation_test(
     data_loaders=DynamicDataLoader(
         generators=[openai_responses_data_generator],
+        preprocess_fn=multi_turn_assistant_to_ground_truth,
     ),
     rollout_processor=SingleTurnRolloutProcessor(),
-    preprocess_fn=multi_turn_assistant_to_ground_truth,
     max_concurrent_evaluations=2,
 )
 async def test_llm_judge_openai_responses(row: EvaluationRow) -> EvaluationRow: