diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index a60ca51b..d9dfe811 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -284,8 +284,13 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo to specially handle data_loaders here so we don't double apply preprocess_fn. """ - if preprocess_fn and not data_loaders: - data = preprocess_fn(data) + if preprocess_fn: + if not data_loaders: + data = preprocess_fn(data) + else: + raise ValueError( + "preprocess_fn should not be used with data_loaders. Pass preprocess_fn to data_loaders instead." + ) for row in data: # generate a stable row_id for each row diff --git a/eval_protocol/quickstart/llm_judge_braintrust.py b/eval_protocol/quickstart/llm_judge_braintrust.py index 01619bcb..88ad7c26 100644 --- a/eval_protocol/quickstart/llm_judge_braintrust.py +++ b/eval_protocol/quickstart/llm_judge_braintrust.py @@ -54,9 +54,9 @@ def braintrust_data_generator(): @evaluation_test( data_loaders=DynamicDataLoader( generators=[braintrust_data_generator], + preprocess_fn=multi_turn_assistant_to_ground_truth, ), rollout_processor=SingleTurnRolloutProcessor(), - preprocess_fn=multi_turn_assistant_to_ground_truth, max_concurrent_evaluations=2, ) async def test_llm_judge(row: EvaluationRow) -> EvaluationRow: diff --git a/eval_protocol/quickstart/llm_judge_langfuse.py b/eval_protocol/quickstart/llm_judge_langfuse.py index bdcdb4dd..21459494 100644 --- a/eval_protocol/quickstart/llm_judge_langfuse.py +++ b/eval_protocol/quickstart/llm_judge_langfuse.py @@ -51,9 +51,9 @@ def langfuse_data_generator(): @evaluation_test( data_loaders=DynamicDataLoader( generators=[langfuse_data_generator], + preprocess_fn=multi_turn_assistant_to_ground_truth, ), rollout_processor=SingleTurnRolloutProcessor(), - preprocess_fn=multi_turn_assistant_to_ground_truth, max_concurrent_evaluations=2, ) async def test_llm_judge(row: 
EvaluationRow) -> EvaluationRow: diff --git a/eval_protocol/quickstart/llm_judge_langsmith.py b/eval_protocol/quickstart/llm_judge_langsmith.py index be78b28f..20d12730 100644 --- a/eval_protocol/quickstart/llm_judge_langsmith.py +++ b/eval_protocol/quickstart/llm_judge_langsmith.py @@ -68,6 +68,6 @@ def langsmith_data_generator() -> List[EvaluationRow]: @evaluation_test( data_loaders=DynamicDataLoader( generators=[langsmith_data_generator], + preprocess_fn=multi_turn_assistant_to_ground_truth, ), rollout_processor=SingleTurnRolloutProcessor(), - preprocess_fn=multi_turn_assistant_to_ground_truth, diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index a2b6c7c9..aafc8fb7 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -57,9 +57,9 @@ def openai_responses_data_generator(): @evaluation_test( data_loaders=DynamicDataLoader( generators=[openai_responses_data_generator], + preprocess_fn=multi_turn_assistant_to_ground_truth, ), rollout_processor=SingleTurnRolloutProcessor(), - preprocess_fn=multi_turn_assistant_to_ground_truth, max_concurrent_evaluations=2, ) async def test_llm_judge_openai_responses(row: EvaluationRow) -> EvaluationRow: