From aa6077cd04401ce7e80b9d0cb62c5b0b93070b48 Mon Sep 17 00:00:00 2001
From: Dylan Huang
Date: Wed, 17 Sep 2025 15:42:29 -0700
Subject: [PATCH 1/5] v2 proposal

---
 development/notes/eval_protocol_v2.md | 277 ++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 development/notes/eval_protocol_v2.md

diff --git a/development/notes/eval_protocol_v2.md b/development/notes/eval_protocol_v2.md
new file mode 100644
index 00000000..a8b8b130
--- /dev/null
+++ b/development/notes/eval_protocol_v2.md
@@ -0,0 +1,277 @@
+This note gives complete examples for both approaches: pointwise (100 separate tests) and batch (the entire dataset passed to a single evaluator).
+
+## Complete Examples
+
+### 1. Pointwise Evaluator (100 separate tests)
+
+```python
+# conftest.py
+import pytest
+from eval_protocol.framework import EvaluationFramework, EvaluationRow
+
+MATH_DATASET = [
+    {"problem": "What is 2+2?", "answer": "4"},
+    {"problem": "What is 3*3?", "answer": "9"},
+    {"problem": "What is 10/2?", "answer": "5"},
+    {"problem": "What is 15-7?", "answer": "8"},
+    {"problem": "What is 6*7?", "answer": "42"},
+    # ... 95 more rows
+]
+
+@pytest.fixture
+def math_dataset():
+    """Raw math dataset fixture"""
+    return MATH_DATASET
+
+@pytest.fixture
+def preprocess_fn():
+    """Preprocessing function for the dataset"""
+    def _preprocess(item):
+        return {
+            "messages": [{"role": "user", "content": item["problem"]}],
+            "expected_answer": item["answer"]
+        }
+    return _preprocess
+
+@pytest.fixture(params=[
+    {"model": "gpt-4", "temperature": 0.7, "max_tokens": 100},
+    {"model": "gpt-3.5-turbo", "temperature": 0.5, "max_tokens": 100},
+    {"model": "claude-3", "temperature": 0.3, "max_tokens": 100}
+])
+def completion_params(request):
+    """Completion parameters - parametrized across different models"""
+    return request.param
+
+# Pointwise fixture - parametrized across BOTH completion params AND dataset rows (async; needs pytest-asyncio)
+@pytest.fixture(params=range(len(MATH_DATASET)))
+async def evaluation_row_pointwise(math_dataset, preprocess_fn, completion_params, request):
+    """Single evaluation row - parametrized across completion params AND dataset rows"""
+    framework = EvaluationFramework()
+
+    # Get the specific row based on parametrization
+    row_index = request.param
+    raw_item = math_dataset[row_index]
+    processed_item = preprocess_fn(raw_item)
+
+    # Run the completion
+    result = await framework.run_completion(processed_item, completion_params)
+
+    return EvaluationRow(
+        input_data=processed_item,
+        completion_params=completion_params,
+        completion_response=result
+    )
+
+# Batch fixture - parametrized across completion params only
+@pytest.fixture
+async def evaluation_rows_batch(math_dataset, preprocess_fn, completion_params):
+    """All evaluation rows - parametrized across completion params only"""
+    framework = EvaluationFramework()
+
+    # Process all rows
+    processed_items = [preprocess_fn(item) for item in math_dataset]
+
+    # Run completions for all rows
+    results = []
+    for item in processed_items:
+        result = await framework.run_completion(item, completion_params)
+        results.append(EvaluationRow(
+            input_data=item,
+            completion_params=completion_params,
+            completion_response=result
+        ))
+
+    return results
+```
+
+```python
+# test_math_evaluation.py
+import pytest
+import re
+from eval_protocol.framework import EvaluationRow
+
+# POINTWISE EVALUATOR - 100 separate tests (one per row per model)
+def test_math_accuracy_pointwise(evaluation_row_pointwise):
+    """Pointwise evaluator - runs once per row per completion param"""
+    response = evaluation_row_pointwise.completion_response
+    expected = evaluation_row_pointwise.input_data["expected_answer"]
+
+    # Extract numeric answer from response
+    numbers = re.findall(r'-?\d+\.?\d*', response)
+    if not numbers:
+        pytest.fail(f"Could not extract number from response: {response}")
+
+    predicted = float(numbers[0])
+    expected_num = float(expected)
+
+    # Assert the answer is correct
+    assert abs(predicted - expected_num) < 0.01, \
+        f"Expected {expected_num}, got {predicted} in response: {response}"
+
+# BATCH EVALUATOR - 3 tests total (one per model)
+def test_math_accuracy_batch(evaluation_rows_batch):
+    """Batch evaluator - runs once per completion param with all rows"""
+    total_correct = 0
+    total_samples = len(evaluation_rows_batch)
+    failed_rows = []
+
+    for i, row in enumerate(evaluation_rows_batch):
+        response = row.completion_response
+        expected = row.input_data["expected_answer"]
+
+        # Extract numeric answer
+        numbers = re.findall(r'-?\d+\.?\d*', response)
+        if not numbers:
+            failed_rows.append({
+                "index": i,
+                "problem": row.input_data["messages"][0]["content"],
+                "expected": expected,
+                "response": response,
+                "error": "Could not extract number"
+            })
+            continue
+
+        predicted = float(numbers[0])
+        expected_num = float(expected)
+
+        if abs(predicted - expected_num) < 0.01:
+            total_correct += 1
+        else:
+            failed_rows.append({
+                "index": i,
+                "problem": row.input_data["messages"][0]["content"],
+                "expected": expected,
+                "predicted": predicted,
+                "response": response,
+                "error": f"Expected {expected_num}, got {predicted}"
+            })
+
+    # Calculate accuracy
+    accuracy = total_correct / total_samples
+
+    # Print detailed results for debugging
+    print(f"\nBatch Evaluation Results:")
+    print(f"Total samples: {total_samples}")
+    print(f"Correct: {total_correct}")
+    print(f"Accuracy: {accuracy:.2f}")
+
+    if failed_rows:
+        print(f"\nFailed rows ({len(failed_rows)}):")
+        for row in failed_rows[:10]:  # Show first 10 failures
+            print(f"  Row {row['index']}: {row['problem']} -> {row.get('predicted', 'N/A')} (expected: {row['expected']})")
+        if len(failed_rows) > 10:
+            print(f"  ... and {len(failed_rows) - 10} more failures")
+
+    # Assertions
+    assert accuracy > 0.8, f"Accuracy {accuracy:.2f} is too low, expected > 0.8"
+    assert total_correct > 0, "No correct answers found"
+
+# Additional batch evaluator with model-specific assertions
+def test_math_accuracy_with_model_info(evaluation_rows_batch):
+    """Batch evaluator with model-specific assertions"""
+    model = evaluation_rows_batch[0].completion_params["model"]
+    temperature = evaluation_rows_batch[0].completion_params["temperature"]
+
+    total_correct = 0
+    for row in evaluation_rows_batch:
+        response = row.completion_response
+        expected = row.input_data["expected_answer"]
+
+        numbers = re.findall(r'-?\d+\.?\d*', response)
+        if numbers:
+            predicted = float(numbers[0])
+            expected_num = float(expected)
+            if abs(predicted - expected_num) < 0.01:
+                total_correct += 1
+
+    accuracy = total_correct / len(evaluation_rows_batch)
+
+    # Model-specific assertions
+    if model == "gpt-4":
+        assert accuracy > 0.9, f"GPT-4 accuracy {accuracy:.2f} is too low"
+    elif model == "gpt-3.5-turbo":
+        assert accuracy > 0.8, f"GPT-3.5 accuracy {accuracy:.2f} is too low"
+    elif model == "claude-3":
+        assert accuracy > 0.85, f"Claude-3 accuracy {accuracy:.2f} is too low"
+
+    print(f"Model: {model}, Temperature: {temperature}, Accuracy: {accuracy:.2f}")
+
+# Optional: Debug function for specific rows
+def test_math_accuracy_debug_specific_rows(evaluation_rows_batch):
+    """Debug function to test specific rows - only runs on first few rows"""
+    # Only test first 5 rows for debugging
+    debug_rows = evaluation_rows_batch[:5]
+
+    for i, row in enumerate(debug_rows):
+        response = row.completion_response
+        expected = row.input_data["expected_answer"]
+
+        numbers = re.findall(r'-?\d+\.?\d*', response)
+        if not numbers:
+            pytest.fail(f"Row {i}: Could not extract number from response: {response}")
+
+        predicted = float(numbers[0])
+        expected_num = float(expected)
+
+        assert abs(predicted - expected_num) < 0.01, \
+            f"Row {i}: Expected {expected_num}, got {predicted} in response: {response}"
+```
+
+### 2. Running the Tests
+
+```bash
+# Run pointwise evaluator (100 rows × 3 models = 300 tests)
+pytest test_math_evaluation.py::test_math_accuracy_pointwise -v
+
+# Run batch evaluator (3 models = 3 tests)
+pytest test_math_evaluation.py::test_math_accuracy_batch -v
+
+# Run all tests (300 pointwise + 3 batch + 3 model-info + 3 debug = 309 tests total)
+pytest test_math_evaluation.py -v
+
+# Run with specific model
+pytest test_math_evaluation.py -k "gpt-4" -v
+
+# Run only batch tests
+pytest test_math_evaluation.py -k "batch" -v
+
+# Run only pointwise tests
+pytest test_math_evaluation.py -k "pointwise" -v
+```
+
+### 3. Expected Output
+
+**Pointwise evaluator output:**
+```
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-0] PASSED
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-1] PASSED
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-2] PASSED
+# ... 97 more tests for completion_params0
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params1-0] PASSED
+# ... 100 tests for completion_params1
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params2-0] PASSED
+# ... 100 tests for completion_params2
+```
+
+**Batch evaluator output:**
+```
+test_math_evaluation.py::test_math_accuracy_batch[completion_params0] PASSED
+test_math_evaluation.py::test_math_accuracy_batch[completion_params1] PASSED
+test_math_evaluation.py::test_math_accuracy_batch[completion_params2] PASSED
+```
+
+### 4. Key Differences
+
+**Pointwise Evaluator:**
+- **Test count**: 100 rows × 3 models = 300 tests
+- **Benefits**: Easy to debug individual rows, clear failure reporting per row
+- **Use case**: When you want to see exactly which rows fail and why
+- **Pytest output**: Each row gets its own test result
+
+**Batch Evaluator:**
+- **Test count**: 3 models = 3 tests
+- **Benefits**: Faster execution, easier to manage, good for overall accuracy
+- **Use case**: When you care about overall performance across the dataset
+- **Pytest output**: One test result per model with detailed internal reporting
+
+Both approaches give you the flexibility to choose the right evaluation strategy for your use case while maintaining the pytest-native approach!

From 8e406ee8169b3151d57ce489f35b3ccea0cc4f8b Mon Sep 17 00:00:00 2001
From: Dylan Huang
Date: Thu, 18 Sep 2025 14:32:11 -0700
Subject: [PATCH 2/5] allow for manual parametrization using pytest

---
 eval_protocol/__init__.py                      |  2 +
 eval_protocol/pytest/evaluation_test.py        | 13 ++-
 eval_protocol/pytest/parameterize.py           | 78 ++++++++++++----
 .../quickstart/llm_judge_openai_responses.py   | 12 ++-
 tests/pytest/test_parameterized_ids.py         | 93 +++++++++++++------
 5 files changed, 147 insertions(+), 51 deletions(-)

diff --git a/eval_protocol/__init__.py b/eval_protocol/__init__.py
index cd1efd2c..e6c000d2 100644
--- a/eval_protocol/__init__.py
+++ b/eval_protocol/__init__.py
@@ -39,6 +39,7 @@ from .typed_interface import reward_function
 from .quickstart import aha_judge, split_multi_turn_rows
 from .pytest import evaluation_test, SingleTurnRolloutProcessor
+from .pytest.parameterize import DefaultParameterIdGenerator
 from .adapters import OpenAIResponsesAdapter
@@ -61,6 +62,7 @@ warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
 __all__ = [
+    "DefaultParameterIdGenerator",
     "aha_judge",
     "split_multi_turn_rows",
     "evaluation_test",

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index a7ec65f3..0def715b 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -158,8 +158,14 @@ def evaluation_test(
         exception_handler_config: Configuration for exception handling and backoff retry logic.
             If not provided, a default configuration will be used with common retryable exceptions.
     """
+    # Default to [None] when completion_params is not provided
+    # This allows evaluation-only tests (e.g., using NoOpRolloutProcessor)
+    # to work without requiring model generation parameters
     if completion_params is None:
+        completion_params_provided = False
         completion_params = [None]
+    else:
+        completion_params_provided = True
 
     if rollout_processor is None:
         rollout_processor = NoOpRolloutProcessor()
@@ -201,6 +207,7 @@ def decorator(
         combinations,
         input_dataset,
         completion_params,
+        completion_params_provided,
         input_messages,
         input_rows,
         evaluation_test_kwargs,
@@ -565,12 +572,14 @@ async def execute_run_with_progress(run_idx: int, config):
         return create_dynamically_parameterized_wrapper(
             test_func,
             wrapper_body,
-            pytest_parametrize_args["argnames"],
+            pytest_parametrize_args["sig_parameters"],
         )
 
     # Create the pytest wrapper
     pytest_wrapper = create_wrapper_with_signature()
-    pytest_wrapper = pytest.mark.parametrize(**pytest_parametrize_args)(pytest_wrapper)
+    pytest_wrapper = pytest.mark.parametrize(**pytest_parametrize_args["pytest_parametrize_kwargs"])(
+        pytest_wrapper
+    )
     pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)
 
     # Create the dual mode wrapper

diff --git a/eval_protocol/pytest/parameterize.py b/eval_protocol/pytest/parameterize.py
index cba8f65c..d4089b5d 100644
--- a/eval_protocol/pytest/parameterize.py
+++ b/eval_protocol/pytest/parameterize.py
@@ -9,12 +9,28 @@ from eval_protocol.pytest.types import DatasetPathParam, EvaluationInputParam, InputMessagesParam, TestFunction
 
 
-class PytestParametrizeArgs(TypedDict):
+class PytestMarkParametrizeKwargs(TypedDict):
     argnames: Sequence[str]
     argvalues: Iterable[ParameterSet | Sequence[object] | object]
     ids: Iterable[str] | None
 
 
+class ParametrizeArgs(TypedDict):
+    """
+    This contains all the necessary information to properly hijack the test
+    function's signature and dynamically inject usage of
+    pytest.mark.parametrize. The two will differ when a user manually provides
+    the pytest.mark.parametrize decorator instead of passing completion_params
+    on their own.
+    """
+
+    # for create_dynamically_parameterized_wrapper
+    sig_parameters: Sequence[str]
+
+    # for pytest.mark.parametrize
+    pytest_parametrize_kwargs: PytestMarkParametrizeKwargs
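+
+# Illustrative sketch (comments only, not executed here): the two fields differ
+# when the user supplies parametrization manually, e.g.
+#
+#     @pytest.mark.parametrize("completion_params", [{"model": "gpt-4"}])
+#     @evaluation_test(input_rows=[rows])
+#     def test_fn(row): ...
+#
+# "completion_params" must then appear in sig_parameters (so the generated
+# wrapper's signature accepts it) while pytest_parametrize_kwargs omits it,
+# because pytest already parametrizes that argument.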
""" + # Default to [None] when completion_params is not provided + # This allows evaluation-only tests (e.g., using NoOpRolloutProcessor) + # to work without requiring model generation parameters if completion_params is None: + completion_params_provided = False completion_params = [None] + else: + completion_params_provided = True if rollout_processor is None: rollout_processor = NoOpRolloutProcessor() @@ -201,6 +207,7 @@ def decorator( combinations, input_dataset, completion_params, + completion_params_provided, input_messages, input_rows, evaluation_test_kwargs, @@ -565,12 +572,14 @@ async def execute_run_with_progress(run_idx: int, config): return create_dynamically_parameterized_wrapper( test_func, wrapper_body, - pytest_parametrize_args["argnames"], + pytest_parametrize_args["sig_parameters"], ) # Create the pytest wrapper pytest_wrapper = create_wrapper_with_signature() - pytest_wrapper = pytest.mark.parametrize(**pytest_parametrize_args)(pytest_wrapper) + pytest_wrapper = pytest.mark.parametrize(**pytest_parametrize_args["pytest_parametrize_kwargs"])( + pytest_wrapper + ) pytest_wrapper = pytest.mark.asyncio(pytest_wrapper) # Create the dual mode wrapper diff --git a/eval_protocol/pytest/parameterize.py b/eval_protocol/pytest/parameterize.py index cba8f65c..d4089b5d 100644 --- a/eval_protocol/pytest/parameterize.py +++ b/eval_protocol/pytest/parameterize.py @@ -9,12 +9,28 @@ from eval_protocol.pytest.types import DatasetPathParam, EvaluationInputParam, InputMessagesParam, TestFunction -class PytestParametrizeArgs(TypedDict): +class PytestMarkParametrizeKwargs(TypedDict): argnames: Sequence[str] argvalues: Iterable[ParameterSet | Sequence[object] | object] ids: Iterable[str] | None +class ParametrizeArgs(TypedDict): + """ + This contains all the necessary information to properly hijack the test + function's signature and dynamically inject usage of + pytest.mark.parametrize. The two will differ when a user manually provides + the pytest.mark.parametrize decorator instead of passing completion_params + on their own. + """ + + # for create_dynamically_parameterized_wrapper + sig_parameters: Sequence[str] + + # for pytest.mark.parametrize + pytest_parametrize_kwargs: PytestMarkParametrizeKwargs + + class ParameterIdGenerator(Protocol): """Protocol for generating pytest parameter IDs from parameter combinations.""" @@ -30,7 +46,7 @@ def generate_id(self, combo: CombinationTuple) -> str | None: ... 
-class DefaultParameterIdGenerator: +class DefaultParameterIdGenerator(ParameterIdGenerator): """Default ID generator that creates meaningful IDs from parameter combinations.""" def __init__(self, max_length: int = 200): @@ -46,22 +62,35 @@ def generate_id(self, combo: CombinationTuple) -> str | None: dataset, completion_params, messages, rows, evaluation_test_kwargs = combo if completion_params: - # Get all string, numeric, and boolean values from completion_params, sorted by key - str_values = [] - for key in sorted(completion_params.keys()): - value = completion_params[key] - if isinstance(value, (str, int, float, bool)): - str_values.append(str(value)) + id = self.generate_id_from_dict(completion_params, self.max_length) + if id: + return id + else: + if rows: + return f"rows(len={len(rows)})" + elif messages: + return f"messages(len={len(messages)})" + elif dataset: + return f"dataset(len={len(dataset)})" + return None - if str_values: - id_str = ":".join(str_values) + @staticmethod + def generate_id_from_dict(d: dict[str, object], max_length: int = 200) -> str | None: + # Get all string, numeric, and boolean values from completion_params, sorted by key + str_values = [] + for key in sorted(d.keys()): + value = d[key] + if isinstance(value, (str, int, float, bool)): + str_values.append(str(value)) - # Truncate if too long - if len(id_str) > self.max_length: - id_str = id_str[: self.max_length - 3] + "..." + if str_values: + id_str = ":".join(str_values) - return id_str + # Truncate if too long + if len(id_str) > max_length: + id_str = id_str[: max_length - 3] + "..." + return id_str return None @@ -69,11 +98,12 @@ def pytest_parametrize( combinations: list[CombinationTuple], input_dataset: Sequence[DatasetPathParam] | None, completion_params: Sequence[CompletionParams | None] | None, + completion_params_provided: bool, input_messages: Sequence[list[InputMessagesParam] | None] | None, input_rows: Sequence[list[EvaluationRow]] | None, evaluation_test_kwargs: Sequence[EvaluationInputParam | None] | None, id_generator: ParameterIdGenerator | None = None, -) -> PytestParametrizeArgs: +) -> ParametrizeArgs: """ This function dynamically generates pytest.mark.parametrize arguments for a given set of combinations. 
This is the magic that allows developers to pass in their @@ -84,16 +114,23 @@ def pytest_parametrize( # Create parameter tuples for pytest.mark.parametrize argnames: list[str] = [] + sig_parameters: list[str] = [] if input_dataset is not None: argnames.append("dataset_path") + sig_parameters.append("dataset_path") if completion_params is not None: - argnames.append("completion_params") + if completion_params_provided: + argnames.append("completion_params") + sig_parameters.append("completion_params") if input_messages is not None: argnames.append("input_messages") + sig_parameters.append("input_messages") if input_rows is not None: argnames.append("input_rows") + sig_parameters.append("input_rows") if evaluation_test_kwargs is not None: argnames.append("evaluation_test_kwargs") + sig_parameters.append("evaluation_test_kwargs") # Use default ID generator if none provided if id_generator is None: @@ -109,7 +146,7 @@ def pytest_parametrize( # Build parameter tuple based on what's provided if input_dataset is not None: param_tuple.append(dataset) - if completion_params is not None: + if completion_params_provided: param_tuple.append(cp) if input_messages is not None: param_tuple.append(messages) @@ -132,7 +169,12 @@ def pytest_parametrize( ids.append(combo_id) # Return None for ids if no IDs were generated (let pytest use defaults) - return PytestParametrizeArgs(argnames=argnames, argvalues=argvalues, ids=ids if ids else None) + return ParametrizeArgs( + pytest_parametrize_kwargs=PytestMarkParametrizeKwargs( + argnames=argnames, argvalues=argvalues, ids=ids if ids else None + ), + sig_parameters=sig_parameters, + ) def create_dynamically_parameterized_wrapper( diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index 5d8cb983..06fe502c 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -27,6 +27,7 @@ EvaluationRow, SingleTurnRolloutProcessor, OpenAIResponsesAdapter, + DefaultParameterIdGenerator, ) adapter = OpenAIResponsesAdapter() @@ -41,10 +42,9 @@ @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") -@pytest.mark.asyncio -@evaluation_test( - input_rows=[input_rows], - completion_params=[ +@pytest.mark.parametrize( + "completion_params", + [ { "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", }, @@ -52,6 +52,10 @@ "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", }, ], + ids=DefaultParameterIdGenerator.generate_id_from_dict, +) +@evaluation_test( + input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=split_multi_turn_rows, mode="all", diff --git a/tests/pytest/test_parameterized_ids.py b/tests/pytest/test_parameterized_ids.py index b182bfe5..b0f3d215 100644 --- a/tests/pytest/test_parameterized_ids.py +++ b/tests/pytest/test_parameterized_ids.py @@ -1,12 +1,47 @@ +from collections.abc import Awaitable, Callable + +import pytest from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.parameterize import DefaultParameterIdGenerator, pytest_parametrize from eval_protocol.pytest.generate_parameter_combinations import generate_parameter_combinations +from eval_protocol.pytest.types import TestFunction + + +def verify_parametrize_mark(test_function: TestFunction, expected_ids_set: list[object]): + # The function should exist and be callable + assert test_function is not None + 
assert callable(test_function) + + # Test that the decorator was applied (function should have pytest marks) + import pytest + + marks = getattr(test_function, "pytestmark", []) + assert len(marks) > 0, "Function should have pytest marks from evaluation_test decorator" + + # Verify it's a parametrize mark + parametrize_marks = [mark for mark in marks if hasattr(mark, "name") and mark.name == "parametrize"] + assert len(parametrize_marks) > 0, "Should have parametrize mark" + + assert len(parametrize_marks) == len(expected_ids_set), ( + f"Expected {len(expected_ids_set)} parametrize marks, got {len(parametrize_marks)}" + ) + + # Check that the parametrize mark has IDs + for parametrize_mark, expected_ids in zip(parametrize_marks, expected_ids_set): + assert hasattr(parametrize_mark, "kwargs"), "Parametrize mark should have kwargs" + assert "ids" in parametrize_mark.kwargs, "Should have ids in kwargs" + + # Extract the IDs from the parametrize mark + ids = parametrize_mark.kwargs.get("ids") + if not ids: + raise ValueError("No IDs found in parametrize mark") + # Should have IDs for all parameters that have string/numeric values + assert ids == expected_ids, f"Expected {expected_ids}, got {ids}" def test_parameterized_ids(): """Test that evaluation_test generates proper parameter IDs.""" - collected_ids = [] @evaluation_test( input_messages=[[[Message(role="user", content="Hello, how are you?")]]], @@ -17,35 +52,38 @@ def test_parameterized_ids(): ], ) def test_parameterized_ids(row: EvaluationRow) -> EvaluationRow: - # Collect the row to verify it was processed - collected_ids.append(row.input_metadata.row_id) return row - # The function should exist and be callable - assert test_parameterized_ids is not None - assert callable(test_parameterized_ids) - - # Test that the decorator was applied (function should have pytest marks) - import pytest + verify_parametrize_mark( + test_parameterized_ids, [["fireworks_ai/accounts/fireworks/models/gpt-oss-120b", "gpt-4", "0.5"]] + ) - marks = getattr(test_parameterized_ids, "pytestmark", []) - assert len(marks) > 0, "Function should have pytest marks from evaluation_test decorator" - # Verify it's a parametrize mark - parametrize_marks = [mark for mark in marks if hasattr(mark, "name") and mark.name == "parametrize"] - assert len(parametrize_marks) > 0, "Should have parametrize mark" +def test_parametrized_ids_with_manual_decorator_and_input_rows(): + """Test that evaluation_test generates proper parameter IDs.""" - # Check that the parametrize mark has IDs - parametrize_mark = parametrize_marks[0] - assert hasattr(parametrize_mark, "kwargs"), "Parametrize mark should have kwargs" - assert "ids" in parametrize_mark.kwargs, "Should have ids in kwargs" + @pytest.mark.parametrize( + "completion_params", + [ + {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}, + {"model": "gpt-4"}, + {"temperature": 0.5}, + ], + ids=DefaultParameterIdGenerator.generate_id_from_dict, + ) + @evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="Hello, how are you?")])]], + ) + def test_parameterized_ids(row: EvaluationRow) -> EvaluationRow: + return row - # Extract the IDs from the parametrize mark - ids = parametrize_mark.kwargs.get("ids") - if ids is not None: - # Should have IDs for all parameters that have string/numeric values - expected_ids = ["fireworks_ai/accounts/fireworks/models/gpt-oss-120b", "gpt-4", "0.5"] - assert list(ids) == expected_ids, f"Expected {expected_ids}, got {list(ids)}" + verify_parametrize_mark( + 
test_parameterized_ids, + [ + ["rows(len=1)"], + DefaultParameterIdGenerator.generate_id_from_dict, + ], + ) def test_default_id_generator(): @@ -113,14 +151,15 @@ def test_pytest_parametrize_with_custom_id_generator(): combinations=combinations, input_dataset=None, completion_params=[{"model": "gpt-4"}, {"model": "claude-3"}, {"temperature": 0.5}], + completion_params_provided=True, input_messages=None, input_rows=None, evaluation_test_kwargs=None, ) - assert result["argnames"] == ["completion_params"] - assert len(list(result["argvalues"])) == 3 - assert result["ids"] == ["gpt-4", "claude-3", "0.5"] # All have string/numeric values + assert result["pytest_parametrize_kwargs"]["argnames"] == ["completion_params"] + assert len(list(result["pytest_parametrize_kwargs"]["argvalues"])) == 3 + assert result["pytest_parametrize_kwargs"]["ids"] == ["gpt-4", "claude-3", "0.5"] # All have string/numeric values def test_id_generator_max_length(): From ab6d761df83caf89439d3110c9edb65a55b1fc9c Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 18 Sep 2025 14:33:04 -0700 Subject: [PATCH 3/5] delete proposal --- development/notes/eval_protocol_v2.md | 277 -------------------------- 1 file changed, 277 deletions(-) delete mode 100644 development/notes/eval_protocol_v2.md diff --git a/development/notes/eval_protocol_v2.md b/development/notes/eval_protocol_v2.md deleted file mode 100644 index a8b8b130..00000000 --- a/development/notes/eval_protocol_v2.md +++ /dev/null @@ -1,277 +0,0 @@ -Perfect! Let me give you complete examples for both approaches - pointwise (100 separate tests) and batch (entire dataset passed to evaluator). - -## Complete Examples - -### 1. Pointwise Evaluator (100 separate tests) - -```python -# conftest.py -import pytest -from eval_protocol.framework import EvaluationFramework, EvaluationRow - -MATH_DATASET = [ - {"problem": "What is 2+2?", "answer": "4"}, - {"problem": "What is 3*3?", "answer": "9"}, - {"problem": "What is 10/2?", "answer": "5"}, - {"problem": "What is 15-7?", "answer": "8"}, - {"problem": "What is 6*7?", "answer": "42"}, - # ... 
95 more rows -] - -@pytest.fixture -def math_dataset(): - """Raw math dataset fixture""" - return MATH_DATASET - -@pytest.fixture -def preprocess_fn(): - """Preprocessing function for the dataset""" - def _preprocess(item): - return { - "messages": [{"role": "user", "content": item["problem"]}], - "expected_answer": item["answer"] - } - return _preprocess - -@pytest.fixture(params=[ - {"model": "gpt-4", "temperature": 0.7, "max_tokens": 100}, - {"model": "gpt-3.5-turbo", "temperature": 0.5, "max_tokens": 100}, - {"model": "claude-3", "temperature": 0.3, "max_tokens": 100} -]) -def completion_params(request): - """Completion parameters - parametrized across different models""" - return request.param - -# Pointwise fixture - parametrized across BOTH completion params AND dataset rows -@pytest.fixture(params=range(len(MATH_DATASET))) -def evaluation_row_pointwise(math_dataset, preprocess_fn, completion_params, request): - """Single evaluation row - parametrized across completion params AND dataset rows""" - framework = EvaluationFramework() - - # Get the specific row based on parametrization - row_index = request.param - raw_item = math_dataset[row_index] - processed_item = preprocess_fn(raw_item) - - # Run the completion - result = await framework.run_completion(processed_item, completion_params) - - return EvaluationRow( - input_data=processed_item, - completion_params=completion_params, - completion_response=result - ) - -# Batch fixture - parametrized across completion params only -@pytest.fixture -async def evaluation_rows_batch(math_dataset, preprocess_fn, completion_params): - """All evaluation rows - parametrized across completion params only""" - framework = EvaluationFramework() - - # Process all rows - processed_items = [preprocess_fn(item) for item in math_dataset] - - # Run completions for all rows - results = [] - for item in processed_items: - result = await framework.run_completion(item, completion_params) - results.append(EvaluationRow( - input_data=item, - completion_params=completion_params, - completion_response=result - )) - - return results -``` - -```python -# test_math_evaluation.py -import pytest -import re -from eval_protocol.framework import EvaluationRow - -# POINTWISE EVALUATOR - 100 separate tests (one per row per model) -def test_math_accuracy_pointwise(evaluation_row_pointwise): - """Pointwise evaluator - runs once per row per completion param""" - response = evaluation_row_pointwise.completion_response - expected = evaluation_row_pointwise.input_data["expected_answer"] - - # Extract numeric answer from response - numbers = re.findall(r'-?\d+\.?\d*', response) - if not numbers: - pytest.fail(f"Could not extract number from response: {response}") - - predicted = float(numbers[0]) - expected_num = float(expected) - - # Assert the answer is correct - assert abs(predicted - expected_num) < 0.01, \ - f"Expected {expected_num}, got {predicted} in response: {response}" - -# BATCH EVALUATOR - 3 tests total (one per model) -def test_math_accuracy_batch(evaluation_rows_batch): - """Batch evaluator - runs once per completion param with all rows""" - total_correct = 0 - total_samples = len(evaluation_rows_batch) - failed_rows = [] - - for i, row in enumerate(evaluation_rows_batch): - response = row.completion_response - expected = row.input_data["expected_answer"] - - # Extract numeric answer - numbers = re.findall(r'-?\d+\.?\d*', response) - if not numbers: - failed_rows.append({ - "index": i, - "problem": row.input_data["messages"][0]["content"], - "expected": expected, 
- "response": response, - "error": "Could not extract number" - }) - continue - - predicted = float(numbers[0]) - expected_num = float(expected) - - if abs(predicted - expected_num) < 0.01: - total_correct += 1 - else: - failed_rows.append({ - "index": i, - "problem": row.input_data["messages"][0]["content"], - "expected": expected, - "predicted": predicted, - "response": response, - "error": f"Expected {expected_num}, got {predicted}" - }) - - # Calculate accuracy - accuracy = total_correct / total_samples - - # Print detailed results for debugging - print(f"\nBatch Evaluation Results:") - print(f"Total samples: {total_samples}") - print(f"Correct: {total_correct}") - print(f"Accuracy: {accuracy:.2f}") - - if failed_rows: - print(f"\nFailed rows ({len(failed_rows)}):") - for row in failed_rows[:10]: # Show first 10 failures - print(f" Row {row['index']}: {row['problem']} -> {row.get('predicted', 'N/A')} (expected: {row['expected']})") - if len(failed_rows) > 10: - print(f" ... and {len(failed_rows) - 10} more failures") - - # Assertions - assert accuracy > 0.8, f"Accuracy {accuracy:.2f} is too low, expected > 0.8" - assert total_correct > 0, "No correct answers found" - -# Additional batch evaluator with model-specific assertions -def test_math_accuracy_with_model_info(evaluation_rows_batch): - """Batch evaluator with model-specific assertions""" - model = evaluation_rows_batch[0].completion_params["model"] - temperature = evaluation_rows_batch[0].completion_params["temperature"] - - total_correct = 0 - for row in evaluation_rows_batch: - response = row.completion_response - expected = row.input_data["expected_answer"] - - numbers = re.findall(r'-?\d+\.?\d*', response) - if numbers: - predicted = float(numbers[0]) - expected_num = float(expected) - if abs(predicted - expected_num) < 0.01: - total_correct += 1 - - accuracy = total_correct / len(evaluation_rows_batch) - - # Model-specific assertions - if model == "gpt-4": - assert accuracy > 0.9, f"GPT-4 accuracy {accuracy:.2f} is too low" - elif model == "gpt-3.5-turbo": - assert accuracy > 0.8, f"GPT-3.5 accuracy {accuracy:.2f} is too low" - elif model == "claude-3": - assert accuracy > 0.85, f"Claude-3 accuracy {accuracy:.2f} is too low" - - print(f"Model: {model}, Temperature: {temperature}, Accuracy: {accuracy:.2f}") - -# Optional: Debug function for specific rows -def test_math_accuracy_debug_specific_rows(evaluation_rows_batch): - """Debug function to test specific rows - only runs on first few rows""" - # Only test first 5 rows for debugging - debug_rows = evaluation_rows_batch[:5] - - for i, row in enumerate(debug_rows): - response = row.completion_response - expected = row.input_data["expected_answer"] - - numbers = re.findall(r'-?\d+\.?\d*', response) - if not numbers: - pytest.fail(f"Row {i}: Could not extract number from response: {response}") - - predicted = float(numbers[0]) - expected_num = float(expected) - - assert abs(predicted - expected_num) < 0.01, \ - f"Row {i}: Expected {expected_num}, got {predicted} in response: {response}" -``` - -### 2. 
Running the Tests - -```bash -# Run pointwise evaluator (100 rows × 3 models = 300 tests) -pytest test_math_evaluation.py::test_math_accuracy_pointwise -v - -# Run batch evaluator (3 models = 3 tests) -pytest test_math_evaluation.py::test_math_accuracy_batch -v - -# Run all tests (300 + 3 = 303 tests total) -pytest test_math_evaluation.py -v - -# Run with specific model -pytest test_math_evaluation.py -k "gpt-4" -v - -# Run only batch tests -pytest test_math_evaluation.py -k "batch" -v - -# Run only pointwise tests -pytest test_math_evaluation.py -k "pointwise" -v -``` - -### 3. Expected Output - -**Pointwise evaluator output:** -``` -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-0] PASSED -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-1] PASSED -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-2] PASSED -# ... 97 more tests for completion_params0 -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params1-0] PASSED -# ... 100 tests for completion_params1 -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params2-0] PASSED -# ... 100 tests for completion_params2 -``` - -**Batch evaluator output:** -``` -test_math_evaluation.py::test_math_accuracy_batch[completion_params0] PASSED -test_math_evaluation.py::test_math_accuracy_batch[completion_params1] PASSED -test_math_evaluation.py::test_math_accuracy_batch[completion_params2] PASSED -``` - -### 4. Key Differences - -**Pointwise Evaluator:** -- **Test count**: 100 rows × 3 models = 300 tests -- **Benefits**: Easy to debug individual rows, clear failure reporting per row -- **Use case**: When you want to see exactly which rows fail and why -- **Pytest output**: Each row gets its own test result - -**Batch Evaluator:** -- **Test count**: 3 models = 3 tests -- **Benefits**: Faster execution, easier to manage, good for overall accuracy -- **Use case**: When you care about overall performance across the dataset -- **Pytest output**: One test result per model with detailed internal reporting - -Both approaches give you the flexibility to choose the right evaluation strategy for your use case while maintaining the pytest-native approach! 
From 94ae1b3ddef8956a4a9a6395edd359f288913134 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 18 Sep 2025 16:01:55 -0700 Subject: [PATCH 4/5] test_import_logs works --- eval_protocol/pytest/evaluation_test.py | 3 +- .../pytest/generate_parameter_combinations.py | 2 +- eval_protocol/pytest/parameterize.py | 117 +++++++++++++++++- .../quickstart/llm_judge_openai_responses.py | 1 - tests/pytest/test_parameterized_ids.py | 1 + 5 files changed, 119 insertions(+), 5 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 0def715b..e51d008b 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -205,6 +205,7 @@ def decorator( # Create parameter tuples for pytest.mark.parametrize pytest_parametrize_args = pytest_parametrize( combinations, + test_func, input_dataset, completion_params, completion_params_provided, @@ -268,7 +269,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo index = abs(index) % (max_index + 1) row.input_metadata.row_id = generate_id(seed=0, index=index) - completion_params = kwargs["completion_params"] + completion_params = kwargs["completion_params"] if "completion_params" in kwargs else None # Create eval metadata with test function info and current commit hash eval_metadata = EvalMetadata( name=test_func.__name__, diff --git a/eval_protocol/pytest/generate_parameter_combinations.py b/eval_protocol/pytest/generate_parameter_combinations.py index 6a1dcf2f..99c37b74 100644 --- a/eval_protocol/pytest/generate_parameter_combinations.py +++ b/eval_protocol/pytest/generate_parameter_combinations.py @@ -31,7 +31,7 @@ ] -class ParameterizedTestKwargs(TypedDict): +class ParameterizedTestKwargs(TypedDict, total=False): """ These are the type of parameters that can be passed to the generated pytest function. Every experiment is a unique combination of these parameters. diff --git a/eval_protocol/pytest/parameterize.py b/eval_protocol/pytest/parameterize.py index d4089b5d..a2140da5 100644 --- a/eval_protocol/pytest/parameterize.py +++ b/eval_protocol/pytest/parameterize.py @@ -1,3 +1,4 @@ +import ast import inspect from typing import TypedDict, Protocol from collections.abc import Callable, Sequence, Iterable, Awaitable @@ -9,6 +10,111 @@ from eval_protocol.pytest.types import DatasetPathParam, EvaluationInputParam, InputMessagesParam, TestFunction +def _has_pytest_parametrize_with_completion_params(test_func: TestFunction) -> bool: + """ + Check if a test function has a pytest.mark.parametrize decorator with argnames="completion_params". + + This function uses inspect.getsource and ast to parse the function's source code and look for + pytest.mark.parametrize decorators that include "completion_params" in their argnames. 
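+
+    For example (illustrative only), decorators of these shapes are detected:
+
+        @pytest.mark.parametrize("completion_params", [...])
+        @pytest.mark.parametrize(argnames=["completion_params"], argvalues=[...])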
+
+    Args:
+        test_func: The test function to analyze
+
+    Returns:
+        True if the function has a pytest.mark.parametrize decorator with "completion_params" in argnames,
+        False otherwise
+
+    Note:
+        If the source code cannot be retrieved (e.g., the function was defined in
+        interactive mode) or cannot be parsed as valid Python, this returns False
+        rather than raising OSError or SyntaxError.
+    """
+    try:
+        source = inspect.getsource(test_func)
+    except OSError:
+        # Function source cannot be retrieved (e.g., defined in interactive mode)
+        return False
+
+    try:
+        tree = ast.parse(source)
+    except SyntaxError:
+        # Source code cannot be parsed
+        return False
+
+    # Walk through the AST to find pytest.mark.parametrize decorators
+    for node in ast.walk(tree):
+        if isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
+            # Check decorators on this function
+            for decorator in node.decorator_list:
+                if _is_pytest_parametrize_with_completion_params(decorator):
+                    return True
+
+    return False
+
+
+def _is_pytest_parametrize_with_completion_params(decorator: ast.expr) -> bool:
+    """
+    Check if a decorator is pytest.mark.parametrize with "completion_params" in argnames.
+
+    Args:
+        decorator: AST node representing a decorator
+
+    Returns:
+        True if this is a pytest.mark.parametrize decorator with "completion_params" in argnames
+    """
+    # Look for pytest.mark.parametrize pattern
+    if isinstance(decorator, ast.Call):
+        # Check if it's pytest.mark.parametrize
+        if isinstance(decorator.func, ast.Attribute):
+            if (
+                isinstance(decorator.func.value, ast.Attribute)
+                and isinstance(decorator.func.value.value, ast.Name)
+                and decorator.func.value.value.id == "pytest"
+                and decorator.func.value.attr == "mark"
+                and decorator.func.attr == "parametrize"
+            ):
+                # Check positional arguments first (argnames is typically the first positional arg)
+                if len(decorator.args) > 0:
+                    argnames_arg = decorator.args[0]
+                    if _check_argnames_for_completion_params(argnames_arg):
+                        return True
+
+                # Check keyword arguments for argnames
+                for keyword in decorator.keywords:
+                    if keyword.arg == "argnames":
+                        if _check_argnames_for_completion_params(keyword.value):
+                            return True
+
+    return False
+
+
+def _check_argnames_for_completion_params(argnames_node: ast.expr) -> bool:
+    """
+    Check if an argnames AST node contains "completion_params".
+
+    Args:
+        argnames_node: AST node representing the argnames value
+
+    Returns:
+        True if argnames contains "completion_params"
+    """
+    if isinstance(argnames_node, ast.Constant):
+        # Single string case: argnames="completion_params"
+        if argnames_node.value == "completion_params":
+            return True
+    elif isinstance(argnames_node, ast.List):
+        # List case: argnames=["completion_params", ...]
+        for elt in argnames_node.elts:
+            if isinstance(elt, ast.Constant) and elt.value == "completion_params":
+                return True
+    elif isinstance(argnames_node, ast.Tuple):
+        # Tuple case: argnames=("completion_params", ...)
+        for elt in argnames_node.elts:
+            if isinstance(elt, ast.Constant) and elt.value == "completion_params":
+                return True
+
+    return False
+
+
 class PytestMarkParametrizeKwargs(TypedDict):
     argnames: Sequence[str]
     argvalues: Iterable[ParameterSet | Sequence[object] | object]
     ids: Iterable[str] | None
@@ -96,6 +202,7 @@
 def pytest_parametrize(
     combinations: list[CombinationTuple],
+    test_func: TestFunction | None,
     input_dataset: Sequence[DatasetPathParam] | None,
     completion_params: Sequence[CompletionParams | None] | None,
     completion_params_provided: bool,
     input_messages: Sequence[list[InputMessagesParam] | None] | None,
     input_rows: Sequence[list[EvaluationRow]] | None,
     evaluation_test_kwargs: Sequence[EvaluationInputParam | None] | None,
     id_generator: ParameterIdGenerator | None = None,
 ) -> ParametrizeArgs:
+    if test_func is not None:
+        has_pytest_parametrize = _has_pytest_parametrize_with_completion_params(test_func)
+    else:
+        has_pytest_parametrize = False
+
     # Create parameter tuples for pytest.mark.parametrize
     argnames: list[str] = []
     sig_parameters: list[str] = []
@@ -119,9 +231,10 @@
     if input_dataset is not None:
         argnames.append("dataset_path")
         sig_parameters.append("dataset_path")
     if completion_params is not None:
-        if completion_params_provided:
+        if completion_params_provided and not has_pytest_parametrize:
             argnames.append("completion_params")
-        sig_parameters.append("completion_params")
+        if has_pytest_parametrize or completion_params_provided:
+            sig_parameters.append("completion_params")
     if input_messages is not None:
         argnames.append("input_messages")
         sig_parameters.append("input_messages")

diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py
index 06fe502c..7574e07d 100644
--- a/eval_protocol/quickstart/llm_judge_openai_responses.py
+++ b/eval_protocol/quickstart/llm_judge_openai_responses.py
@@ -52,7 +52,6 @@
             "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         },
     ],
-    ids=DefaultParameterIdGenerator.generate_id_from_dict,
 )
 @evaluation_test(
     input_rows=[input_rows],

diff --git a/tests/pytest/test_parameterized_ids.py b/tests/pytest/test_parameterized_ids.py
index b0f3d215..d3363d0c 100644
--- a/tests/pytest/test_parameterized_ids.py
+++ b/tests/pytest/test_parameterized_ids.py
@@ -149,6 +149,7 @@ def test_pytest_parametrize_with_custom_id_generator():
     # Test with default generator
     result = pytest_parametrize(
         combinations=combinations,
+        test_func=None,
         input_dataset=None,
         completion_params=[{"model": "gpt-4"}, {"model": "claude-3"}, {"temperature": 0.5}],
         completion_params_provided=True,

From d2d5d9555c63a76249bad11ee22cbbef9e8f2ba2 Mon Sep 17 00:00:00 2001
From: Dylan Huang
Date: Thu, 18 Sep 2025 16:03:40 -0700
Subject: [PATCH 5/5] add ids

---
 eval_protocol/quickstart/llm_judge_openai_responses.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py
index 7574e07d..06fe502c 100644
--- a/eval_protocol/quickstart/llm_judge_openai_responses.py
+++ b/eval_protocol/quickstart/llm_judge_openai_responses.py
@@ -52,6 +52,7 @@
             "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         },
     ],
+    ids=DefaultParameterIdGenerator.generate_id_from_dict,
 )
 @evaluation_test(
     input_rows=[input_rows],
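
---
End-state usage, as a minimal sketch (illustrative; it relies only on the exports
added in PATCH 2 and the AST detection added in PATCH 4):

    import pytest
    from eval_protocol import evaluation_test, DefaultParameterIdGenerator
    from eval_protocol.models import EvaluationRow, Message

    @pytest.mark.parametrize(
        "completion_params",
        [{"model": "gpt-4"}, {"temperature": 0.5}],
        ids=DefaultParameterIdGenerator.generate_id_from_dict,
    )
    @evaluation_test(
        input_rows=[[EvaluationRow(messages=[Message(role="user", content="Hi")])]],
    )
    def test_example(row: EvaluationRow) -> EvaluationRow:
        return row

evaluation_test detects the manual decorator, keeps "completion_params" in the
generated wrapper's signature, and emits no competing parametrize mark for it.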