From aa6077cd04401ce7e80b9d0cb62c5b0b93070b48 Mon Sep 17 00:00:00 2001
From: Dylan Huang
Date: Wed, 17 Sep 2025 15:42:29 -0700
Subject: [PATCH 1/5] v2 proposal

---
 development/notes/eval_protocol_v2.md | 277 ++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 development/notes/eval_protocol_v2.md

diff --git a/development/notes/eval_protocol_v2.md b/development/notes/eval_protocol_v2.md
new file mode 100644
index 00000000..a8b8b130
--- /dev/null
+++ b/development/notes/eval_protocol_v2.md
@@ -0,0 +1,277 @@
+This note gives complete examples for both approaches: pointwise (100 separate tests) and batch (the entire dataset passed to a single evaluator).
+
+## Complete Examples
+
+### 1. Pointwise Evaluator (100 separate tests)
+
+```python
+# conftest.py
+import pytest
+from eval_protocol.framework import EvaluationFramework, EvaluationRow
+
+MATH_DATASET = [
+    {"problem": "What is 2+2?", "answer": "4"},
+    {"problem": "What is 3*3?", "answer": "9"},
+    {"problem": "What is 10/2?", "answer": "5"},
+    {"problem": "What is 15-7?", "answer": "8"},
+    {"problem": "What is 6*7?", "answer": "42"},
+    # ... 95 more rows
+]
+
+@pytest.fixture
+def math_dataset():
+    """Raw math dataset fixture"""
+    return MATH_DATASET
+
+@pytest.fixture
+def preprocess_fn():
+    """Preprocessing function for the dataset"""
+    def _preprocess(item):
+        return {
+            "messages": [{"role": "user", "content": item["problem"]}],
+            "expected_answer": item["answer"]
+        }
+    return _preprocess
+
+@pytest.fixture(params=[
+    {"model": "gpt-4", "temperature": 0.7, "max_tokens": 100},
+    {"model": "gpt-3.5-turbo", "temperature": 0.5, "max_tokens": 100},
+    {"model": "claude-3", "temperature": 0.3, "max_tokens": 100}
+])
+def completion_params(request):
+    """Completion parameters - parametrized across different models"""
+    return request.param
+
+# Pointwise fixture - parametrized across BOTH completion params AND dataset rows (async; needs pytest-asyncio)
+@pytest.fixture(params=range(len(MATH_DATASET)))
+async def evaluation_row_pointwise(math_dataset, preprocess_fn, completion_params, request):
+    """Single evaluation row - parametrized across completion params AND dataset rows"""
+    framework = EvaluationFramework()
+
+    # Get the specific row based on parametrization
+    row_index = request.param
+    raw_item = math_dataset[row_index]
+    processed_item = preprocess_fn(raw_item)
+
+    # Run the completion
+    result = await framework.run_completion(processed_item, completion_params)
+
+    return EvaluationRow(
+        input_data=processed_item,
+        completion_params=completion_params,
+        completion_response=result
+    )
+
+# Batch fixture - parametrized across completion params only
+@pytest.fixture
+async def evaluation_rows_batch(math_dataset, preprocess_fn, completion_params):
+    """All evaluation rows - parametrized across completion params only"""
+    framework = EvaluationFramework()
+
+    # Process all rows
+    processed_items = [preprocess_fn(item) for item in math_dataset]
+
+    # Run completions for all rows
+    results = []
+    for item in processed_items:
+        result = await framework.run_completion(item, completion_params)
+        results.append(EvaluationRow(
+            input_data=item,
+            completion_params=completion_params,
+            completion_response=result
+        ))
+
+    return results
+```
+
+```python
+# test_math_evaluation.py
+import pytest
+import re
+from eval_protocol.framework import EvaluationRow
+
+# POINTWISE EVALUATOR - 100 separate tests (one per row per model)
+def test_math_accuracy_pointwise(evaluation_row_pointwise):
+    """Pointwise evaluator - runs once per row per completion param"""
+    response = evaluation_row_pointwise.completion_response
+    expected = evaluation_row_pointwise.input_data["expected_answer"]
+
+    # Extract numeric answer from response
+    numbers = re.findall(r'-?\d+\.?\d*', response)
+    if not numbers:
+        pytest.fail(f"Could not extract number from response: {response}")
+
+    predicted = float(numbers[0])
+    expected_num = float(expected)
+
+    # Assert the answer is correct
+    assert abs(predicted - expected_num) < 0.01, \
+        f"Expected {expected_num}, got {predicted} in response: {response}"
+
+# BATCH EVALUATOR - 3 tests total (one per model)
+def test_math_accuracy_batch(evaluation_rows_batch):
+    """Batch evaluator - runs once per completion param with all rows"""
+    total_correct = 0
+    total_samples = len(evaluation_rows_batch)
+    failed_rows = []
+
+    for i, row in enumerate(evaluation_rows_batch):
+        response = row.completion_response
+        expected = row.input_data["expected_answer"]
+
+        # Extract numeric answer
+        numbers = re.findall(r'-?\d+\.?\d*', response)
+        if not numbers:
+            failed_rows.append({
+                "index": i,
+                "problem": row.input_data["messages"][0]["content"],
+                "expected": expected,
+                "response": response,
+                "error": "Could not extract number"
+            })
+            continue
+
+        predicted = float(numbers[0])
+        expected_num = float(expected)
+
+        if abs(predicted - expected_num) < 0.01:
+            total_correct += 1
+        else:
+            failed_rows.append({
+                "index": i,
+                "problem": row.input_data["messages"][0]["content"],
+                "expected": expected,
+                "predicted": predicted,
+                "response": response,
+                "error": f"Expected {expected_num}, got {predicted}"
+            })
+
+    # Calculate accuracy
+    accuracy = total_correct / total_samples
+
+    # Print detailed results for debugging
+    print(f"\nBatch Evaluation Results:")
+    print(f"Total samples: {total_samples}")
+    print(f"Correct: {total_correct}")
+    print(f"Accuracy: {accuracy:.2f}")
+
+    if failed_rows:
+        print(f"\nFailed rows ({len(failed_rows)}):")
+        for row in failed_rows[:10]:  # Show first 10 failures
+            print(f"  Row {row['index']}: {row['problem']} -> {row.get('predicted', 'N/A')} (expected: {row['expected']})")
+        if len(failed_rows) > 10:
+            print(f"  ... and {len(failed_rows) - 10} more failures")
+
+    # Assertions
+    assert accuracy > 0.8, f"Accuracy {accuracy:.2f} is too low, expected > 0.8"
+    assert total_correct > 0, "No correct answers found"
+
+# Additional batch evaluator with model-specific assertions
+def test_math_accuracy_with_model_info(evaluation_rows_batch):
+    """Batch evaluator with model-specific assertions"""
+    model = evaluation_rows_batch[0].completion_params["model"]
+    temperature = evaluation_rows_batch[0].completion_params["temperature"]
+
+    total_correct = 0
+    for row in evaluation_rows_batch:
+        response = row.completion_response
+        expected = row.input_data["expected_answer"]
+
+        numbers = re.findall(r'-?\d+\.?\d*', response)
+        if numbers:
+            predicted = float(numbers[0])
+            expected_num = float(expected)
+            if abs(predicted - expected_num) < 0.01:
+                total_correct += 1
+
+    accuracy = total_correct / len(evaluation_rows_batch)
+
+    # Model-specific assertions
+    if model == "gpt-4":
+        assert accuracy > 0.9, f"GPT-4 accuracy {accuracy:.2f} is too low"
+    elif model == "gpt-3.5-turbo":
+        assert accuracy > 0.8, f"GPT-3.5 accuracy {accuracy:.2f} is too low"
+    elif model == "claude-3":
+        assert accuracy > 0.85, f"Claude-3 accuracy {accuracy:.2f} is too low"
+
+    print(f"Model: {model}, Temperature: {temperature}, Accuracy: {accuracy:.2f}")
+
+# Optional: Debug function for specific rows
+def test_math_accuracy_debug_specific_rows(evaluation_rows_batch):
+    """Debug function to test specific rows - only runs on first few rows"""
+    # Only test first 5 rows for debugging
+    debug_rows = evaluation_rows_batch[:5]
+
+    for i, row in enumerate(debug_rows):
+        response = row.completion_response
+        expected = row.input_data["expected_answer"]
+
+        numbers = re.findall(r'-?\d+\.?\d*', response)
+        if not numbers:
+            pytest.fail(f"Row {i}: Could not extract number from response: {response}")
+
+        predicted = float(numbers[0])
+        expected_num = float(expected)
+
+        assert abs(predicted - expected_num) < 0.01, \
+            f"Row {i}: Expected {expected_num}, got {predicted} in response: {response}"
+```
+
+### 2. Running the Tests
+
+```bash
+# Run pointwise evaluator (100 rows × 3 models = 300 tests)
+pytest test_math_evaluation.py::test_math_accuracy_pointwise -v
+
+# Run batch evaluator (3 models = 3 tests)
+pytest test_math_evaluation.py::test_math_accuracy_batch -v
+
+# Run all tests (300 pointwise + 3 batch + 3 model-info + 3 debug = 309 tests total)
+pytest test_math_evaluation.py -v
+
+# Run with specific model
+pytest test_math_evaluation.py -k "gpt-4" -v
+
+# Run only batch tests
+pytest test_math_evaluation.py -k "batch" -v
+
+# Run only pointwise tests
+pytest test_math_evaluation.py -k "pointwise" -v
+```
+
+### 3. Expected Output
+
+**Pointwise evaluator output:**
+```
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-0] PASSED
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-1] PASSED
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-2] PASSED
+# ... 97 more tests for completion_params0
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params1-0] PASSED
+# ... 100 tests for completion_params1
+test_math_evaluation.py::test_math_accuracy_pointwise[completion_params2-0] PASSED
+# ... 100 tests for completion_params2
+```
+
+**Batch evaluator output:**
+```
+test_math_evaluation.py::test_math_accuracy_batch[completion_params0] PASSED
+test_math_evaluation.py::test_math_accuracy_batch[completion_params1] PASSED
+test_math_evaluation.py::test_math_accuracy_batch[completion_params2] PASSED
+```
+
+### 4. Key Differences
+
+**Pointwise Evaluator:**
+- **Test count**: 100 rows × 3 models = 300 tests
+- **Benefits**: Easy to debug individual rows, clear failure reporting per row
+- **Use case**: When you want to see exactly which rows fail and why
+- **Pytest output**: Each row gets its own test result
+
+**Batch Evaluator:**
+- **Test count**: 3 models = 3 tests
+- **Benefits**: Faster execution, easier to manage, good for overall accuracy
+- **Use case**: When you care about overall performance across the dataset
+- **Pytest output**: One test result per model with detailed internal reporting
+
+Both approaches give you the flexibility to choose the right evaluation strategy for your use case while maintaining the pytest-native approach!

From 8e406ee8169b3151d57ce489f35b3ccea0cc4f8b Mon Sep 17 00:00:00 2001
From: Dylan Huang
Date: Thu, 18 Sep 2025 14:32:11 -0700
Subject: [PATCH 2/5] allow for manual parametrization using pytest

---
 eval_protocol/__init__.py                      |  2 +
 eval_protocol/pytest/evaluation_test.py        | 13 ++-
 eval_protocol/pytest/parameterize.py           | 78 ++++++++++++----
 .../quickstart/llm_judge_openai_responses.py   | 12 ++-
 tests/pytest/test_parameterized_ids.py         | 93 +++++++++++++------
 5 files changed, 147 insertions(+), 51 deletions(-)

diff --git a/eval_protocol/__init__.py b/eval_protocol/__init__.py
index cd1efd2c..e6c000d2 100644
--- a/eval_protocol/__init__.py
+++ b/eval_protocol/__init__.py
@@ -39,6 +39,7 @@ from .typed_interface import reward_function
 from .quickstart import aha_judge, split_multi_turn_rows
 from .pytest import evaluation_test, SingleTurnRolloutProcessor
+from .pytest.parameterize import DefaultParameterIdGenerator
 from .adapters import OpenAIResponsesAdapter
@@ -61,6 +62,7 @@ warnings.filterwarnings("default", category=DeprecationWarning, module="eval_protocol")
 __all__ = [
+    "DefaultParameterIdGenerator",
     "aha_judge",
     "split_multi_turn_rows",
     "evaluation_test",

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index a7ec65f3..0def715b 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -158,8 +158,14 @@ def evaluation_test(
         exception_handler_config: Configuration for exception handling and backoff retry logic.
             If not provided, a default configuration will be used with common retryable exceptions.
     """
+    # Default to [None] when completion_params is not provided
+    # This allows evaluation-only tests (e.g., using NoOpRolloutProcessor)
+    # to work without requiring model generation parameters
     if completion_params is None:
+        completion_params_provided = False
         completion_params = [None]
+    else:
+        completion_params_provided = True
 
     if rollout_processor is None:
         rollout_processor = NoOpRolloutProcessor()
@@ -201,6 +207,7 @@ def decorator(
         combinations,
         input_dataset,
         completion_params,
+        completion_params_provided,
         input_messages,
         input_rows,
         evaluation_test_kwargs,
@@ -565,12 +572,14 @@ async def execute_run_with_progress(run_idx: int, config):
         return create_dynamically_parameterized_wrapper(
             test_func,
             wrapper_body,
-            pytest_parametrize_args["argnames"],
+            pytest_parametrize_args["sig_parameters"],
         )
 
     # Create the pytest wrapper
     pytest_wrapper = create_wrapper_with_signature()
-    pytest_wrapper = pytest.mark.parametrize(**pytest_parametrize_args)(pytest_wrapper)
+    pytest_wrapper = pytest.mark.parametrize(**pytest_parametrize_args["pytest_parametrize_kwargs"])(
+        pytest_wrapper
+    )
     pytest_wrapper = pytest.mark.asyncio(pytest_wrapper)
 
     # Create the dual mode wrapper

diff --git a/eval_protocol/pytest/parameterize.py b/eval_protocol/pytest/parameterize.py
index cba8f65c..d4089b5d 100644
--- a/eval_protocol/pytest/parameterize.py
+++ b/eval_protocol/pytest/parameterize.py
@@ -9,12 +9,28 @@ from eval_protocol.pytest.types import DatasetPathParam, EvaluationInputParam, InputMessagesParam, TestFunction
 
 
-class PytestParametrizeArgs(TypedDict):
+class PytestMarkParametrizeKwargs(TypedDict):
     argnames: Sequence[str]
     argvalues: Iterable[ParameterSet | Sequence[object] | object]
     ids: Iterable[str] | None
 
 
+class ParametrizeArgs(TypedDict):
+    """
+    This contains all the necessary information to properly hijack the test
+    function's signature and dynamically inject usage of
+    pytest.mark.parametrize. The two will differ when a user manually provides
+    the pytest.mark.parametrize decorator instead of passing completion_params
+    on their own.
+    """
+
+    # for create_dynamically_parameterized_wrapper
+    sig_parameters: Sequence[str]
+
+    # for pytest.mark.parametrize
+    pytest_parametrize_kwargs: PytestMarkParametrizeKwargs
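+
+# Illustrative sketch (comments only, not executed here): the two fields differ
+# when the user supplies parametrization manually, e.g.
+#
+#     @pytest.mark.parametrize("completion_params", [{"model": "gpt-4"}])
+#     @evaluation_test(input_rows=[rows])
+#     def test_fn(row): ...
+#
+# "completion_params" must then appear in sig_parameters (so the generated
+# wrapper's signature accepts it) while pytest_parametrize_kwargs omits it,
+# because pytest already parametrizes that argument.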
""" + # Default to [None] when completion_params is not provided + # This allows evaluation-only tests (e.g., using NoOpRolloutProcessor) + # to work without requiring model generation parameters if completion_params is None: + completion_params_provided = False completion_params = [None] + else: + completion_params_provided = True if rollout_processor is None: rollout_processor = NoOpRolloutProcessor() @@ -201,6 +207,7 @@ def decorator( combinations, input_dataset, completion_params, + completion_params_provided, input_messages, input_rows, evaluation_test_kwargs, @@ -565,12 +572,14 @@ async def execute_run_with_progress(run_idx: int, config): return create_dynamically_parameterized_wrapper( test_func, wrapper_body, - pytest_parametrize_args["argnames"], + pytest_parametrize_args["sig_parameters"], ) # Create the pytest wrapper pytest_wrapper = create_wrapper_with_signature() - pytest_wrapper = pytest.mark.parametrize(**pytest_parametrize_args)(pytest_wrapper) + pytest_wrapper = pytest.mark.parametrize(**pytest_parametrize_args["pytest_parametrize_kwargs"])( + pytest_wrapper + ) pytest_wrapper = pytest.mark.asyncio(pytest_wrapper) # Create the dual mode wrapper diff --git a/eval_protocol/pytest/parameterize.py b/eval_protocol/pytest/parameterize.py index cba8f65c..d4089b5d 100644 --- a/eval_protocol/pytest/parameterize.py +++ b/eval_protocol/pytest/parameterize.py @@ -9,12 +9,28 @@ from eval_protocol.pytest.types import DatasetPathParam, EvaluationInputParam, InputMessagesParam, TestFunction -class PytestParametrizeArgs(TypedDict): +class PytestMarkParametrizeKwargs(TypedDict): argnames: Sequence[str] argvalues: Iterable[ParameterSet | Sequence[object] | object] ids: Iterable[str] | None +class ParametrizeArgs(TypedDict): + """ + This contains all the necessary information to properly hijack the test + function's signature and dynamically inject usage of + pytest.mark.parametrize. The two will differ when a user manually provides + the pytest.mark.parametrize decorator instead of passing completion_params + on their own. + """ + + # for create_dynamically_parameterized_wrapper + sig_parameters: Sequence[str] + + # for pytest.mark.parametrize + pytest_parametrize_kwargs: PytestMarkParametrizeKwargs + + class ParameterIdGenerator(Protocol): """Protocol for generating pytest parameter IDs from parameter combinations.""" @@ -30,7 +46,7 @@ def generate_id(self, combo: CombinationTuple) -> str | None: ... 
-class DefaultParameterIdGenerator: +class DefaultParameterIdGenerator(ParameterIdGenerator): """Default ID generator that creates meaningful IDs from parameter combinations.""" def __init__(self, max_length: int = 200): @@ -46,22 +62,35 @@ def generate_id(self, combo: CombinationTuple) -> str | None: dataset, completion_params, messages, rows, evaluation_test_kwargs = combo if completion_params: - # Get all string, numeric, and boolean values from completion_params, sorted by key - str_values = [] - for key in sorted(completion_params.keys()): - value = completion_params[key] - if isinstance(value, (str, int, float, bool)): - str_values.append(str(value)) + id = self.generate_id_from_dict(completion_params, self.max_length) + if id: + return id + else: + if rows: + return f"rows(len={len(rows)})" + elif messages: + return f"messages(len={len(messages)})" + elif dataset: + return f"dataset(len={len(dataset)})" + return None - if str_values: - id_str = ":".join(str_values) + @staticmethod + def generate_id_from_dict(d: dict[str, object], max_length: int = 200) -> str | None: + # Get all string, numeric, and boolean values from completion_params, sorted by key + str_values = [] + for key in sorted(d.keys()): + value = d[key] + if isinstance(value, (str, int, float, bool)): + str_values.append(str(value)) - # Truncate if too long - if len(id_str) > self.max_length: - id_str = id_str[: self.max_length - 3] + "..." + if str_values: + id_str = ":".join(str_values) - return id_str + # Truncate if too long + if len(id_str) > max_length: + id_str = id_str[: max_length - 3] + "..." + return id_str return None @@ -69,11 +98,12 @@ def pytest_parametrize( combinations: list[CombinationTuple], input_dataset: Sequence[DatasetPathParam] | None, completion_params: Sequence[CompletionParams | None] | None, + completion_params_provided: bool, input_messages: Sequence[list[InputMessagesParam] | None] | None, input_rows: Sequence[list[EvaluationRow]] | None, evaluation_test_kwargs: Sequence[EvaluationInputParam | None] | None, id_generator: ParameterIdGenerator | None = None, -) -> PytestParametrizeArgs: +) -> ParametrizeArgs: """ This function dynamically generates pytest.mark.parametrize arguments for a given set of combinations. 
This is the magic that allows developers to pass in their @@ -84,16 +114,23 @@ def pytest_parametrize( # Create parameter tuples for pytest.mark.parametrize argnames: list[str] = [] + sig_parameters: list[str] = [] if input_dataset is not None: argnames.append("dataset_path") + sig_parameters.append("dataset_path") if completion_params is not None: - argnames.append("completion_params") + if completion_params_provided: + argnames.append("completion_params") + sig_parameters.append("completion_params") if input_messages is not None: argnames.append("input_messages") + sig_parameters.append("input_messages") if input_rows is not None: argnames.append("input_rows") + sig_parameters.append("input_rows") if evaluation_test_kwargs is not None: argnames.append("evaluation_test_kwargs") + sig_parameters.append("evaluation_test_kwargs") # Use default ID generator if none provided if id_generator is None: @@ -109,7 +146,7 @@ def pytest_parametrize( # Build parameter tuple based on what's provided if input_dataset is not None: param_tuple.append(dataset) - if completion_params is not None: + if completion_params_provided: param_tuple.append(cp) if input_messages is not None: param_tuple.append(messages) @@ -132,7 +169,12 @@ def pytest_parametrize( ids.append(combo_id) # Return None for ids if no IDs were generated (let pytest use defaults) - return PytestParametrizeArgs(argnames=argnames, argvalues=argvalues, ids=ids if ids else None) + return ParametrizeArgs( + pytest_parametrize_kwargs=PytestMarkParametrizeKwargs( + argnames=argnames, argvalues=argvalues, ids=ids if ids else None + ), + sig_parameters=sig_parameters, + ) def create_dynamically_parameterized_wrapper( diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py index 5d8cb983..06fe502c 100644 --- a/eval_protocol/quickstart/llm_judge_openai_responses.py +++ b/eval_protocol/quickstart/llm_judge_openai_responses.py @@ -27,6 +27,7 @@ EvaluationRow, SingleTurnRolloutProcessor, OpenAIResponsesAdapter, + DefaultParameterIdGenerator, ) adapter = OpenAIResponsesAdapter() @@ -41,10 +42,9 @@ @pytest.mark.skipif(os.environ.get("CI") == "true", reason="Skip in CI") -@pytest.mark.asyncio -@evaluation_test( - input_rows=[input_rows], - completion_params=[ +@pytest.mark.parametrize( + "completion_params", + [ { "model": "fireworks_ai/accounts/fireworks/models/deepseek-v3p1", }, @@ -52,6 +52,10 @@ "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905", }, ], + ids=DefaultParameterIdGenerator.generate_id_from_dict, +) +@evaluation_test( + input_rows=[input_rows], rollout_processor=SingleTurnRolloutProcessor(), preprocess_fn=split_multi_turn_rows, mode="all", diff --git a/tests/pytest/test_parameterized_ids.py b/tests/pytest/test_parameterized_ids.py index b182bfe5..b0f3d215 100644 --- a/tests/pytest/test_parameterized_ids.py +++ b/tests/pytest/test_parameterized_ids.py @@ -1,12 +1,47 @@ +from collections.abc import Awaitable, Callable + +import pytest from eval_protocol.models import EvaluationRow, Message from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.parameterize import DefaultParameterIdGenerator, pytest_parametrize from eval_protocol.pytest.generate_parameter_combinations import generate_parameter_combinations +from eval_protocol.pytest.types import TestFunction + + +def verify_parametrize_mark(test_function: TestFunction, expected_ids_set: list[object]): + # The function should exist and be callable + assert test_function is not None + 
assert callable(test_function) + + # Test that the decorator was applied (function should have pytest marks) + import pytest + + marks = getattr(test_function, "pytestmark", []) + assert len(marks) > 0, "Function should have pytest marks from evaluation_test decorator" + + # Verify it's a parametrize mark + parametrize_marks = [mark for mark in marks if hasattr(mark, "name") and mark.name == "parametrize"] + assert len(parametrize_marks) > 0, "Should have parametrize mark" + + assert len(parametrize_marks) == len(expected_ids_set), ( + f"Expected {len(expected_ids_set)} parametrize marks, got {len(parametrize_marks)}" + ) + + # Check that the parametrize mark has IDs + for parametrize_mark, expected_ids in zip(parametrize_marks, expected_ids_set): + assert hasattr(parametrize_mark, "kwargs"), "Parametrize mark should have kwargs" + assert "ids" in parametrize_mark.kwargs, "Should have ids in kwargs" + + # Extract the IDs from the parametrize mark + ids = parametrize_mark.kwargs.get("ids") + if not ids: + raise ValueError("No IDs found in parametrize mark") + # Should have IDs for all parameters that have string/numeric values + assert ids == expected_ids, f"Expected {expected_ids}, got {ids}" def test_parameterized_ids(): """Test that evaluation_test generates proper parameter IDs.""" - collected_ids = [] @evaluation_test( input_messages=[[[Message(role="user", content="Hello, how are you?")]]], @@ -17,35 +52,38 @@ def test_parameterized_ids(): ], ) def test_parameterized_ids(row: EvaluationRow) -> EvaluationRow: - # Collect the row to verify it was processed - collected_ids.append(row.input_metadata.row_id) return row - # The function should exist and be callable - assert test_parameterized_ids is not None - assert callable(test_parameterized_ids) - - # Test that the decorator was applied (function should have pytest marks) - import pytest + verify_parametrize_mark( + test_parameterized_ids, [["fireworks_ai/accounts/fireworks/models/gpt-oss-120b", "gpt-4", "0.5"]] + ) - marks = getattr(test_parameterized_ids, "pytestmark", []) - assert len(marks) > 0, "Function should have pytest marks from evaluation_test decorator" - # Verify it's a parametrize mark - parametrize_marks = [mark for mark in marks if hasattr(mark, "name") and mark.name == "parametrize"] - assert len(parametrize_marks) > 0, "Should have parametrize mark" +def test_parametrized_ids_with_manual_decorator_and_input_rows(): + """Test that evaluation_test generates proper parameter IDs.""" - # Check that the parametrize mark has IDs - parametrize_mark = parametrize_marks[0] - assert hasattr(parametrize_mark, "kwargs"), "Parametrize mark should have kwargs" - assert "ids" in parametrize_mark.kwargs, "Should have ids in kwargs" + @pytest.mark.parametrize( + "completion_params", + [ + {"model": "fireworks_ai/accounts/fireworks/models/gpt-oss-120b"}, + {"model": "gpt-4"}, + {"temperature": 0.5}, + ], + ids=DefaultParameterIdGenerator.generate_id_from_dict, + ) + @evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="Hello, how are you?")])]], + ) + def test_parameterized_ids(row: EvaluationRow) -> EvaluationRow: + return row - # Extract the IDs from the parametrize mark - ids = parametrize_mark.kwargs.get("ids") - if ids is not None: - # Should have IDs for all parameters that have string/numeric values - expected_ids = ["fireworks_ai/accounts/fireworks/models/gpt-oss-120b", "gpt-4", "0.5"] - assert list(ids) == expected_ids, f"Expected {expected_ids}, got {list(ids)}" + verify_parametrize_mark( + 
test_parameterized_ids, + [ + ["rows(len=1)"], + DefaultParameterIdGenerator.generate_id_from_dict, + ], + ) def test_default_id_generator(): @@ -113,14 +151,15 @@ def test_pytest_parametrize_with_custom_id_generator(): combinations=combinations, input_dataset=None, completion_params=[{"model": "gpt-4"}, {"model": "claude-3"}, {"temperature": 0.5}], + completion_params_provided=True, input_messages=None, input_rows=None, evaluation_test_kwargs=None, ) - assert result["argnames"] == ["completion_params"] - assert len(list(result["argvalues"])) == 3 - assert result["ids"] == ["gpt-4", "claude-3", "0.5"] # All have string/numeric values + assert result["pytest_parametrize_kwargs"]["argnames"] == ["completion_params"] + assert len(list(result["pytest_parametrize_kwargs"]["argvalues"])) == 3 + assert result["pytest_parametrize_kwargs"]["ids"] == ["gpt-4", "claude-3", "0.5"] # All have string/numeric values def test_id_generator_max_length(): From ab6d761df83caf89439d3110c9edb65a55b1fc9c Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 18 Sep 2025 14:33:04 -0700 Subject: [PATCH 3/5] delete proposal --- development/notes/eval_protocol_v2.md | 277 -------------------------- 1 file changed, 277 deletions(-) delete mode 100644 development/notes/eval_protocol_v2.md diff --git a/development/notes/eval_protocol_v2.md b/development/notes/eval_protocol_v2.md deleted file mode 100644 index a8b8b130..00000000 --- a/development/notes/eval_protocol_v2.md +++ /dev/null @@ -1,277 +0,0 @@ -Perfect! Let me give you complete examples for both approaches - pointwise (100 separate tests) and batch (entire dataset passed to evaluator). - -## Complete Examples - -### 1. Pointwise Evaluator (100 separate tests) - -```python -# conftest.py -import pytest -from eval_protocol.framework import EvaluationFramework, EvaluationRow - -MATH_DATASET = [ - {"problem": "What is 2+2?", "answer": "4"}, - {"problem": "What is 3*3?", "answer": "9"}, - {"problem": "What is 10/2?", "answer": "5"}, - {"problem": "What is 15-7?", "answer": "8"}, - {"problem": "What is 6*7?", "answer": "42"}, - # ... 
95 more rows -] - -@pytest.fixture -def math_dataset(): - """Raw math dataset fixture""" - return MATH_DATASET - -@pytest.fixture -def preprocess_fn(): - """Preprocessing function for the dataset""" - def _preprocess(item): - return { - "messages": [{"role": "user", "content": item["problem"]}], - "expected_answer": item["answer"] - } - return _preprocess - -@pytest.fixture(params=[ - {"model": "gpt-4", "temperature": 0.7, "max_tokens": 100}, - {"model": "gpt-3.5-turbo", "temperature": 0.5, "max_tokens": 100}, - {"model": "claude-3", "temperature": 0.3, "max_tokens": 100} -]) -def completion_params(request): - """Completion parameters - parametrized across different models""" - return request.param - -# Pointwise fixture - parametrized across BOTH completion params AND dataset rows -@pytest.fixture(params=range(len(MATH_DATASET))) -def evaluation_row_pointwise(math_dataset, preprocess_fn, completion_params, request): - """Single evaluation row - parametrized across completion params AND dataset rows""" - framework = EvaluationFramework() - - # Get the specific row based on parametrization - row_index = request.param - raw_item = math_dataset[row_index] - processed_item = preprocess_fn(raw_item) - - # Run the completion - result = await framework.run_completion(processed_item, completion_params) - - return EvaluationRow( - input_data=processed_item, - completion_params=completion_params, - completion_response=result - ) - -# Batch fixture - parametrized across completion params only -@pytest.fixture -async def evaluation_rows_batch(math_dataset, preprocess_fn, completion_params): - """All evaluation rows - parametrized across completion params only""" - framework = EvaluationFramework() - - # Process all rows - processed_items = [preprocess_fn(item) for item in math_dataset] - - # Run completions for all rows - results = [] - for item in processed_items: - result = await framework.run_completion(item, completion_params) - results.append(EvaluationRow( - input_data=item, - completion_params=completion_params, - completion_response=result - )) - - return results -``` - -```python -# test_math_evaluation.py -import pytest -import re -from eval_protocol.framework import EvaluationRow - -# POINTWISE EVALUATOR - 100 separate tests (one per row per model) -def test_math_accuracy_pointwise(evaluation_row_pointwise): - """Pointwise evaluator - runs once per row per completion param""" - response = evaluation_row_pointwise.completion_response - expected = evaluation_row_pointwise.input_data["expected_answer"] - - # Extract numeric answer from response - numbers = re.findall(r'-?\d+\.?\d*', response) - if not numbers: - pytest.fail(f"Could not extract number from response: {response}") - - predicted = float(numbers[0]) - expected_num = float(expected) - - # Assert the answer is correct - assert abs(predicted - expected_num) < 0.01, \ - f"Expected {expected_num}, got {predicted} in response: {response}" - -# BATCH EVALUATOR - 3 tests total (one per model) -def test_math_accuracy_batch(evaluation_rows_batch): - """Batch evaluator - runs once per completion param with all rows""" - total_correct = 0 - total_samples = len(evaluation_rows_batch) - failed_rows = [] - - for i, row in enumerate(evaluation_rows_batch): - response = row.completion_response - expected = row.input_data["expected_answer"] - - # Extract numeric answer - numbers = re.findall(r'-?\d+\.?\d*', response) - if not numbers: - failed_rows.append({ - "index": i, - "problem": row.input_data["messages"][0]["content"], - "expected": expected, 
- "response": response, - "error": "Could not extract number" - }) - continue - - predicted = float(numbers[0]) - expected_num = float(expected) - - if abs(predicted - expected_num) < 0.01: - total_correct += 1 - else: - failed_rows.append({ - "index": i, - "problem": row.input_data["messages"][0]["content"], - "expected": expected, - "predicted": predicted, - "response": response, - "error": f"Expected {expected_num}, got {predicted}" - }) - - # Calculate accuracy - accuracy = total_correct / total_samples - - # Print detailed results for debugging - print(f"\nBatch Evaluation Results:") - print(f"Total samples: {total_samples}") - print(f"Correct: {total_correct}") - print(f"Accuracy: {accuracy:.2f}") - - if failed_rows: - print(f"\nFailed rows ({len(failed_rows)}):") - for row in failed_rows[:10]: # Show first 10 failures - print(f" Row {row['index']}: {row['problem']} -> {row.get('predicted', 'N/A')} (expected: {row['expected']})") - if len(failed_rows) > 10: - print(f" ... and {len(failed_rows) - 10} more failures") - - # Assertions - assert accuracy > 0.8, f"Accuracy {accuracy:.2f} is too low, expected > 0.8" - assert total_correct > 0, "No correct answers found" - -# Additional batch evaluator with model-specific assertions -def test_math_accuracy_with_model_info(evaluation_rows_batch): - """Batch evaluator with model-specific assertions""" - model = evaluation_rows_batch[0].completion_params["model"] - temperature = evaluation_rows_batch[0].completion_params["temperature"] - - total_correct = 0 - for row in evaluation_rows_batch: - response = row.completion_response - expected = row.input_data["expected_answer"] - - numbers = re.findall(r'-?\d+\.?\d*', response) - if numbers: - predicted = float(numbers[0]) - expected_num = float(expected) - if abs(predicted - expected_num) < 0.01: - total_correct += 1 - - accuracy = total_correct / len(evaluation_rows_batch) - - # Model-specific assertions - if model == "gpt-4": - assert accuracy > 0.9, f"GPT-4 accuracy {accuracy:.2f} is too low" - elif model == "gpt-3.5-turbo": - assert accuracy > 0.8, f"GPT-3.5 accuracy {accuracy:.2f} is too low" - elif model == "claude-3": - assert accuracy > 0.85, f"Claude-3 accuracy {accuracy:.2f} is too low" - - print(f"Model: {model}, Temperature: {temperature}, Accuracy: {accuracy:.2f}") - -# Optional: Debug function for specific rows -def test_math_accuracy_debug_specific_rows(evaluation_rows_batch): - """Debug function to test specific rows - only runs on first few rows""" - # Only test first 5 rows for debugging - debug_rows = evaluation_rows_batch[:5] - - for i, row in enumerate(debug_rows): - response = row.completion_response - expected = row.input_data["expected_answer"] - - numbers = re.findall(r'-?\d+\.?\d*', response) - if not numbers: - pytest.fail(f"Row {i}: Could not extract number from response: {response}") - - predicted = float(numbers[0]) - expected_num = float(expected) - - assert abs(predicted - expected_num) < 0.01, \ - f"Row {i}: Expected {expected_num}, got {predicted} in response: {response}" -``` - -### 2. 
Running the Tests - -```bash -# Run pointwise evaluator (100 rows × 3 models = 300 tests) -pytest test_math_evaluation.py::test_math_accuracy_pointwise -v - -# Run batch evaluator (3 models = 3 tests) -pytest test_math_evaluation.py::test_math_accuracy_batch -v - -# Run all tests (300 + 3 = 303 tests total) -pytest test_math_evaluation.py -v - -# Run with specific model -pytest test_math_evaluation.py -k "gpt-4" -v - -# Run only batch tests -pytest test_math_evaluation.py -k "batch" -v - -# Run only pointwise tests -pytest test_math_evaluation.py -k "pointwise" -v -``` - -### 3. Expected Output - -**Pointwise evaluator output:** -``` -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-0] PASSED -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-1] PASSED -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params0-2] PASSED -# ... 97 more tests for completion_params0 -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params1-0] PASSED -# ... 100 tests for completion_params1 -test_math_evaluation.py::test_math_accuracy_pointwise[completion_params2-0] PASSED -# ... 100 tests for completion_params2 -``` - -**Batch evaluator output:** -``` -test_math_evaluation.py::test_math_accuracy_batch[completion_params0] PASSED -test_math_evaluation.py::test_math_accuracy_batch[completion_params1] PASSED -test_math_evaluation.py::test_math_accuracy_batch[completion_params2] PASSED -``` - -### 4. Key Differences - -**Pointwise Evaluator:** -- **Test count**: 100 rows × 3 models = 300 tests -- **Benefits**: Easy to debug individual rows, clear failure reporting per row -- **Use case**: When you want to see exactly which rows fail and why -- **Pytest output**: Each row gets its own test result - -**Batch Evaluator:** -- **Test count**: 3 models = 3 tests -- **Benefits**: Faster execution, easier to manage, good for overall accuracy -- **Use case**: When you care about overall performance across the dataset -- **Pytest output**: One test result per model with detailed internal reporting - -Both approaches give you the flexibility to choose the right evaluation strategy for your use case while maintaining the pytest-native approach! 
From 94ae1b3ddef8956a4a9a6395edd359f288913134 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Thu, 18 Sep 2025 16:01:55 -0700 Subject: [PATCH 4/5] test_import_logs works --- eval_protocol/pytest/evaluation_test.py | 3 +- .../pytest/generate_parameter_combinations.py | 2 +- eval_protocol/pytest/parameterize.py | 117 +++++++++++++++++- .../quickstart/llm_judge_openai_responses.py | 1 - tests/pytest/test_parameterized_ids.py | 1 + 5 files changed, 119 insertions(+), 5 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 0def715b..e51d008b 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -205,6 +205,7 @@ def decorator( # Create parameter tuples for pytest.mark.parametrize pytest_parametrize_args = pytest_parametrize( combinations, + test_func, input_dataset, completion_params, completion_params_provided, @@ -268,7 +269,7 @@ def _log_eval_error(status: Status, rows: list[EvaluationRow] | None, passed: bo index = abs(index) % (max_index + 1) row.input_metadata.row_id = generate_id(seed=0, index=index) - completion_params = kwargs["completion_params"] + completion_params = kwargs["completion_params"] if "completion_params" in kwargs else None # Create eval metadata with test function info and current commit hash eval_metadata = EvalMetadata( name=test_func.__name__, diff --git a/eval_protocol/pytest/generate_parameter_combinations.py b/eval_protocol/pytest/generate_parameter_combinations.py index 6a1dcf2f..99c37b74 100644 --- a/eval_protocol/pytest/generate_parameter_combinations.py +++ b/eval_protocol/pytest/generate_parameter_combinations.py @@ -31,7 +31,7 @@ ] -class ParameterizedTestKwargs(TypedDict): +class ParameterizedTestKwargs(TypedDict, total=False): """ These are the type of parameters that can be passed to the generated pytest function. Every experiment is a unique combination of these parameters. diff --git a/eval_protocol/pytest/parameterize.py b/eval_protocol/pytest/parameterize.py index d4089b5d..a2140da5 100644 --- a/eval_protocol/pytest/parameterize.py +++ b/eval_protocol/pytest/parameterize.py @@ -1,3 +1,4 @@ +import ast import inspect from typing import TypedDict, Protocol from collections.abc import Callable, Sequence, Iterable, Awaitable @@ -9,6 +10,111 @@ from eval_protocol.pytest.types import DatasetPathParam, EvaluationInputParam, InputMessagesParam, TestFunction +def _has_pytest_parametrize_with_completion_params(test_func: TestFunction) -> bool: + """ + Check if a test function has a pytest.mark.parametrize decorator with argnames="completion_params". + + This function uses inspect.getsource and ast to parse the function's source code and look for + pytest.mark.parametrize decorators that include "completion_params" in their argnames. 
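+
+    For example (illustrative only), decorators of these shapes are detected:
+
+        @pytest.mark.parametrize("completion_params", [...])
+        @pytest.mark.parametrize(argnames=["completion_params"], argvalues=[...])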
+
+    Args:
+        test_func: The test function to analyze
+
+    Returns:
+        True if the function has a pytest.mark.parametrize decorator with "completion_params" in argnames,
+        False otherwise
+
+    Note:
+        If the source code cannot be retrieved (e.g., the function was defined in
+        interactive mode) or cannot be parsed as valid Python, this returns False
+        rather than raising OSError or SyntaxError.
+    """
+    try:
+        source = inspect.getsource(test_func)
+    except OSError:
+        # Function source cannot be retrieved (e.g., defined in interactive mode)
+        return False
+
+    try:
+        tree = ast.parse(source)
+    except SyntaxError:
+        # Source code cannot be parsed
+        return False
+
+    # Walk through the AST to find pytest.mark.parametrize decorators
+    for node in ast.walk(tree):
+        if isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef):
+            # Check decorators on this function
+            for decorator in node.decorator_list:
+                if _is_pytest_parametrize_with_completion_params(decorator):
+                    return True
+
+    return False
+
+
+def _is_pytest_parametrize_with_completion_params(decorator: ast.expr) -> bool:
+    """
+    Check if a decorator is pytest.mark.parametrize with "completion_params" in argnames.
+
+    Args:
+        decorator: AST node representing a decorator
+
+    Returns:
+        True if this is a pytest.mark.parametrize decorator with "completion_params" in argnames
+    """
+    # Look for pytest.mark.parametrize pattern
+    if isinstance(decorator, ast.Call):
+        # Check if it's pytest.mark.parametrize
+        if isinstance(decorator.func, ast.Attribute):
+            if (
+                isinstance(decorator.func.value, ast.Attribute)
+                and isinstance(decorator.func.value.value, ast.Name)
+                and decorator.func.value.value.id == "pytest"
+                and decorator.func.value.attr == "mark"
+                and decorator.func.attr == "parametrize"
+            ):
+                # Check positional arguments first (argnames is typically the first positional arg)
+                if len(decorator.args) > 0:
+                    argnames_arg = decorator.args[0]
+                    if _check_argnames_for_completion_params(argnames_arg):
+                        return True
+
+                # Check keyword arguments for argnames
+                for keyword in decorator.keywords:
+                    if keyword.arg == "argnames":
+                        if _check_argnames_for_completion_params(keyword.value):
+                            return True
+
+    return False
+
+
+def _check_argnames_for_completion_params(argnames_node: ast.expr) -> bool:
+    """
+    Check if an argnames AST node contains "completion_params".
+
+    Args:
+        argnames_node: AST node representing the argnames value
+
+    Returns:
+        True if argnames contains "completion_params"
+    """
+    if isinstance(argnames_node, ast.Constant):
+        # Single string case: argnames="completion_params"
+        if argnames_node.value == "completion_params":
+            return True
+    elif isinstance(argnames_node, ast.List):
+        # List case: argnames=["completion_params", ...]
+        for elt in argnames_node.elts:
+            if isinstance(elt, ast.Constant) and elt.value == "completion_params":
+                return True
+    elif isinstance(argnames_node, ast.Tuple):
+        # Tuple case: argnames=("completion_params", ...)
+        for elt in argnames_node.elts:
+            if isinstance(elt, ast.Constant) and elt.value == "completion_params":
+                return True
+
+    return False
+
+
 class PytestMarkParametrizeKwargs(TypedDict):
     argnames: Sequence[str]
     argvalues: Iterable[ParameterSet | Sequence[object] | object]
     ids: Iterable[str] | None
@@ -96,6 +202,7 @@
 def pytest_parametrize(
     combinations: list[CombinationTuple],
+    test_func: TestFunction | None,
     input_dataset: Sequence[DatasetPathParam] | None,
     completion_params: Sequence[CompletionParams | None] | None,
     completion_params_provided: bool,
     input_messages: Sequence[list[InputMessagesParam] | None] | None,
     input_rows: Sequence[list[EvaluationRow]] | None,
     evaluation_test_kwargs: Sequence[EvaluationInputParam | None] | None,
     id_generator: ParameterIdGenerator | None = None,
 ) -> ParametrizeArgs:
+    if test_func is not None:
+        has_pytest_parametrize = _has_pytest_parametrize_with_completion_params(test_func)
+    else:
+        has_pytest_parametrize = False
+
     # Create parameter tuples for pytest.mark.parametrize
     argnames: list[str] = []
     sig_parameters: list[str] = []
@@ -119,9 +231,10 @@
     if input_dataset is not None:
         argnames.append("dataset_path")
         sig_parameters.append("dataset_path")
     if completion_params is not None:
-        if completion_params_provided:
+        if completion_params_provided and not has_pytest_parametrize:
             argnames.append("completion_params")
-        sig_parameters.append("completion_params")
+        if has_pytest_parametrize or completion_params_provided:
+            sig_parameters.append("completion_params")
     if input_messages is not None:
         argnames.append("input_messages")
         sig_parameters.append("input_messages")

diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py
index 06fe502c..7574e07d 100644
--- a/eval_protocol/quickstart/llm_judge_openai_responses.py
+++ b/eval_protocol/quickstart/llm_judge_openai_responses.py
@@ -52,7 +52,6 @@
             "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         },
     ],
-    ids=DefaultParameterIdGenerator.generate_id_from_dict,
 )
 @evaluation_test(
     input_rows=[input_rows],

diff --git a/tests/pytest/test_parameterized_ids.py b/tests/pytest/test_parameterized_ids.py
index b0f3d215..d3363d0c 100644
--- a/tests/pytest/test_parameterized_ids.py
+++ b/tests/pytest/test_parameterized_ids.py
@@ -149,6 +149,7 @@ def test_pytest_parametrize_with_custom_id_generator():
     # Test with default generator
     result = pytest_parametrize(
         combinations=combinations,
+        test_func=None,
         input_dataset=None,
         completion_params=[{"model": "gpt-4"}, {"model": "claude-3"}, {"temperature": 0.5}],
         completion_params_provided=True,

From d2d5d9555c63a76249bad11ee22cbbef9e8f2ba2 Mon Sep 17 00:00:00 2001
From: Dylan Huang
Date: Thu, 18 Sep 2025 16:03:40 -0700
Subject: [PATCH 5/5] add ids

---
 eval_protocol/quickstart/llm_judge_openai_responses.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/eval_protocol/quickstart/llm_judge_openai_responses.py b/eval_protocol/quickstart/llm_judge_openai_responses.py
index 7574e07d..06fe502c 100644
--- a/eval_protocol/quickstart/llm_judge_openai_responses.py
+++ b/eval_protocol/quickstart/llm_judge_openai_responses.py
@@ -52,6 +52,7 @@
             "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct-0905",
         },
     ],
+    ids=DefaultParameterIdGenerator.generate_id_from_dict,
 )
 @evaluation_test(
     input_rows=[input_rows],
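
---
End-state usage, as a minimal sketch (illustrative; it relies only on the exports
added in PATCH 2 and the AST detection added in PATCH 4):

    import pytest
    from eval_protocol import evaluation_test, DefaultParameterIdGenerator
    from eval_protocol.models import EvaluationRow, Message

    @pytest.mark.parametrize(
        "completion_params",
        [{"model": "gpt-4"}, {"temperature": 0.5}],
        ids=DefaultParameterIdGenerator.generate_id_from_dict,
    )
    @evaluation_test(
        input_rows=[[EvaluationRow(messages=[Message(role="user", content="Hi")])]],
    )
    def test_example(row: EvaluationRow) -> EvaluationRow:
        return row

evaluation_test detects the manual decorator, keeps "completion_params" in the
generated wrapper's signature, and emits no competing parametrize mark for it.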