From c6db5898e7a90d2e5e8c070fbe82ec2bfa00d5f1 Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Thu, 25 Sep 2025 23:18:25 -0700 Subject: [PATCH 1/5] support custom invocation id --- eval_protocol/pytest/evaluation_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 4aabd296..535022b1 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -189,7 +189,7 @@ def evaluation_test( completion_params = parse_ep_completion_params(completion_params) original_completion_params = completion_params passed_threshold = parse_ep_passed_threshold(passed_threshold) - + custom_invocation_id = os.environ.get("EP_INVOCATION_ID", None) def decorator( test_func: TestFunction, ) -> TestFunction: @@ -228,7 +228,10 @@ def decorator( # Create wrapper function with exact signature that pytest expects def create_wrapper_with_signature() -> Callable[[], None]: # Create the function body that will be used - invocation_id = generate_id() + if custom_invocation_id: + invocation_id = custom_invocation_id + else: + invocation_id = generate_id() async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None: # Store URL for viewing results (after all postprocessing is complete) From 914205b7d451285af722ddae3a323b3f5dce6f2a Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Thu, 25 Sep 2025 23:20:28 -0700 Subject: [PATCH 2/5] format --- eval_protocol/pytest/evaluation_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 535022b1..4625114a 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -190,6 +190,7 @@ def evaluation_test( original_completion_params = completion_params passed_threshold = parse_ep_passed_threshold(passed_threshold) custom_invocation_id = os.environ.get("EP_INVOCATION_ID", None) + def decorator( test_func: TestFunction, ) -> TestFunction: From c1c97f88833411fb164b4e2943b32eabc3912cee Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Fri, 26 Sep 2025 11:12:16 -0700 Subject: [PATCH 3/5] add test --- tests/pytest/test_pytest_env_overwrite.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 tests/pytest/test_pytest_env_overwrite.py diff --git a/tests/pytest/test_pytest_env_overwrite.py b/tests/pytest/test_pytest_env_overwrite.py new file mode 100644 index 00000000..dcf1409d --- /dev/null +++ b/tests/pytest/test_pytest_env_overwrite.py @@ -0,0 +1,18 @@ +from eval_protocol.models import EvaluationRow, Message +from eval_protocol.pytest import evaluation_test +from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor +import os + +os.environ["EP_INVOCATION_ID"] = "test-invocation-123" + +@evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="What is the capital of France?")])]], + completion_params=[{"model": "no-op"}], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", +) +def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow: + """Run math evaluation on sample dataset using pytest interface.""" + assert row.messages[0].content == "What is the capital of France?" + assert row.execution_metadata.invocation_id == "test-invocation-123" + return row From 08c9e485c7de62caaca8ec54a1866efa7bde5cfb Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Fri, 26 Sep 2025 11:16:41 -0700 Subject: [PATCH 4/5] format --- tests/pytest/test_pytest_env_overwrite.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pytest/test_pytest_env_overwrite.py b/tests/pytest/test_pytest_env_overwrite.py index dcf1409d..8b3e28aa 100644 --- a/tests/pytest/test_pytest_env_overwrite.py +++ b/tests/pytest/test_pytest_env_overwrite.py @@ -5,6 +5,7 @@ os.environ["EP_INVOCATION_ID"] = "test-invocation-123" + @evaluation_test( input_rows=[[EvaluationRow(messages=[Message(role="user", content="What is the capital of France?")])]], completion_params=[{"model": "no-op"}], From d435b6ec1d7ccc9bde3260b60219d5c1f98839ce Mon Sep 17 00:00:00 2001 From: Yinghan Ma Date: Fri, 26 Sep 2025 11:43:20 -0700 Subject: [PATCH 5/5] format --- tests/pytest/test_pytest_env_overwrite.py | 25 ++++++++++++----------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/pytest/test_pytest_env_overwrite.py b/tests/pytest/test_pytest_env_overwrite.py index 8b3e28aa..c88dd2b8 100644 --- a/tests/pytest/test_pytest_env_overwrite.py +++ b/tests/pytest/test_pytest_env_overwrite.py @@ -2,18 +2,19 @@ from eval_protocol.pytest import evaluation_test from eval_protocol.pytest.default_no_op_rollout_processor import NoOpRolloutProcessor import os +from unittest import mock -os.environ["EP_INVOCATION_ID"] = "test-invocation-123" +with mock.patch.dict(os.environ, {"EP_INVOCATION_ID": "test-invocation-123"}): -@evaluation_test( - input_rows=[[EvaluationRow(messages=[Message(role="user", content="What is the capital of France?")])]], - completion_params=[{"model": "no-op"}], - rollout_processor=NoOpRolloutProcessor(), - mode="pointwise", -) -def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow: - """Run math evaluation on sample dataset using pytest interface.""" - assert row.messages[0].content == "What is the capital of France?" - assert row.execution_metadata.invocation_id == "test-invocation-123" - return row + @evaluation_test( + input_rows=[[EvaluationRow(messages=[Message(role="user", content="What is the capital of France?")])]], + completion_params=[{"model": "no-op"}], + rollout_processor=NoOpRolloutProcessor(), + mode="pointwise", + ) + def test_input_messages_in_decorator(row: EvaluationRow) -> EvaluationRow: + """Run math evaluation on sample dataset using pytest interface.""" + assert row.messages[0].content == "What is the capital of France?" + assert row.execution_metadata.invocation_id == "test-invocation-123" + return row