From 5b0cda41f343650b104fcc64a2a38ac1c5065fd6 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Mon, 22 Sep 2025 09:15:23 -0700
Subject: [PATCH 1/3] fix printing of local ui URL to be at end of pytest
 session

---
 eval_protocol/pytest/evaluation_test.py       |   8 +-
 eval_protocol/pytest/plugin.py                |  54 ++++++-
 eval_protocol/pytest/store_results_url.py     |  46 ++++++
 eval_protocol/utils/show_results_url.py       |  28 ++--
 .../chinook/pydantic/test_pydantic_chinook.py |  15 +-
 tests/test_show_results_url.py                | 139 ++++++------------
 6 files changed, 165 insertions(+), 125 deletions(-)
 create mode 100644 eval_protocol/pytest/store_results_url.py

diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
index 2e1254d6..333214cd 100644
--- a/eval_protocol/pytest/evaluation_test.py
+++ b/eval_protocol/pytest/evaluation_test.py
@@ -59,7 +59,7 @@
     parse_ep_passed_threshold,
     rollout_processor_with_retry,
 )
-from eval_protocol.utils.show_results_url import show_results_url
+from eval_protocol.utils.show_results_url import store_local_ui_results_url
 
 from ..common_utils import load_jsonl
 
@@ -220,6 +220,9 @@ def create_wrapper_with_signature() -> Callable[[], None]:
             # Create the function body that will be used
             invocation_id = generate_id()
 
+            # Store URL for viewing results (after all postprocessing is complete)
+            store_local_ui_results_url(invocation_id)
+
             async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None:
                 eval_metadata = None
 
@@ -556,9 +559,6 @@ async def execute_run_with_progress(run_idx: int, config):
                             experiment_duration_seconds,
                         )
 
-                    # Show URL for viewing results (after all postprocessing is complete)
-                    show_results_url(invocation_id)
-
                 except AssertionError:
                     _log_eval_error(
                         Status.eval_finished(),
diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
index beb95bfc..ad48ad38 100644
--- a/eval_protocol/pytest/plugin.py
+++ b/eval_protocol/pytest/plugin.py
@@ -279,7 +279,7 @@ def pytest_configure(config) -> None:
         pass
 
 
-def pytest_sessionfinish(session, exitstatus):
+def _print_experiment_links(session):
     """Print all collected Fireworks experiment links from pytest stash."""
     try:
         # Late import to avoid circulars; if missing key, skip printing
@@ -291,9 +291,8 @@ def pytest_sessionfinish(session, exitstatus):
         except Exception:
             EXPERIMENT_LINKS_STASH_KEY = None
 
-        # Get links from pytest stash using shared key
+        # Get links from pytest stash
         links = []
-
         if EXPERIMENT_LINKS_STASH_KEY is not None and EXPERIMENT_LINKS_STASH_KEY in session.stash:
             links = session.stash[EXPERIMENT_LINKS_STASH_KEY]
 
@@ -309,6 +308,55 @@ def pytest_sessionfinish(session, exitstatus):
                     print(f"❌ Experiment {link['experiment_id']}: {link['job_link']}", file=sys.__stderr__)
 
             print("=" * 80, file=sys.__stderr__)
+            return True
+        return False
+    except Exception:
+        return False
+
+
+def _print_local_ui_results_urls(session):
+    """Print all collected evaluation results URLs from pytest stash."""
+    try:
+        # Late import to avoid circulars; if missing key, skip printing
+        RESULTS_URLS_STASH_KEY = None
+        try:
+            from .store_results_url import RESULTS_URLS_STASH_KEY as _URL_KEY  # type: ignore
+
+            RESULTS_URLS_STASH_KEY = _URL_KEY
+        except Exception:
+            RESULTS_URLS_STASH_KEY = None
+
+        # Get URLs from pytest stash
+        urls = []
+        if RESULTS_URLS_STASH_KEY is not None and RESULTS_URLS_STASH_KEY in session.stash:
+            urls = session.stash[RESULTS_URLS_STASH_KEY]
+
+        if urls:
+            print("\n" + "=" * 80, file=sys.__stderr__)
+            print("📊 LOCAL UI EVALUATION RESULTS", file=sys.__stderr__)
+            print("=" * 80, file=sys.__stderr__)
+
+            for url_data in urls:
+                print(f"📊 Invocation {url_data['invocation_id']}:", file=sys.__stderr__)
+                print(f"  📊 Aggregate scores: {url_data['pivot_url']}", file=sys.__stderr__)
+                print(f"  📋 Trajectories: {url_data['table_url']}", file=sys.__stderr__)
+
+            print("=" * 80, file=sys.__stderr__)
+            return True
+        return False
+    except Exception:
+        return False
+
+
+def pytest_sessionfinish(session, exitstatus):
+    """Print all collected Fireworks experiment links and evaluation results URLs from pytest stash."""
+    try:
+        # Print experiment links and results URLs separately
+        links_printed = _print_experiment_links(session)
+        urls_printed = _print_local_ui_results_urls(session)
+
+        # Flush stderr if anything was printed
+        if links_printed or urls_printed:
             err_stream = getattr(sys, "__stderr__", None)
             if err_stream is not None:
                 try:
diff --git a/eval_protocol/pytest/store_results_url.py b/eval_protocol/pytest/store_results_url.py
new file mode 100644
index 00000000..ccf48541
--- /dev/null
+++ b/eval_protocol/pytest/store_results_url.py
@@ -0,0 +1,46 @@
+from typing import TypedDict
+from pytest import StashKey
+
+
+class ResultsUrl(TypedDict):
+    invocation_id: str
+    pivot_url: str
+    table_url: str
+
+
+RESULTS_URLS_STASH_KEY = StashKey[list[ResultsUrl]]()
+
+
+def _store_local_ui_url_in_stash(invocation_id: str, pivot_url: str, table_url: str):
+    """Store results URL in pytest session stash."""
+    try:
+        import sys
+
+        # Walk up the call stack to find the pytest session
+        session = None
+        frame = sys._getframe()  # pyright: ignore[reportPrivateUsage]
+        while frame:
+            if "session" in frame.f_locals and hasattr(frame.f_locals["session"], "stash"):  # pyright: ignore[reportAny]
+                session = frame.f_locals["session"]  # pyright: ignore[reportAny]
+                break
+            frame = frame.f_back
+
+        if session is not None:
+            global RESULTS_URLS_STASH_KEY
+
+            if RESULTS_URLS_STASH_KEY not in session.stash:  # pyright: ignore[reportAny]
+                session.stash[RESULTS_URLS_STASH_KEY] = []  # pyright: ignore[reportAny]
+
+            session.stash[RESULTS_URLS_STASH_KEY].append(  # pyright: ignore[reportAny]
+                {"invocation_id": invocation_id, "pivot_url": pivot_url, "table_url": table_url}
+            )
+        else:
+            pass
+
+    except Exception as e:  # pyright: ignore[reportUnusedVariable]
+        pass
+
+
+def store_local_ui_url(invocation_id: str, pivot_url: str, table_url: str):
+    """Public function to store results URL in pytest session stash."""
+    _store_local_ui_url_in_stash(invocation_id, pivot_url, table_url)
diff --git a/eval_protocol/utils/show_results_url.py b/eval_protocol/utils/show_results_url.py
index 409ac858..492fe50c 100644
--- a/eval_protocol/utils/show_results_url.py
+++ b/eval_protocol/utils/show_results_url.py
@@ -5,6 +5,8 @@
 import socket
 import urllib.parse
 
+from eval_protocol.pytest.store_results_url import store_local_ui_url
+
 
 def is_server_running(host: str = "localhost", port: int = 8000) -> bool:
     """
@@ -58,25 +60,15 @@ def generate_invocation_filter_url(invocation_id: str, base_url: str = "http://l
     return f"{base_url}?filterConfig={encoded_filter}"
 
 
-def show_results_url(invocation_id: str) -> None:
+def store_local_ui_results_url(invocation_id: str) -> None:
     """
-    Show URLs for viewing evaluation results filtered by invocation_id.
-
-    If the server is not running, prints a message to run "ep logs" to start the local UI.
-    If the server is running, prints URLs to view results filtered by invocation_id.
+    Store URLs for viewing evaluation results filtered by invocation_id in pytest stash.
 
     Args:
-            invocation_id: The invocation ID to filter results by
+                    invocation_id: The invocation ID to filter results by
     """
-    if is_server_running():
-        pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
-        table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")
-        print("View your evaluation results:")
-        print(f"  📊 Aggregate scores: {pivot_url}")
-        print(f"  📋 Trajectories: {table_url}")
-    else:
-        pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
-        table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")
-        print("Start the local UI with 'ep logs', then visit:")
-        print(f"  📊 Aggregate scores: {pivot_url}")
-        print(f"  📋 Trajectories: {table_url}")
+    pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
+    table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")
+
+    # Store URLs in pytest stash for later printing in pytest_sessionfinish
+    store_local_ui_url(invocation_id, pivot_url, table_url)
diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py
index 9cdac0ee..62c427eb 100644
--- a/tests/chinook/pydantic/test_pydantic_chinook.py
+++ b/tests/chinook/pydantic/test_pydantic_chinook.py
@@ -23,20 +23,25 @@
 
 def agent_factory(config: RolloutProcessorConfig) -> Agent:
     model_name = config.completion_params["model"]
-    provider = config.completion_params["provider"]
+    provider = config.completion_params["provider"] if "provider" in config.completion_params else "openai"
     model = OpenAIChatModel(model_name, provider=provider)
     return setup_agent(model)
 
 
-@pytest.mark.asyncio
-@evaluation_test(
-    input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]],
-    completion_params=[
+@pytest.mark.parametrize(
+    "completion_params",
+    [
         {
             "model": "accounts/fireworks/models/kimi-k2-instruct",
             "provider": "fireworks",
         },
+        {
+            "model": "gpt-5",
+        },
     ],
+)
+@evaluation_test(
+    input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]],
     rollout_processor=PydanticAgentRolloutProcessor(agent_factory),
     mode="pointwise",
 )
diff --git a/tests/test_show_results_url.py b/tests/test_show_results_url.py
index c840c4fc..191e73c1 100644
--- a/tests/test_show_results_url.py
+++ b/tests/test_show_results_url.py
@@ -9,7 +9,7 @@
 from eval_protocol.utils.show_results_url import (
     is_server_running,
     generate_invocation_filter_url,
-    show_results_url,
+    store_evaluation_results_url,
 )
 
 
@@ -123,124 +123,73 @@ def test_pivot_and_table_urls(self):
         assert pivot_url.split("?")[1] == table_url.split("?")[1]
 
 
-class TestShowResultsUrl:
-    """Test the show_results_url function."""
+class TestStoreEvaluationResultsUrl:
+    """Test the store_evaluation_results_url function."""
 
-    @patch("eval_protocol.utils.show_results_url.is_server_running")
-    @patch("builtins.print")
-    def test_server_running_pivot_and_table(self, mock_print, mock_is_running):
-        """Test output when server is running."""
-        mock_is_running.return_value = True
+    @patch("eval_protocol.utils.show_results_url.store_results_url")
+    def test_stores_urls_correctly(self, mock_store):
+        """Test that URLs are stored correctly."""
+        invocation_id = "test-invocation"
 
-        show_results_url("test-invocation")
+        store_evaluation_results_url(invocation_id)
 
-        # Should print both pivot and table URLs
-        assert mock_print.call_count == 3  # Header + 2 URLs
-        calls = [call[0][0] for call in mock_print.call_args_list]
+        # Should call store_results_url once with correct parameters
+        mock_store.assert_called_once()
+        call_args = mock_store.call_args[0]
 
-        assert "View your evaluation results:" in calls[0]
-        assert "📊 Aggregate scores:" in calls[1]
-        assert "📋 Trajectories:" in calls[2]
-        assert "pivot" in calls[1]
-        assert "table" in calls[2]
+        assert call_args[0] == invocation_id  # invocation_id
+        assert "pivot" in call_args[1]  # pivot_url
+        assert "table" in call_args[2]  # table_url
 
-    @patch("eval_protocol.utils.show_results_url.is_server_running")
-    @patch("builtins.print")
-    def test_server_not_running_instructions(self, mock_print, mock_is_running):
-        """Test output when server is not running."""
-        mock_is_running.return_value = False
-
-        show_results_url("test-invocation")
-
-        # Should print instructions and both URLs
-        assert mock_print.call_count == 3  # Instructions + 2 URLs
-        calls = [call[0][0] for call in mock_print.call_args_list]
-
-        assert "Start the local UI with 'ep logs'" in calls[0]
-        assert "📊 Aggregate scores:" in calls[1]
-        assert "📋 Trajectories:" in calls[2]
-        assert "pivot" in calls[1]
-        assert "table" in calls[2]
-
-    @patch("eval_protocol.utils.show_results_url.is_server_running")
-    @patch("builtins.print")
-    def test_invocation_id_in_urls(self, mock_print, mock_is_running):
+    @patch("eval_protocol.utils.show_results_url.store_results_url")
+    def test_invocation_id_in_urls(self, mock_store):
         """Test that invocation_id appears in both URLs."""
-        mock_is_running.return_value = True
         invocation_id = "unique-test-id-123"
 
-        show_results_url(invocation_id)
+        store_evaluation_results_url(invocation_id)
 
-        calls = [call[0][0] for call in mock_print.call_args_list]
-        pivot_url = calls[1]
-        table_url = calls[2]
+        call_args = mock_store.call_args[0]
+        pivot_url = call_args[1]
+        table_url = call_args[2]
 
         assert invocation_id in pivot_url
         assert invocation_id in table_url
 
-    @patch("eval_protocol.utils.show_results_url.is_server_running")
-    @patch("builtins.print")
-    def test_different_invocation_ids(self, mock_print, mock_is_running):
+    @patch("eval_protocol.utils.show_results_url.store_results_url")
+    def test_different_invocation_ids(self, mock_store):
         """Test that different invocation IDs produce different URLs."""
-        mock_is_running.return_value = True
-
         # Test with first invocation ID
-        show_results_url("id-1")
-        calls_1 = [call[0][0] for call in mock_print.call_args_list]
-        mock_print.reset_mock()
+        store_evaluation_results_url("id-1")
+        call_1 = mock_store.call_args[0]
+        mock_store.reset_mock()
 
         # Test with second invocation ID
-        show_results_url("id-2")
-        calls_2 = [call[0][0] for call in mock_print.call_args_list]
+        store_evaluation_results_url("id-2")
+        call_2 = mock_store.call_args[0]
 
         # URLs should be different
-        assert calls_1[1] != calls_2[1]  # Pivot URLs different
-        assert calls_1[2] != calls_2[2]  # Table URLs different
-        assert "id-1" in calls_1[1]
-        assert "id-2" in calls_2[1]
+        assert call_1[1] != call_2[1]  # Pivot URLs different
+        assert call_1[2] != call_2[2]  # Table URLs different
+        assert "id-1" in call_1[1]
+        assert "id-2" in call_2[1]
 
 
 class TestIntegration:
     """Integration tests for the module."""
 
-    @patch("socket.socket")
-    @patch("builtins.print")
-    def test_full_workflow_server_running(self, mock_print, mock_socket):
-        """Test the full workflow when server is running."""
-        # Mock server running
-        mock_socket_instance = MagicMock()
-        mock_socket_instance.connect_ex.return_value = 0
-        mock_socket.return_value.__enter__.return_value = mock_socket_instance
+    @patch("eval_protocol.utils.show_results_url.store_results_url")
+    def test_full_workflow_stores_urls(self, mock_store):
+        """Test the full workflow stores URLs correctly."""
+        invocation_id = "integration-test"
 
-        show_results_url("integration-test")
+        store_evaluation_results_url(invocation_id)
 
-        # Verify socket was checked
-        mock_socket_instance.connect_ex.assert_called_once_with(("localhost", 8000))
-
-        # Verify output
-        assert mock_print.call_count == 3
-        calls = [call[0][0] for call in mock_print.call_args_list]
-        assert "View your evaluation results:" in calls[0]
-        assert "integration-test" in calls[1]
-        assert "integration-test" in calls[2]
-
-    @patch("socket.socket")
-    @patch("builtins.print")
-    def test_full_workflow_server_not_running(self, mock_print, mock_socket):
-        """Test the full workflow when server is not running."""
-        # Mock server not running
-        mock_socket_instance = MagicMock()
-        mock_socket_instance.connect_ex.return_value = 1
-        mock_socket.return_value.__enter__.return_value = mock_socket_instance
-
-        show_results_url("integration-test")
-
-        # Verify socket was checked
-        mock_socket_instance.connect_ex.assert_called_once_with(("localhost", 8000))
+        # Verify store_results_url was called
+        mock_store.assert_called_once()
+        call_args = mock_store.call_args[0]
 
-        # Verify output
-        assert mock_print.call_count == 3
-        calls = [call[0][0] for call in mock_print.call_args_list]
-        assert "Start the local UI with 'ep logs'" in calls[0]
-        assert "integration-test" in calls[1]
-        assert "integration-test" in calls[2]
+        assert call_args[0] == invocation_id
+        assert "pivot" in call_args[1]
+        assert "table" in call_args[2]
+        assert "integration-test" in call_args[1]
+        assert "integration-test" in call_args[2]

From 11c10ec9ed1057e19c4b5f7982dfe4fb359961e9 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Mon, 22 Sep 2025 09:29:36 -0700
Subject: [PATCH 2/3] fix

---
 tests/test_show_results_url.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tests/test_show_results_url.py b/tests/test_show_results_url.py
index 191e73c1..152f942c 100644
--- a/tests/test_show_results_url.py
+++ b/tests/test_show_results_url.py
@@ -9,7 +9,7 @@
 from eval_protocol.utils.show_results_url import (
     is_server_running,
     generate_invocation_filter_url,
-    store_evaluation_results_url,
+    store_local_ui_results_url,
 )
 
 
@@ -123,17 +123,17 @@ def test_pivot_and_table_urls(self):
         assert pivot_url.split("?")[1] == table_url.split("?")[1]
 
 
-class TestStoreEvaluationResultsUrl:
-    """Test the store_evaluation_results_url function."""
+class TestStoreLocalUiResultsUrl:
+    """Test the store_local_ui_results_url function."""
 
-    @patch("eval_protocol.utils.show_results_url.store_results_url")
+    @patch("eval_protocol.utils.show_results_url.store_local_ui_url")
     def test_stores_urls_correctly(self, mock_store):
         """Test that URLs are stored correctly."""
         invocation_id = "test-invocation"
 
-        store_evaluation_results_url(invocation_id)
+        store_local_ui_results_url(invocation_id)
 
-        # Should call store_results_url once with correct parameters
+        # Should call store_local_ui_url once with correct parameters
         mock_store.assert_called_once()
         call_args = mock_store.call_args[0]
 
@@ -141,12 +141,12 @@ def test_stores_urls_correctly(self, mock_store):
         assert "pivot" in call_args[1]  # pivot_url
         assert "table" in call_args[2]  # table_url
 
-    @patch("eval_protocol.utils.show_results_url.store_results_url")
+    @patch("eval_protocol.utils.show_results_url.store_local_ui_url")
     def test_invocation_id_in_urls(self, mock_store):
         """Test that invocation_id appears in both URLs."""
         invocation_id = "unique-test-id-123"
 
-        store_evaluation_results_url(invocation_id)
+        store_local_ui_results_url(invocation_id)
 
         call_args = mock_store.call_args[0]
         pivot_url = call_args[1]
@@ -155,16 +155,16 @@ def test_invocation_id_in_urls(self, mock_store):
         assert invocation_id in pivot_url
         assert invocation_id in table_url
 
-    @patch("eval_protocol.utils.show_results_url.store_results_url")
+    @patch("eval_protocol.utils.show_results_url.store_local_ui_url")
     def test_different_invocation_ids(self, mock_store):
         """Test that different invocation IDs produce different URLs."""
         # Test with first invocation ID
-        store_evaluation_results_url("id-1")
+        store_local_ui_results_url("id-1")
         call_1 = mock_store.call_args[0]
         mock_store.reset_mock()
 
         # Test with second invocation ID
-        store_evaluation_results_url("id-2")
+        store_local_ui_results_url("id-2")
         call_2 = mock_store.call_args[0]
 
         # URLs should be different
@@ -177,14 +177,14 @@ def test_different_invocation_ids(self, mock_store):
 class TestIntegration:
     """Integration tests for the module."""
 
-    @patch("eval_protocol.utils.show_results_url.store_results_url")
+    @patch("eval_protocol.utils.show_results_url.store_local_ui_url")
     def test_full_workflow_stores_urls(self, mock_store):
         """Test the full workflow stores URLs correctly."""
         invocation_id = "integration-test"
 
-        store_evaluation_results_url(invocation_id)
+        store_local_ui_results_url(invocation_id)
 
-        # Verify store_results_url was called
+        # Verify store_local_ui_url was called
         mock_store.assert_called_once()
         call_args = mock_store.call_args[0]
 

From 7230caf49963dab4d9057379348f89106f7cba56 Mon Sep 17 00:00:00 2001
From: Dylan Huang <dhuang@fireworks.ai>
Date: Mon, 22 Sep 2025 09:46:07 -0700
Subject: [PATCH 3/3] fix test_pytest_propagate_error

---
 tests/pytest/test_pytest_propagate_error.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/pytest/test_pytest_propagate_error.py b/tests/pytest/test_pytest_propagate_error.py
index 63ed21ce..c1e97cc7 100644
--- a/tests/pytest/test_pytest_propagate_error.py
+++ b/tests/pytest/test_pytest_propagate_error.py
@@ -72,6 +72,6 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
         assert row.eval_metadata.status.is_error()
 
     # make sure the error message includes details of the error
-    assert all("HTTPStatusError" in row.rollout_status.message for row in rollouts.values())
-    assert all("405 Method Not Allowed" in row.rollout_status.message for row in rollouts.values())
-    assert all("https://docs.fireworks.ai/mcp-non-existent" in row.rollout_status.message for row in rollouts.values())
+    assert any("HTTPStatusError" in row.rollout_status.message for row in rollouts.values())
+    assert any("405 Method Not Allowed" in row.rollout_status.message for row in rollouts.values())
+    assert any("https://docs.fireworks.ai/mcp-non-existent" in row.rollout_status.message for row in rollouts.values())