eval-protocol · dphuang2 · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025 · Sep 22, 2025
diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py
@@ -59,7 +59,7 @@
     parse_ep_passed_threshold,
     rollout_processor_with_retry,
 )
-from eval_protocol.utils.show_results_url import show_results_url
+from eval_protocol.utils.show_results_url import store_local_ui_results_url
 
 from ..common_utils import load_jsonl
 
@@ -220,6 +220,9 @@ def create_wrapper_with_signature() -> Callable[[], None]:
             # Create the function body that will be used
             invocation_id = generate_id()
 
+            # Store URL for viewing results (printed at session end by pytest_sessionfinish)
+            store_local_ui_results_url(invocation_id)
+
             async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None:
                 eval_metadata = None
 
@@ -556,9 +559,6 @@ async def execute_run_with_progress(run_idx: int, config):
                             experiment_duration_seconds,
                         )
 
-                    # Show URL for viewing results (after all postprocessing is complete)
-                    show_results_url(invocation_id)
-
                 except AssertionError:
                     _log_eval_error(
                         Status.eval_finished(),

diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py
@@ -279,7 +279,7 @@ def pytest_configure(config) -> None:
         pass
 
 
-def pytest_sessionfinish(session, exitstatus):
+def _print_experiment_links(session):
     """Print all collected Fireworks experiment links from pytest stash."""
     try:
         # Late import to avoid circulars; if missing key, skip printing
@@ -291,9 +291,8 @@ def pytest_sessionfinish(session, exitstatus):
         except Exception:
             EXPERIMENT_LINKS_STASH_KEY = None
 
-        # Get links from pytest stash using shared key
+        # Get links from pytest stash
         links = []
-
         if EXPERIMENT_LINKS_STASH_KEY is not None and EXPERIMENT_LINKS_STASH_KEY in session.stash:
             links = session.stash[EXPERIMENT_LINKS_STASH_KEY]
 
@@ -309,6 +308,55 @@ def pytest_sessionfinish(session, exitstatus):
                     print(f"❌ Experiment {link['experiment_id']}: {link['job_link']}", file=sys.__stderr__)
 
             print("=" * 80, file=sys.__stderr__)
+            return True
+        return False
+    except Exception:
+        return False
+
+
+def _print_local_ui_results_urls(session):
+    """Print the local UI results URLs collected in the pytest session stash.
+
+    Args:
+        session: The pytest session whose stash may hold the collected URLs.
+
+    Returns:
+        True if at least one URL entry was printed, False otherwise
+        (including when the stash key is unavailable or any error occurs).
+    """
+    try:
+        # Late import to avoid circulars; if missing key, skip printing
+        from .store_results_url import RESULTS_URLS_STASH_KEY  # type: ignore
+
+        # An absent key simply means nothing was stored during this session
+        urls = session.stash.get(RESULTS_URLS_STASH_KEY, [])
+        if not urls:
+            return False
+
+        rule = "=" * 80
+        print("\n" + rule, file=sys.__stderr__)
+        print("📊 LOCAL UI EVALUATION RESULTS", file=sys.__stderr__)
+        print(rule, file=sys.__stderr__)
+        for url_data in urls:
+            print(f"📊 Invocation {url_data['invocation_id']}:", file=sys.__stderr__)
+            print(f"  📊 Aggregate scores: {url_data['pivot_url']}", file=sys.__stderr__)
+            print(f"  📋 Trajectories: {url_data['table_url']}", file=sys.__stderr__)
+        print(rule, file=sys.__stderr__)
+        return True
+    except Exception:
+        # Reporting is best effort and must never fail the test session
+        return False
+
+
+def pytest_sessionfinish(session, exitstatus):
+    """Print all collected Fireworks experiment links and evaluation results URLs from pytest stash."""
+    try:
+        # Print experiment links and results URLs separately
+        links_printed = _print_experiment_links(session)
+        urls_printed = _print_local_ui_results_urls(session)
+
+        # Flush stderr if anything was printed
+        if links_printed or urls_printed:
             err_stream = getattr(sys, "__stderr__", None)
             if err_stream is not None:
                 try:

diff --git a/eval_protocol/pytest/store_results_url.py b/eval_protocol/pytest/store_results_url.py
@@ -0,0 +1,46 @@
+from typing import TypedDict
+from pytest import StashKey
+
+
+class ResultsUrl(TypedDict):  # one stash entry: the local UI URLs of a single invocation
+    invocation_id: str  # invocation the two URLs below are filtered by
+    pivot_url: str  # printed as "Aggregate scores" at session finish
+    table_url: str  # printed as "Trajectories" at session finish
+
+
+RESULTS_URLS_STASH_KEY = StashKey[list[ResultsUrl]]()  # session.stash key for the collected ResultsUrl entries
+
+
+def _store_local_ui_url_in_stash(invocation_id: str, pivot_url: str, table_url: str):
+    """Store results URLs in the pytest session stash (best effort).
+
+    The session is found by walking up the call stack for a local named
+    ``session`` that has a ``stash`` attribute. If none is found, or anything
+    fails along the way, the URLs are silently dropped.
+
+    Args:
+        invocation_id: The invocation the URLs belong to.
+        pivot_url: URL stored under the ``pivot_url`` key.
+        table_url: URL stored under the ``table_url`` key.
+    """
+    try:
+        import sys
+
+        # Walk up the call stack to find the pytest session
+        frame = sys._getframe()  # pyright: ignore[reportPrivateUsage]
+        while frame:
+            candidate = frame.f_locals.get("session")  # pyright: ignore[reportAny]
+            if hasattr(candidate, "stash"):
+                # First frame holding a session wins; record the entry and stop
+                entry: ResultsUrl = {"invocation_id": invocation_id, "pivot_url": pivot_url, "table_url": table_url}
+                candidate.stash.setdefault(RESULTS_URLS_STASH_KEY, []).append(entry)  # pyright: ignore[reportAny]
+                return
+            frame = frame.f_back
+    except Exception:
+        # Deliberate best effort: URL bookkeeping must never break a test run
+        pass
+
+
+def store_local_ui_url(invocation_id: str, pivot_url: str, table_url: str) -> None:
+    """Public entry point: record one invocation's local UI URLs in the pytest session stash."""
+    _store_local_ui_url_in_stash(invocation_id, pivot_url, table_url)
diff --git a/eval_protocol/utils/show_results_url.py b/eval_protocol/utils/show_results_url.py
@@ -5,6 +5,8 @@
 import socket
 import urllib.parse
 
+from eval_protocol.pytest.store_results_url import store_local_ui_url
+
 
 def is_server_running(host: str = "localhost", port: int = 8000) -> bool:
     """
@@ -58,25 +60,15 @@ def generate_invocation_filter_url(invocation_id: str, base_url: str = "http://l
     return f"{base_url}?filterConfig={encoded_filter}"
 
 
-def show_results_url(invocation_id: str) -> None:
+def store_local_ui_results_url(invocation_id: str) -> None:
     """
-    Show URLs for viewing evaluation results filtered by invocation_id.
-
-    If the server is not running, prints a message to run "ep logs" to start the local UI.
-    If the server is running, prints URLs to view results filtered by invocation_id.
+    Store URLs for viewing evaluation results filtered by invocation_id in pytest stash.
 
     Args:
-            invocation_id: The invocation ID to filter results by
+        invocation_id: The invocation ID to filter results by
     """
-    if is_server_running():
-        pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
-        table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")
-        print("View your evaluation results:")
-        print(f"  📊 Aggregate scores: {pivot_url}")
-        print(f"  📋 Trajectories: {table_url}")
-    else:
-        pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
-        table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")
-        print("Start the local UI with 'ep logs', then visit:")
-        print(f"  📊 Aggregate scores: {pivot_url}")
-        print(f"  📋 Trajectories: {table_url}")
+    pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
+    table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")
+
+    # Store URLs in pytest stash for later printing in pytest_sessionfinish
+    store_local_ui_url(invocation_id, pivot_url, table_url)
diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py
@@ -23,20 +23,25 @@
 
 def agent_factory(config: RolloutProcessorConfig) -> Agent:
     model_name = config.completion_params["model"]
-    provider = config.completion_params["provider"]
+    provider = config.completion_params.get("provider", "openai")  # fall back to OpenAI when no provider is given
     model = OpenAIChatModel(model_name, provider=provider)
     return setup_agent(model)
 
 
-@pytest.mark.asyncio
-@evaluation_test(
-    input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]],
-    completion_params=[
+@pytest.mark.parametrize(
+    "completion_params",
+    [
         {
             "model": "accounts/fireworks/models/kimi-k2-instruct",
             "provider": "fireworks",
         },
+        {
+            "model": "gpt-5",
+        },
     ],
+)
+@evaluation_test(
+    input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]],
     rollout_processor=PydanticAgentRolloutProcessor(agent_factory),
     mode="pointwise",
 )

diff --git a/tests/pytest/test_pytest_propagate_error.py b/tests/pytest/test_pytest_propagate_error.py
@@ -72,6 +72,6 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
         assert row.eval_metadata.status.is_error()
 
     # make sure the error message includes details of the error
-    assert all("HTTPStatusError" in row.rollout_status.message for row in rollouts.values())
-    assert all("405 Method Not Allowed" in row.rollout_status.message for row in rollouts.values())
-    assert all("https://docs.fireworks.ai/mcp-non-existent" in row.rollout_status.message for row in rollouts.values())
+    assert any("HTTPStatusError" in row.rollout_status.message for row in rollouts.values())
+    assert any("405 Method Not Allowed" in row.rollout_status.message for row in rollouts.values())
+    assert any("https://docs.fireworks.ai/mcp-non-existent" in row.rollout_status.message for row in rollouts.values())