From 5b0cda41f343650b104fcc64a2a38ac1c5065fd6 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 22 Sep 2025 09:15:23 -0700 Subject: [PATCH 1/3] fix printing of local ui URL to be at end of pytest session --- eval_protocol/pytest/evaluation_test.py | 8 +- eval_protocol/pytest/plugin.py | 54 ++++++- eval_protocol/pytest/store_results_url.py | 46 ++++++ eval_protocol/utils/show_results_url.py | 28 ++-- .../chinook/pydantic/test_pydantic_chinook.py | 15 +- tests/test_show_results_url.py | 139 ++++++------------ 6 files changed, 165 insertions(+), 125 deletions(-) create mode 100644 eval_protocol/pytest/store_results_url.py diff --git a/eval_protocol/pytest/evaluation_test.py b/eval_protocol/pytest/evaluation_test.py index 2e1254d6..333214cd 100644 --- a/eval_protocol/pytest/evaluation_test.py +++ b/eval_protocol/pytest/evaluation_test.py @@ -59,7 +59,7 @@ parse_ep_passed_threshold, rollout_processor_with_retry, ) -from eval_protocol.utils.show_results_url import show_results_url +from eval_protocol.utils.show_results_url import store_local_ui_results_url from ..common_utils import load_jsonl @@ -220,6 +220,9 @@ def create_wrapper_with_signature() -> Callable[[], None]: # Create the function body that will be used invocation_id = generate_id() + # Store URL for viewing results (after all postprocessing is complete) + store_local_ui_results_url(invocation_id) + async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None: eval_metadata = None @@ -556,9 +559,6 @@ async def execute_run_with_progress(run_idx: int, config): experiment_duration_seconds, ) - # Show URL for viewing results (after all postprocessing is complete) - show_results_url(invocation_id) - except AssertionError: _log_eval_error( Status.eval_finished(), diff --git a/eval_protocol/pytest/plugin.py b/eval_protocol/pytest/plugin.py index beb95bfc..ad48ad38 100644 --- a/eval_protocol/pytest/plugin.py +++ b/eval_protocol/pytest/plugin.py @@ -279,7 +279,7 @@ def pytest_configure(config) -> None: pass -def pytest_sessionfinish(session, exitstatus): +def _print_experiment_links(session): """Print all collected Fireworks experiment links from pytest stash.""" try: # Late import to avoid circulars; if missing key, skip printing @@ -291,9 +291,8 @@ def pytest_sessionfinish(session, exitstatus): except Exception: EXPERIMENT_LINKS_STASH_KEY = None - # Get links from pytest stash using shared key + # Get links from pytest stash links = [] - if EXPERIMENT_LINKS_STASH_KEY is not None and EXPERIMENT_LINKS_STASH_KEY in session.stash: links = session.stash[EXPERIMENT_LINKS_STASH_KEY] @@ -309,6 +308,55 @@ def pytest_sessionfinish(session, exitstatus): print(f"❌ Experiment {link['experiment_id']}: {link['job_link']}", file=sys.__stderr__) print("=" * 80, file=sys.__stderr__) + return True + return False + except Exception: + return False + + +def _print_local_ui_results_urls(session): + """Print all collected evaluation results URLs from pytest stash.""" + try: + # Late import to avoid circulars; if missing key, skip printing + RESULTS_URLS_STASH_KEY = None + try: + from .store_results_url import RESULTS_URLS_STASH_KEY as _URL_KEY # type: ignore + + RESULTS_URLS_STASH_KEY = _URL_KEY + except Exception: + RESULTS_URLS_STASH_KEY = None + + # Get URLs from pytest stash + urls = [] + if RESULTS_URLS_STASH_KEY is not None and RESULTS_URLS_STASH_KEY in session.stash: + urls = session.stash[RESULTS_URLS_STASH_KEY] + + if urls: + print("\n" + "=" * 80, file=sys.__stderr__) + print("📊 LOCAL UI EVALUATION RESULTS", file=sys.__stderr__) + print("=" * 80, file=sys.__stderr__) + + for url_data in urls: + print(f"📊 Invocation {url_data['invocation_id']}:", file=sys.__stderr__) + print(f" 📊 Aggregate scores: {url_data['pivot_url']}", file=sys.__stderr__) + print(f" 📋 Trajectories: {url_data['table_url']}", file=sys.__stderr__) + + print("=" * 80, file=sys.__stderr__) + return True + return False + except Exception: + return False + + +def pytest_sessionfinish(session, exitstatus): + """Print all collected Fireworks experiment links and evaluation results URLs from pytest stash.""" + try: + # Print experiment links and results URLs separately + links_printed = _print_experiment_links(session) + urls_printed = _print_local_ui_results_urls(session) + + # Flush stderr if anything was printed + if links_printed or urls_printed: err_stream = getattr(sys, "__stderr__", None) if err_stream is not None: try: diff --git a/eval_protocol/pytest/store_results_url.py b/eval_protocol/pytest/store_results_url.py new file mode 100644 index 00000000..ccf48541 --- /dev/null +++ b/eval_protocol/pytest/store_results_url.py @@ -0,0 +1,46 @@ +from typing import TypedDict +from pytest import StashKey + + +class ResultsUrl(TypedDict): + invocation_id: str + pivot_url: str + table_url: str + + +RESULTS_URLS_STASH_KEY = StashKey[list[ResultsUrl]]() + + +def _store_local_ui_url_in_stash(invocation_id: str, pivot_url: str, table_url: str): + """Store results URL in pytest session stash.""" + try: + import sys + + # Walk up the call stack to find the pytest session + session = None + frame = sys._getframe() # pyright: ignore[reportPrivateUsage] + while frame: + if "session" in frame.f_locals and hasattr(frame.f_locals["session"], "stash"): # pyright: ignore[reportAny] + session = frame.f_locals["session"] # pyright: ignore[reportAny] + break + frame = frame.f_back + + if session is not None: + global RESULTS_URLS_STASH_KEY + + if RESULTS_URLS_STASH_KEY not in session.stash: # pyright: ignore[reportAny] + session.stash[RESULTS_URLS_STASH_KEY] = [] # pyright: ignore[reportAny] + + session.stash[RESULTS_URLS_STASH_KEY].append( # pyright: ignore[reportAny] + {"invocation_id": invocation_id, "pivot_url": pivot_url, "table_url": table_url} + ) + else: + pass + + except Exception as e: # pyright: ignore[reportUnusedVariable] + pass + + +def store_local_ui_url(invocation_id: str, pivot_url: str, table_url: str): + """Public function to store results URL in pytest session stash.""" + _store_local_ui_url_in_stash(invocation_id, pivot_url, table_url) diff --git a/eval_protocol/utils/show_results_url.py b/eval_protocol/utils/show_results_url.py index 409ac858..492fe50c 100644 --- a/eval_protocol/utils/show_results_url.py +++ b/eval_protocol/utils/show_results_url.py @@ -5,6 +5,8 @@ import socket import urllib.parse +from eval_protocol.pytest.store_results_url import store_local_ui_url + def is_server_running(host: str = "localhost", port: int = 8000) -> bool: """ @@ -58,25 +60,15 @@ def generate_invocation_filter_url(invocation_id: str, base_url: str = "http://l return f"{base_url}?filterConfig={encoded_filter}" -def show_results_url(invocation_id: str) -> None: +def store_local_ui_results_url(invocation_id: str) -> None: """ - Show URLs for viewing evaluation results filtered by invocation_id. - - If the server is not running, prints a message to run "ep logs" to start the local UI. - If the server is running, prints URLs to view results filtered by invocation_id. + Store URLs for viewing evaluation results filtered by invocation_id in pytest stash. Args: - invocation_id: The invocation ID to filter results by + invocation_id: The invocation ID to filter results by """ - if is_server_running(): - pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot") - table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table") - print("View your evaluation results:") - print(f" 📊 Aggregate scores: {pivot_url}") - print(f" 📋 Trajectories: {table_url}") - else: - pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot") - table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table") - print("Start the local UI with 'ep logs', then visit:") - print(f" 📊 Aggregate scores: {pivot_url}") - print(f" 📋 Trajectories: {table_url}") + pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot") + table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table") + + # Store URLs in pytest stash for later printing in pytest_sessionfinish + store_local_ui_url(invocation_id, pivot_url, table_url) diff --git a/tests/chinook/pydantic/test_pydantic_chinook.py b/tests/chinook/pydantic/test_pydantic_chinook.py index 9cdac0ee..62c427eb 100644 --- a/tests/chinook/pydantic/test_pydantic_chinook.py +++ b/tests/chinook/pydantic/test_pydantic_chinook.py @@ -23,20 +23,25 @@ def agent_factory(config: RolloutProcessorConfig) -> Agent: model_name = config.completion_params["model"] - provider = config.completion_params["provider"] + provider = config.completion_params["provider"] if "provider" in config.completion_params else "openai" model = OpenAIChatModel(model_name, provider=provider) return setup_agent(model) -@pytest.mark.asyncio -@evaluation_test( - input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]], - completion_params=[ +@pytest.mark.parametrize( + "completion_params", + [ { "model": "accounts/fireworks/models/kimi-k2-instruct", "provider": "fireworks", }, + { + "model": "gpt-5", + }, ], +) +@evaluation_test( + input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]], rollout_processor=PydanticAgentRolloutProcessor(agent_factory), mode="pointwise", ) diff --git a/tests/test_show_results_url.py b/tests/test_show_results_url.py index c840c4fc..191e73c1 100644 --- a/tests/test_show_results_url.py +++ b/tests/test_show_results_url.py @@ -9,7 +9,7 @@ from eval_protocol.utils.show_results_url import ( is_server_running, generate_invocation_filter_url, - show_results_url, + store_evaluation_results_url, ) @@ -123,124 +123,73 @@ def test_pivot_and_table_urls(self): assert pivot_url.split("?")[1] == table_url.split("?")[1] -class TestShowResultsUrl: - """Test the show_results_url function.""" +class TestStoreEvaluationResultsUrl: + """Test the store_evaluation_results_url function.""" - @patch("eval_protocol.utils.show_results_url.is_server_running") - @patch("builtins.print") - def test_server_running_pivot_and_table(self, mock_print, mock_is_running): - """Test output when server is running.""" - mock_is_running.return_value = True + @patch("eval_protocol.utils.show_results_url.store_results_url") + def test_stores_urls_correctly(self, mock_store): + """Test that URLs are stored correctly.""" + invocation_id = "test-invocation" - show_results_url("test-invocation") + store_evaluation_results_url(invocation_id) - # Should print both pivot and table URLs - assert mock_print.call_count == 3 # Header + 2 URLs - calls = [call[0][0] for call in mock_print.call_args_list] + # Should call store_results_url once with correct parameters + mock_store.assert_called_once() + call_args = mock_store.call_args[0] - assert "View your evaluation results:" in calls[0] - assert "📊 Aggregate scores:" in calls[1] - assert "📋 Trajectories:" in calls[2] - assert "pivot" in calls[1] - assert "table" in calls[2] + assert call_args[0] == invocation_id # invocation_id + assert "pivot" in call_args[1] # pivot_url + assert "table" in call_args[2] # table_url - @patch("eval_protocol.utils.show_results_url.is_server_running") - @patch("builtins.print") - def test_server_not_running_instructions(self, mock_print, mock_is_running): - """Test output when server is not running.""" - mock_is_running.return_value = False - - show_results_url("test-invocation") - - # Should print instructions and both URLs - assert mock_print.call_count == 3 # Instructions + 2 URLs - calls = [call[0][0] for call in mock_print.call_args_list] - - assert "Start the local UI with 'ep logs'" in calls[0] - assert "📊 Aggregate scores:" in calls[1] - assert "📋 Trajectories:" in calls[2] - assert "pivot" in calls[1] - assert "table" in calls[2] - - @patch("eval_protocol.utils.show_results_url.is_server_running") - @patch("builtins.print") - def test_invocation_id_in_urls(self, mock_print, mock_is_running): + @patch("eval_protocol.utils.show_results_url.store_results_url") + def test_invocation_id_in_urls(self, mock_store): """Test that invocation_id appears in both URLs.""" - mock_is_running.return_value = True invocation_id = "unique-test-id-123" - show_results_url(invocation_id) + store_evaluation_results_url(invocation_id) - calls = [call[0][0] for call in mock_print.call_args_list] - pivot_url = calls[1] - table_url = calls[2] + call_args = mock_store.call_args[0] + pivot_url = call_args[1] + table_url = call_args[2] assert invocation_id in pivot_url assert invocation_id in table_url - @patch("eval_protocol.utils.show_results_url.is_server_running") - @patch("builtins.print") - def test_different_invocation_ids(self, mock_print, mock_is_running): + @patch("eval_protocol.utils.show_results_url.store_results_url") + def test_different_invocation_ids(self, mock_store): """Test that different invocation IDs produce different URLs.""" - mock_is_running.return_value = True - # Test with first invocation ID - show_results_url("id-1") - calls_1 = [call[0][0] for call in mock_print.call_args_list] - mock_print.reset_mock() + store_evaluation_results_url("id-1") + call_1 = mock_store.call_args[0] + mock_store.reset_mock() # Test with second invocation ID - show_results_url("id-2") - calls_2 = [call[0][0] for call in mock_print.call_args_list] + store_evaluation_results_url("id-2") + call_2 = mock_store.call_args[0] # URLs should be different - assert calls_1[1] != calls_2[1] # Pivot URLs different - assert calls_1[2] != calls_2[2] # Table URLs different - assert "id-1" in calls_1[1] - assert "id-2" in calls_2[1] + assert call_1[1] != call_2[1] # Pivot URLs different + assert call_1[2] != call_2[2] # Table URLs different + assert "id-1" in call_1[1] + assert "id-2" in call_2[1] class TestIntegration: """Integration tests for the module.""" - @patch("socket.socket") - @patch("builtins.print") - def test_full_workflow_server_running(self, mock_print, mock_socket): - """Test the full workflow when server is running.""" - # Mock server running - mock_socket_instance = MagicMock() - mock_socket_instance.connect_ex.return_value = 0 - mock_socket.return_value.__enter__.return_value = mock_socket_instance + @patch("eval_protocol.utils.show_results_url.store_results_url") + def test_full_workflow_stores_urls(self, mock_store): + """Test the full workflow stores URLs correctly.""" + invocation_id = "integration-test" - show_results_url("integration-test") + store_evaluation_results_url(invocation_id) - # Verify socket was checked - mock_socket_instance.connect_ex.assert_called_once_with(("localhost", 8000)) - - # Verify output - assert mock_print.call_count == 3 - calls = [call[0][0] for call in mock_print.call_args_list] - assert "View your evaluation results:" in calls[0] - assert "integration-test" in calls[1] - assert "integration-test" in calls[2] - - @patch("socket.socket") - @patch("builtins.print") - def test_full_workflow_server_not_running(self, mock_print, mock_socket): - """Test the full workflow when server is not running.""" - # Mock server not running - mock_socket_instance = MagicMock() - mock_socket_instance.connect_ex.return_value = 1 - mock_socket.return_value.__enter__.return_value = mock_socket_instance - - show_results_url("integration-test") - - # Verify socket was checked - mock_socket_instance.connect_ex.assert_called_once_with(("localhost", 8000)) + # Verify store_results_url was called + mock_store.assert_called_once() + call_args = mock_store.call_args[0] - # Verify output - assert mock_print.call_count == 3 - calls = [call[0][0] for call in mock_print.call_args_list] - assert "Start the local UI with 'ep logs'" in calls[0] - assert "integration-test" in calls[1] - assert "integration-test" in calls[2] + assert call_args[0] == invocation_id + assert "pivot" in call_args[1] + assert "table" in call_args[2] + assert "integration-test" in call_args[1] + assert "integration-test" in call_args[2] From 11c10ec9ed1057e19c4b5f7982dfe4fb359961e9 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 22 Sep 2025 09:29:36 -0700 Subject: [PATCH 2/3] fix --- tests/test_show_results_url.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_show_results_url.py b/tests/test_show_results_url.py index 191e73c1..152f942c 100644 --- a/tests/test_show_results_url.py +++ b/tests/test_show_results_url.py @@ -9,7 +9,7 @@ from eval_protocol.utils.show_results_url import ( is_server_running, generate_invocation_filter_url, - store_evaluation_results_url, + store_local_ui_results_url, ) @@ -123,17 +123,17 @@ def test_pivot_and_table_urls(self): assert pivot_url.split("?")[1] == table_url.split("?")[1] -class TestStoreEvaluationResultsUrl: - """Test the store_evaluation_results_url function.""" +class TestStoreLocalUiResultsUrl: + """Test the store_local_ui_results_url function.""" - @patch("eval_protocol.utils.show_results_url.store_results_url") + @patch("eval_protocol.utils.show_results_url.store_local_ui_url") def test_stores_urls_correctly(self, mock_store): """Test that URLs are stored correctly.""" invocation_id = "test-invocation" - store_evaluation_results_url(invocation_id) + store_local_ui_results_url(invocation_id) - # Should call store_results_url once with correct parameters + # Should call store_local_ui_url once with correct parameters mock_store.assert_called_once() call_args = mock_store.call_args[0] @@ -141,12 +141,12 @@ def test_stores_urls_correctly(self, mock_store): assert "pivot" in call_args[1] # pivot_url assert "table" in call_args[2] # table_url - @patch("eval_protocol.utils.show_results_url.store_results_url") + @patch("eval_protocol.utils.show_results_url.store_local_ui_url") def test_invocation_id_in_urls(self, mock_store): """Test that invocation_id appears in both URLs.""" invocation_id = "unique-test-id-123" - store_evaluation_results_url(invocation_id) + store_local_ui_results_url(invocation_id) call_args = mock_store.call_args[0] pivot_url = call_args[1] @@ -155,16 +155,16 @@ def test_invocation_id_in_urls(self, mock_store): assert invocation_id in pivot_url assert invocation_id in table_url - @patch("eval_protocol.utils.show_results_url.store_results_url") + @patch("eval_protocol.utils.show_results_url.store_local_ui_url") def test_different_invocation_ids(self, mock_store): """Test that different invocation IDs produce different URLs.""" # Test with first invocation ID - store_evaluation_results_url("id-1") + store_local_ui_results_url("id-1") call_1 = mock_store.call_args[0] mock_store.reset_mock() # Test with second invocation ID - store_evaluation_results_url("id-2") + store_local_ui_results_url("id-2") call_2 = mock_store.call_args[0] # URLs should be different @@ -177,14 +177,14 @@ def test_different_invocation_ids(self, mock_store): class TestIntegration: """Integration tests for the module.""" - @patch("eval_protocol.utils.show_results_url.store_results_url") + @patch("eval_protocol.utils.show_results_url.store_local_ui_url") def test_full_workflow_stores_urls(self, mock_store): """Test the full workflow stores URLs correctly.""" invocation_id = "integration-test" - store_evaluation_results_url(invocation_id) + store_local_ui_results_url(invocation_id) - # Verify store_results_url was called + # Verify store_local_ui_url was called mock_store.assert_called_once() call_args = mock_store.call_args[0] From 7230caf49963dab4d9057379348f89106f7cba56 Mon Sep 17 00:00:00 2001 From: Dylan Huang Date: Mon, 22 Sep 2025 09:46:07 -0700 Subject: [PATCH 3/3] fix test_pytest_propagate_error --- tests/pytest/test_pytest_propagate_error.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/pytest/test_pytest_propagate_error.py b/tests/pytest/test_pytest_propagate_error.py index 63ed21ce..c1e97cc7 100644 --- a/tests/pytest/test_pytest_propagate_error.py +++ b/tests/pytest/test_pytest_propagate_error.py @@ -72,6 +72,6 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow: assert row.eval_metadata.status.is_error() # make sure the error message includes details of the error - assert all("HTTPStatusError" in row.rollout_status.message for row in rollouts.values()) - assert all("405 Method Not Allowed" in row.rollout_status.message for row in rollouts.values()) - assert all("https://docs.fireworks.ai/mcp-non-existent" in row.rollout_status.message for row in rollouts.values()) + assert any("HTTPStatusError" in row.rollout_status.message for row in rollouts.values()) + assert any("405 Method Not Allowed" in row.rollout_status.message for row in rollouts.values()) + assert any("https://docs.fireworks.ai/mcp-non-existent" in row.rollout_status.message for row in rollouts.values())