Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@
parse_ep_passed_threshold,
rollout_processor_with_retry,
)
from eval_protocol.utils.show_results_url import show_results_url
from eval_protocol.utils.show_results_url import store_local_ui_results_url

from ..common_utils import load_jsonl

Expand Down Expand Up @@ -220,6 +220,9 @@ def create_wrapper_with_signature() -> Callable[[], None]:
# Create the function body that will be used
invocation_id = generate_id()

# Store the local UI results URLs for this invocation now; they are printed later in pytest_sessionfinish
store_local_ui_results_url(invocation_id)

async def wrapper_body(**kwargs: Unpack[ParameterizedTestKwargs]) -> None:
eval_metadata = None

Expand Down Expand Up @@ -556,9 +559,6 @@ async def execute_run_with_progress(run_idx: int, config):
experiment_duration_seconds,
)

# Show URL for viewing results (after all postprocessing is complete)
show_results_url(invocation_id)

except AssertionError:
_log_eval_error(
Status.eval_finished(),
Expand Down
54 changes: 51 additions & 3 deletions eval_protocol/pytest/plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@ def pytest_configure(config) -> None:
pass


def pytest_sessionfinish(session, exitstatus):
def _print_experiment_links(session):
"""Print all collected Fireworks experiment links from pytest stash."""
try:
# Late import to avoid circulars; if missing key, skip printing
Expand All @@ -291,9 +291,8 @@ def pytest_sessionfinish(session, exitstatus):
except Exception:
EXPERIMENT_LINKS_STASH_KEY = None

# Get links from pytest stash using shared key
# Get links from pytest stash
links = []

if EXPERIMENT_LINKS_STASH_KEY is not None and EXPERIMENT_LINKS_STASH_KEY in session.stash:
links = session.stash[EXPERIMENT_LINKS_STASH_KEY]

Expand All @@ -309,6 +308,55 @@ def pytest_sessionfinish(session, exitstatus):
print(f"❌ Experiment {link['experiment_id']}: {link['job_link']}", file=sys.__stderr__)

print("=" * 80, file=sys.__stderr__)
return True
return False
except Exception:
return False


def _print_local_ui_results_urls(session):
"""Print all collected evaluation results URLs from pytest stash."""
try:
# Late import to avoid circulars; if missing key, skip printing
RESULTS_URLS_STASH_KEY = None
try:
from .store_results_url import RESULTS_URLS_STASH_KEY as _URL_KEY # type: ignore

RESULTS_URLS_STASH_KEY = _URL_KEY
except Exception:
RESULTS_URLS_STASH_KEY = None

# Get URLs from pytest stash
urls = []
if RESULTS_URLS_STASH_KEY is not None and RESULTS_URLS_STASH_KEY in session.stash:
urls = session.stash[RESULTS_URLS_STASH_KEY]

if urls:
print("\n" + "=" * 80, file=sys.__stderr__)
print("📊 LOCAL UI EVALUATION RESULTS", file=sys.__stderr__)
print("=" * 80, file=sys.__stderr__)

for url_data in urls:
print(f"📊 Invocation {url_data['invocation_id']}:", file=sys.__stderr__)
print(f" 📊 Aggregate scores: {url_data['pivot_url']}", file=sys.__stderr__)
print(f" 📋 Trajectories: {url_data['table_url']}", file=sys.__stderr__)

print("=" * 80, file=sys.__stderr__)
return True
return False
except Exception:
return False


def pytest_sessionfinish(session, exitstatus):
"""Print all collected Fireworks experiment links and evaluation results URLs from pytest stash."""
try:
# Print experiment links and results URLs separately
links_printed = _print_experiment_links(session)
urls_printed = _print_local_ui_results_urls(session)

# Flush stderr if anything was printed
if links_printed or urls_printed:
err_stream = getattr(sys, "__stderr__", None)
if err_stream is not None:
try:
Expand Down
46 changes: 46 additions & 0 deletions eval_protocol/pytest/store_results_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
from typing import TypedDict
from pytest import StashKey


class ResultsUrl(TypedDict):
    """One stored results entry: an invocation ID together with its two result URLs."""

    # ID of the invocation the two URLs below belong to.
    invocation_id: str
    # URL stored as the "pivot" link for this invocation.
    pivot_url: str
    # URL stored as the "table" link for this invocation.
    table_url: str


# Stash key under which the list of ResultsUrl entries is kept in the pytest session stash.
RESULTS_URLS_STASH_KEY = StashKey[list[ResultsUrl]]()


def _store_local_ui_url_in_stash(invocation_id: str, pivot_url: str, table_url: str):
"""Store results URL in pytest session stash."""
try:
import sys

# Walk up the call stack to find the pytest session
session = None
frame = sys._getframe() # pyright: ignore[reportPrivateUsage]
while frame:
if "session" in frame.f_locals and hasattr(frame.f_locals["session"], "stash"): # pyright: ignore[reportAny]
session = frame.f_locals["session"] # pyright: ignore[reportAny]
break
frame = frame.f_back

if session is not None:
global RESULTS_URLS_STASH_KEY

if RESULTS_URLS_STASH_KEY not in session.stash: # pyright: ignore[reportAny]
session.stash[RESULTS_URLS_STASH_KEY] = [] # pyright: ignore[reportAny]

session.stash[RESULTS_URLS_STASH_KEY].append( # pyright: ignore[reportAny]
{"invocation_id": invocation_id, "pivot_url": pivot_url, "table_url": table_url}
)
else:
pass

except Exception as e: # pyright: ignore[reportUnusedVariable]
pass


def store_local_ui_url(invocation_id: str, pivot_url: str, table_url: str):
    """Public entry point: store one invocation's result URLs in the pytest session stash.

    Delegates directly to ``_store_local_ui_url_in_stash`` with the same arguments.
    """
    _store_local_ui_url_in_stash(
        invocation_id=invocation_id,
        pivot_url=pivot_url,
        table_url=table_url,
    )
28 changes: 10 additions & 18 deletions eval_protocol/utils/show_results_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import socket
import urllib.parse

from eval_protocol.pytest.store_results_url import store_local_ui_url


def is_server_running(host: str = "localhost", port: int = 8000) -> bool:
"""
Expand Down Expand Up @@ -58,25 +60,15 @@ def generate_invocation_filter_url(invocation_id: str, base_url: str = "http://l
return f"{base_url}?filterConfig={encoded_filter}"


def show_results_url(invocation_id: str) -> None:
def store_local_ui_results_url(invocation_id: str) -> None:
"""
Show URLs for viewing evaluation results filtered by invocation_id.

If the server is not running, prints a message to run "ep logs" to start the local UI.
If the server is running, prints URLs to view results filtered by invocation_id.
Store URLs for viewing evaluation results filtered by invocation_id in pytest stash.

Args:
invocation_id: The invocation ID to filter results by
invocation_id: The invocation ID to filter results by
"""
if is_server_running():
pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")
print("View your evaluation results:")
print(f" 📊 Aggregate scores: {pivot_url}")
print(f" 📋 Trajectories: {table_url}")
else:
pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")
print("Start the local UI with 'ep logs', then visit:")
print(f" 📊 Aggregate scores: {pivot_url}")
print(f" 📋 Trajectories: {table_url}")
pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")

# Store URLs in pytest stash for later printing in pytest_sessionfinish
store_local_ui_url(invocation_id, pivot_url, table_url)
15 changes: 10 additions & 5 deletions tests/chinook/pydantic/test_pydantic_chinook.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,20 +23,25 @@

def agent_factory(config: RolloutProcessorConfig) -> Agent:
model_name = config.completion_params["model"]
provider = config.completion_params["provider"]
provider = config.completion_params["provider"] if "provider" in config.completion_params else "openai"
model = OpenAIChatModel(model_name, provider=provider)
return setup_agent(model)


@pytest.mark.asyncio
@evaluation_test(
input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]],
completion_params=[
@pytest.mark.parametrize(
"completion_params",
[
{
"model": "accounts/fireworks/models/kimi-k2-instruct",
"provider": "fireworks",
},
{
"model": "gpt-5",
},
],
)
@evaluation_test(
input_messages=[[[Message(role="user", content="What is the total number of tracks in the database?")]]],
rollout_processor=PydanticAgentRolloutProcessor(agent_factory),
mode="pointwise",
)
Expand Down
6 changes: 3 additions & 3 deletions tests/pytest/test_pytest_propagate_error.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,6 @@ def eval_fn(row: EvaluationRow) -> EvaluationRow:
assert row.eval_metadata.status.is_error()

# make sure the error message includes details of the error
assert all("HTTPStatusError" in row.rollout_status.message for row in rollouts.values())
assert all("405 Method Not Allowed" in row.rollout_status.message for row in rollouts.values())
assert all("https://docs.fireworks.ai/mcp-non-existent" in row.rollout_status.message for row in rollouts.values())
assert any("HTTPStatusError" in row.rollout_status.message for row in rollouts.values())
assert any("405 Method Not Allowed" in row.rollout_status.message for row in rollouts.values())
assert any("https://docs.fireworks.ai/mcp-non-existent" in row.rollout_status.message for row in rollouts.values())
Loading
Loading