Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,8 @@
"editor.formatOnSave": true,
"[python]": {
"editor.defaultFormatter": "charliermarsh.ruff"
},
"[typescript]": {
"editor.defaultFormatter": "esbenp.prettier-vscode"
}
}
4 changes: 4 additions & 0 deletions eval_protocol/pytest/evaluation_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
parse_ep_passed_threshold,
rollout_processor_with_retry,
)
from eval_protocol.utils.show_results_url import show_results_url

from ..common_utils import load_jsonl

Expand Down Expand Up @@ -555,6 +556,9 @@ async def execute_run_with_progress(run_idx: int, config):
experiment_duration_seconds,
)

# Show URL for viewing results (after all postprocessing is complete)
show_results_url(invocation_id)

except AssertionError:
_log_eval_error(
Status.eval_finished(),
Expand Down
77 changes: 77 additions & 0 deletions eval_protocol/utils/check_server_status.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""
Utility functions for checking server status and generating UI URLs.
"""

import socket
import urllib.parse
from typing import List, Dict, Any


def is_server_running(host: str = "localhost", port: int = 8000) -> bool:
    """
    Report whether something accepts TCP connections on ``host``:``port``.

    An IPv4 TCP socket with a one-second timeout attempts a connection; the
    attempt counts as successful only when ``connect_ex`` reports error code 0.

    Args:
        host: The host to probe (default: "localhost")
        port: The port to probe (default: 8000)

    Returns:
        True if the connection attempt succeeded, False otherwise (including
        when any exception is raised while probing)
    """
    address = (host, port)
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(1)
            # connect_ex returns an errno-style code instead of raising on failure.
            error_code = probe.connect_ex(address)
    except Exception:
        # Best-effort check: any failure is treated as "not running".
        return False
    return error_code == 0


def generate_invocation_filter_url(invocation_id: str, base_url: str = "http://localhost:8000") -> str:
    """
    Generate a URL for viewing results filtered by invocation_id.

    The filter configuration is serialized as JSON and percent-encoded into the
    ``filterConfig`` query parameter of the ``/pivot`` page under ``base_url``.

    Args:
        invocation_id: The invocation ID to filter results by
        base_url: The base URL for the UI (default: "http://localhost:8000")

    Returns:
        URL whose ``filterConfig`` query parameter holds the percent-encoded
        JSON filter configuration
    """
    # Function-scope import: json is not among this module's top-level imports.
    import json

    filter_config = [
        {
            "logic": "AND",
            "filters": [
                {
                    "field": "$.execution_metadata.invocation_id",
                    "operator": "equals",
                    "value": invocation_id,
                    "type": "text",
                }
            ],
        }
    ]

    # Serialize with json.dumps rather than rewriting the Python repr: swapping
    # quote characters in str(...) output yields invalid JSON as soon as the
    # invocation ID contains a quote or a backslash.
    filter_config_json = json.dumps(filter_config)
    encoded_filter = urllib.parse.quote(filter_config_json)

    return f"{base_url}/pivot?filterConfig={encoded_filter}"


def show_results_url(invocation_id: str) -> None:
    """
    Print where to view evaluation results for the given invocation_id.

    The same filtered URL is printed in both cases; only the lead-in differs.
    When the server check succeeds the URL is presented directly, otherwise the
    message first tells the user to start the local UI with "ep logs".

    Args:
        invocation_id: The invocation ID to filter results by
    """
    server_up = is_server_running()
    url = generate_invocation_filter_url(invocation_id)
    if server_up:
        message = f"View your evaluation results: {url}"
    else:
        message = f"Start the local UI with 'ep logs', then visit: {url}"
    print(message)
82 changes: 82 additions & 0 deletions eval_protocol/utils/show_results_url.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""
Utility functions for showing evaluation results URLs and checking server status.
"""

import socket
import urllib.parse


def is_server_running(host: str = "localhost", port: int = 8000) -> bool:
    """
    Report whether something accepts TCP connections on ``host``:``port``.

    An IPv4 TCP socket with a one-second timeout attempts a connection; the
    attempt counts as successful only when ``connect_ex`` reports error code 0.

    Args:
        host: The host to probe (default: "localhost")
        port: The port to probe (default: 8000)

    Returns:
        True if the connection attempt succeeded, False otherwise (including
        when any exception is raised while probing)
    """
    address = (host, port)
    try:
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
            probe.settimeout(1)
            # connect_ex returns an errno-style code instead of raising on failure.
            error_code = probe.connect_ex(address)
    except Exception:
        # Best-effort check: any failure is treated as "not running".
        return False
    return error_code == 0


def generate_invocation_filter_url(invocation_id: str, base_url: str = "http://localhost:8000") -> str:
    """
    Generate a URL for viewing results filtered by invocation_id.

    The filter configuration is serialized as JSON and percent-encoded into a
    ``filterConfig`` query parameter appended directly to ``base_url``, so
    ``base_url`` should already include any page path.

    Args:
        invocation_id: The invocation ID to filter results by
        base_url: The base URL for the UI (default: "http://localhost:8000")

    Returns:
        URL whose ``filterConfig`` query parameter holds the percent-encoded
        JSON filter configuration
    """
    # Function-scope import: json is not among this module's top-level imports.
    import json

    filter_config = [
        {
            "logic": "AND",
            "filters": [
                {
                    "field": "$.execution_metadata.invocation_id",
                    "operator": "==",
                    "value": invocation_id,
                    "type": "text",
                }
            ],
        }
    ]

    # Serialize with json.dumps rather than rewriting the Python repr: swapping
    # quote characters in str(...) output yields invalid JSON as soon as the
    # invocation ID contains a quote or a backslash.
    filter_config_json = json.dumps(filter_config)
    encoded_filter = urllib.parse.quote(filter_config_json)

    return f"{base_url}?filterConfig={encoded_filter}"


def show_results_url(invocation_id: str) -> None:
    """
    Print links for viewing evaluation results for the given invocation_id.

    Two filtered links are always printed: the pivot page (aggregate scores)
    and the table page (trajectories). Only the heading differs: when the
    server check fails, it tells the user to start the local UI with "ep logs"
    before visiting the links.

    Args:
        invocation_id: The invocation ID to filter results by
    """
    server_up = is_server_running()
    pivot_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/pivot")
    table_url = generate_invocation_filter_url(invocation_id, "http://localhost:8000/table")

    heading = "View your evaluation results:" if server_up else "Start the local UI with 'ep logs', then visit:"
    print(heading)
    print(f" 📊 Aggregate scores: {pivot_url}")
    print(f" 📋 Trajectories: {table_url}")
2 changes: 0 additions & 2 deletions pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ asyncio_mode = auto
asyncio_default_fixture_loop_scope = function
testpaths = tests ./eval_protocol/quickstart
python_files = test_*.py llm_judge_*.py
plugins =
eval_protocol.pytest.plugin
python_classes = Test*
python_functions = test_*
# Configure stdout/stderr capture for debugging
Expand Down
Loading
Loading