Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions eval_protocol/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,37 @@ def parse_args(args=None):
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")

# Local test command
local_test_parser = subparsers.add_parser(
"local-test",
help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.",
)
local_test_parser.add_argument(
"--entry",
help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).",
)
local_test_parser.add_argument(
"--ignore-docker",
action="store_true",
help="Ignore Dockerfile even if present; run pytest on host",
)
local_test_parser.add_argument(
"--yes",
"-y",
action="store_true",
help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
)
local_test_parser.add_argument(
"--docker-build-extra",
default="",
help="Extra flags to pass to 'docker build' (quoted string, e.g. \"--no-cache --pull --progress=plain\")",
)
local_test_parser.add_argument(
"--docker-run-extra",
default="",
help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")",
)

# Run command (for Hydra-based evaluations)
# This subparser intentionally defines no arguments itself.
# All arguments after 'run' will be passed to Hydra by parse_known_args.
Expand Down Expand Up @@ -559,6 +590,10 @@ def _extract_flag_value(argv_list, flag_name):
return create_rft_command(args)
print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
return 1
elif args.command == "local-test":
from .cli_commands.local_test import local_test_command

return local_test_command(args)
elif args.command == "run":
# For the 'run' command, Hydra takes over argument parsing.

Expand Down
175 changes: 175 additions & 0 deletions eval_protocol/cli_commands/local_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import argparse
import os
import subprocess
import sys
import shlex
from typing import List

from .upload import _discover_tests, _prompt_select


def _find_dockerfiles(root: str) -> List[str]:
skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
dockerfiles: List[str] = []
for dirpath, dirnames, filenames in os.walk(root):
dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
for name in filenames:
if name == "Dockerfile":
dockerfiles.append(os.path.join(dirpath, name))
return dockerfiles


def _run_pytest_host(pytest_target: str) -> int:
    """Run pytest against *pytest_target* on the host interpreter.

    Returns the pytest process exit code (0 on success).
    """
    command = [sys.executable, "-m", "pytest", pytest_target, "-vs"]
    print(f"Running locally: pytest {pytest_target} -vs")
    return subprocess.run(command).returncode


def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List[str] | None = None) -> bool:
    """Build a Docker image from *dockerfile_path* tagged as *image_tag*.

    The build context is the directory containing the Dockerfile.
    *build_extras* are extra CLI flags inserted before the standard
    ``-t``/``-f`` arguments (e.g. ``--no-cache``).

    Returns True when the build succeeds, False on build failure or when
    the ``docker`` binary is not installed.
    """
    context_dir = os.path.dirname(dockerfile_path)
    print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
    cmd = ["docker", "build"]
    if build_extras:
        cmd += build_extras
    cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir]
    try:
        # Stream build output directly to the terminal instead of capturing
        # it with PIPE: previously a long build printed nothing until it
        # finished (appearing hung) and held the entire log in memory.
        proc = subprocess.run(cmd)
    except FileNotFoundError:
        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
        return False
    return proc.returncode == 0


def _run_pytest_in_docker(
    project_root: str, image_tag: str, pytest_target: str, run_extras: List[str] | None = None
) -> int:
    """Run pytest for *pytest_target* inside a container of *image_tag*.

    The project root is bind-mounted at ``/workspace`` and the host's
    ``~/.eval_protocol`` directory is mapped into the container so logs
    written there remain visible on the host.

    Returns the container's exit code, or 1 when docker is not installed.
    """
    workdir = "/workspace"
    # Ensure the host log directory exists before bind-mounting it.
    logs_dir = os.path.join(os.path.expanduser("~"), ".eval_protocol")
    try:
        os.makedirs(logs_dir, exist_ok=True)
    except Exception:
        pass
    # Mounting read-only would be safer, but tests may write artifacts,
    # so the workspace is mounted read-write.
    docker_cmd = ["docker", "run", "--rm"]
    docker_cmd += ["-v", f"{project_root}:{workdir}"]
    docker_cmd += ["-v", f"{logs_dir}:/container_home/.eval_protocol"]
    docker_cmd += ["-e", "HOME=/container_home"]
    docker_cmd += ["-e", "EVAL_PROTOCOL_DIR=/container_home/.eval_protocol"]
    docker_cmd += ["-w", workdir]
    # Run as the host user when possible so files created on the mounted
    # volume are not root-owned (os.getuid/getgid are absent on Windows).
    try:
        docker_cmd += ["--user", f"{os.getuid()}:{os.getgid()}"]  # type: ignore[attr-defined]
    except Exception:
        pass
    if run_extras:
        docker_cmd += run_extras
    docker_cmd += [image_tag, "pytest", pytest_target, "-vs"]
    print("Running in Docker:", " ".join(docker_cmd))
    try:
        return subprocess.run(docker_cmd).returncode
    except FileNotFoundError:
        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
        return 1


def _relative_to_root(path: str, project_root: str) -> str:
    """Return *path* relative to *project_root* when possible, else unchanged.

    os.path.relpath can raise (e.g. paths on different Windows drives);
    in that case the absolute path is used as-is.
    """
    try:
        return os.path.relpath(path, project_root)
    except Exception:
        return path


def local_test_command(args: argparse.Namespace) -> int:
    """Run a single evaluation test locally, via Docker when a Dockerfile exists.

    Resolves a pytest target from ``--entry`` (``path`` or ``path::function``)
    or by interactive selection, then runs it either on the host
    (``--ignore-docker`` or no Dockerfile) or inside a freshly built image.

    Returns a process-style exit code (0 on success, non-zero on failure).
    """
    project_root = os.getcwd()

    # --- Resolve the pytest target -------------------------------------
    pytest_target: str = ""
    entry = getattr(args, "entry", None)
    if entry:
        if "::" in entry:
            file_part, func_part = entry.split("::", 1)
            file_path = (
                file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
            )
            pytest_target = f"{_relative_to_root(file_path, project_root)}::{func_part}"
        else:
            file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
            pytest_target = _relative_to_root(file_path, project_root)
    else:
        tests = _discover_tests(project_root)
        if not tests:
            print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.")
            return 1
        non_interactive = bool(getattr(args, "yes", False))
        # Bug fix: with --yes, _prompt_select returns ALL tests, so the
        # single-selection check below could never pass when more than one
        # test exists. Fail early with actionable guidance instead.
        if non_interactive and len(tests) > 1:
            print("Error: Multiple evaluation tests found and --yes was given.")
            print("Hint: pass --entry path::function to pick exactly one test non-interactively.")
            return 1
        selected = _prompt_select(tests, non_interactive=non_interactive)
        if not selected:
            print("No tests selected.")
            return 1
        if len(selected) != 1:
            print("Error: Please select exactly one evaluation test for 'local-test'.")
            print("Hint: pass --entry path::function to pick exactly one test.")
            return 1
        pytest_target = _relative_to_root(os.path.abspath(selected[0].file_path), project_root)

    # A target must have been produced by one of the branches above;
    # checked once here instead of before each execution path.
    if not pytest_target:
        print("Error: Failed to resolve a pytest target to run.")
        return 1

    # --- Execute -------------------------------------------------------
    ignore_docker = bool(getattr(args, "ignore_docker", False))
    # shlex.split("") == [], so empty/missing extras become no flags.
    build_extras = shlex.split(getattr(args, "docker_build_extra", "") or "")
    run_extras = shlex.split(getattr(args, "docker_run_extra", "") or "")
    if ignore_docker:
        return _run_pytest_host(pytest_target)

    dockerfiles = _find_dockerfiles(project_root)
    if len(dockerfiles) > 1:
        print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.")
        for df in dockerfiles:
            print(f" - {df}")
        print("Hint: use --ignore-docker to bypass Docker.")
        return 1
    if len(dockerfiles) == 1:
        # Ensure host home logs directory exists so container writes are visible to host ep logs
        try:
            os.makedirs(os.path.join(os.path.expanduser("~"), ".eval_protocol"), exist_ok=True)
        except Exception:
            pass
        image_tag = "ep-evaluator:local"
        if not _build_docker_image(dockerfiles[0], image_tag, build_extras=build_extras):
            print("Docker build failed. See logs above.")
            return 1
        return _run_pytest_in_docker(project_root, image_tag, pytest_target, run_extras=run_extras)

    # No Dockerfile: run on host
    return _run_pytest_host(pytest_target)
4 changes: 2 additions & 2 deletions eval_protocol/cli_commands/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe
# Check if only one test - auto-select it
if len(tests) == 1:
print(f"\nFound 1 test: {_format_test_choice(tests[0], 1)}")
confirm = questionary.confirm("Upload this test?", default=True, style=custom_style).ask()
confirm = questionary.confirm("Select this test?", default=True, style=custom_style).ask()
if confirm:
return tests
else:
Expand Down Expand Up @@ -500,7 +500,7 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]

print("=" * 80)
try:
choice = input("Enter the number to upload: ").strip()
choice = input("Enter the number to select: ").strip()
except KeyboardInterrupt:
print("\n\nUpload cancelled.")
return []
Expand Down
7 changes: 5 additions & 2 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import logging
import importlib
from datetime import datetime
from datetime import datetime, timezone
from enum import Enum
from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union

Expand Down Expand Up @@ -825,7 +825,10 @@ class EvaluationRow(BaseModel):
description="Metadata about the execution of the evaluation.",
)

created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the row was created.")
created_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
description="The timestamp when the row was created (UTC).",
)

eval_metadata: Optional[EvalMetadata] = Field(
default=None, description="Metadata about the evaluation that was run."
Expand Down
Loading
Loading