diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py index 30ac1ad5..7fe78232 100644 --- a/eval_protocol/cli.py +++ b/eval_protocol/cli.py @@ -427,6 +427,37 @@ def parse_args(args=None): rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending") rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID") + # Local test command + local_test_parser = subparsers.add_parser( + "local-test", + help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.", + ) + local_test_parser.add_argument( + "--entry", + help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).", + ) + local_test_parser.add_argument( + "--ignore-docker", + action="store_true", + help="Ignore Dockerfile even if present; run pytest on host", + ) + local_test_parser.add_argument( + "--yes", + "-y", + action="store_true", + help="Non-interactive: if multiple tests exist and no --entry, fails with guidance", + ) + local_test_parser.add_argument( + "--docker-build-extra", + default="", + help="Extra flags to pass to 'docker build' (quoted string, e.g. \"--no-cache --pull --progress=plain\")", + ) + local_test_parser.add_argument( + "--docker-run-extra", + default="", + help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")", + ) + # Run command (for Hydra-based evaluations) # This subparser intentionally defines no arguments itself. # All arguments after 'run' will be passed to Hydra by parse_known_args. @@ -559,6 +590,10 @@ def _extract_flag_value(argv_list, flag_name): return create_rft_command(args) print("Error: missing subcommand for 'create'. 
Try: eval-protocol create rft") return 1 + elif args.command == "local-test": + from .cli_commands.local_test import local_test_command + + return local_test_command(args) elif args.command == "run": # For the 'run' command, Hydra takes over argument parsing. diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py new file mode 100644 index 00000000..49d34190 --- /dev/null +++ b/eval_protocol/cli_commands/local_test.py @@ -0,0 +1,175 @@ +import argparse +import os +import subprocess +import sys +import shlex +from typing import List + +from .upload import _discover_tests, _prompt_select + + +def _find_dockerfiles(root: str) -> List[str]: + skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"} + dockerfiles: List[str] = [] + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")] + for name in filenames: + if name == "Dockerfile": + dockerfiles.append(os.path.join(dirpath, name)) + return dockerfiles + + +def _run_pytest_host(pytest_target: str) -> int: + print(f"Running locally: pytest {pytest_target} -vs") + proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"]) + return proc.returncode + + +def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List[str] | None = None) -> bool: + context_dir = os.path.dirname(dockerfile_path) + print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...") + try: + base_cmd = ["docker", "build"] + if build_extras: + base_cmd += build_extras + base_cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir] + proc = subprocess.run(base_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + print(proc.stdout) + return proc.returncode == 0 + except FileNotFoundError: + print("Error: docker not found in PATH. 
Install Docker or use --ignore-docker.") + return False + + +def _run_pytest_in_docker( + project_root: str, image_tag: str, pytest_target: str, run_extras: List[str] | None = None +) -> int: + workdir = "/workspace" + # Host HOME logs directory to map into container + host_home = os.path.expanduser("~") + host_logs_dir = os.path.join(host_home, ".eval_protocol") + try: + os.makedirs(host_logs_dir, exist_ok=True) + except Exception: + pass + # Mount read-only is safer; but tests may write artifacts. Use read-write. + cmd = [ + "docker", + "run", + "--rm", + "-v", + f"{project_root}:{workdir}", + "-v", + f"{host_logs_dir}:/container_home/.eval_protocol", + "-e", + "HOME=/container_home", + "-e", + "EVAL_PROTOCOL_DIR=/container_home/.eval_protocol", + "-w", + workdir, + ] + # Try to match host user to avoid permission problems on mounted volume + try: + uid = os.getuid() # type: ignore[attr-defined] + gid = os.getgid() # type: ignore[attr-defined] + cmd += ["--user", f"{uid}:{gid}"] + except Exception: + pass + if run_extras: + cmd += run_extras + cmd += [image_tag, "pytest", pytest_target, "-vs"] + print("Running in Docker:", " ".join(cmd)) + try: + proc = subprocess.run(cmd) + return proc.returncode + except FileNotFoundError: + print("Error: docker not found in PATH. 
Install Docker or use --ignore-docker.") + return 1 + + +def local_test_command(args: argparse.Namespace) -> int: + project_root = os.getcwd() + + # Selection and pytest target resolution + pytest_target: str = "" + entry = getattr(args, "entry", None) + if entry: + if "::" in entry: + file_part, func_part = entry.split("::", 1) + file_path = ( + file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part)) + ) + # Convert to project-relative like the non-:: path + try: + rel = os.path.relpath(file_path, project_root) + except Exception: + rel = file_path + pytest_target = f"{rel}::{func_part}" + else: + file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry)) + # Use path relative to project_root when possible + try: + rel = os.path.relpath(file_path, project_root) + except Exception: + rel = file_path + pytest_target = rel + else: + tests = _discover_tests(project_root) + if not tests: + print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.") + return 1 + non_interactive = bool(getattr(args, "yes", False)) + selected = _prompt_select(tests, non_interactive=non_interactive) + if not selected: + print("No tests selected.") + return 1 + if len(selected) != 1: + print("Error: Please select exactly one evaluation test for 'local-test'.") + return 1 + chosen = selected[0] + abs_path = os.path.abspath(chosen.file_path) + try: + rel = os.path.relpath(abs_path, project_root) + except Exception: + rel = abs_path + pytest_target = rel + + ignore_docker = bool(getattr(args, "ignore_docker", False)) + build_extras_str = getattr(args, "docker_build_extra", "") or "" + run_extras_str = getattr(args, "docker_run_extra", "") or "" + build_extras = shlex.split(build_extras_str) if build_extras_str else [] + run_extras = shlex.split(run_extras_str) if run_extras_str else [] + if ignore_docker: + if not pytest_target: + print("Error: Failed to resolve a pytest target to run.") + 
return 1 + return _run_pytest_host(pytest_target) + + dockerfiles = _find_dockerfiles(project_root) + if len(dockerfiles) > 1: + print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.") + for df in dockerfiles: + print(f" - {df}") + print("Hint: use --ignore-docker to bypass Docker.") + return 1 + if len(dockerfiles) == 1: + # Ensure host home logs directory exists so container writes are visible to host ep logs + try: + os.makedirs(os.path.join(os.path.expanduser("~"), ".eval_protocol"), exist_ok=True) + except Exception: + pass + image_tag = "ep-evaluator:local" + ok = _build_docker_image(dockerfiles[0], image_tag, build_extras=build_extras) + if not ok: + print("Docker build failed. See logs above.") + return 1 + if not pytest_target: + print("Error: Failed to resolve a pytest target to run.") + return 1 + return _run_pytest_in_docker(project_root, image_tag, pytest_target, run_extras=run_extras) + + # No Dockerfile: run on host + if not pytest_target: + print("Error: Failed to resolve a pytest target to run.") + return 1 + return _run_pytest_host(pytest_target) diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py index 51283b23..8c6e7baf 100644 --- a/eval_protocol/cli_commands/upload.py +++ b/eval_protocol/cli_commands/upload.py @@ -437,7 +437,7 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe # Check if only one test - auto-select it if len(tests) == 1: print(f"\nFound 1 test: {_format_test_choice(tests[0], 1)}") - confirm = questionary.confirm("Upload this test?", default=True, style=custom_style).ask() + confirm = questionary.confirm("Select this test?", default=True, style=custom_style).ask() if confirm: return tests else: @@ -500,7 +500,7 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest] print("=" * 80) try: - choice = input("Enter the number to upload: ").strip() + choice = input("Enter the number to select: ").strip() 
def _setup_project(tmp_path, monkeypatch, test_name):
    """Create ``proj/metric/<test_name>`` under *tmp_path* and make ``proj`` the cwd.

    The home directory is redirected into *tmp_path* as well: the Docker code
    path of ``local_test_command`` creates ``~/.eval_protocol`` even when the
    Docker helpers are faked, and the tests must not touch the real home.

    Returns:
        ``(lt, project, test_file)`` where ``lt`` is the module under test.
    """
    # Import before HOME is patched so nothing computed at import time is
    # derived from the temporary home directory.
    from eval_protocol.cli_commands import local_test as lt

    project = tmp_path / "proj"
    project.mkdir()
    monkeypatch.chdir(project)

    home = tmp_path / "home"
    home.mkdir()
    monkeypatch.setenv("HOME", str(home))  # read by os.path.expanduser on POSIX
    monkeypatch.setenv("USERPROFILE", str(home))  # read by os.path.expanduser on Windows

    test_file = project / "metric" / test_name
    test_file.parent.mkdir(parents=True, exist_ok=True)
    test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8")
    return lt, project, test_file


def test_local_test_runs_host_pytest_with_entry(tmp_path, monkeypatch):
    """With --entry and no Dockerfile, the host runner gets the project-relative path."""
    lt, project, test_file = _setup_project(tmp_path, monkeypatch, "test_one.py")

    # Avoid Docker path
    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])

    captured = {"target": ""}

    def _fake_host(target: str) -> int:
        captured["target"] = target
        return 0

    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)

    args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
    assert rc == 0
    # Expect relative path target
    assert captured["target"] == os.path.relpath(str(test_file), str(project))


def test_local_test_ignores_docker_when_flag_set(tmp_path, monkeypatch):
    """--ignore-docker runs on the host even though a Dockerfile is reported."""
    lt, project, test_file = _setup_project(tmp_path, monkeypatch, "test_two.py")

    # Pretend we have Dockerfile(s), but ignore_docker=True should skip
    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])

    called = {"host": False}

    def _fake_host(target: str) -> int:
        called["host"] = True
        return 0

    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)

    args = SimpleNamespace(entry=str(test_file), ignore_docker=True, yes=True)
    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
    assert rc == 0
    assert called["host"] is True


def test_local_test_errors_on_multiple_dockerfiles(tmp_path, monkeypatch):
    """More than one Dockerfile makes the command fail with exit code 1."""
    lt, project, test_file = _setup_project(tmp_path, monkeypatch, "test_three.py")

    monkeypatch.setattr(
        lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile"), str(project / "another" / "Dockerfile")]
    )

    args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
    assert rc == 1


def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
    """A single Dockerfile triggers a build and a Docker run of the relative target."""
    lt, project, test_file = _setup_project(tmp_path, monkeypatch, "test_four.py")

    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, build_extras=None: True)

    captured = {"target": "", "image": ""}

    def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
        captured["target"] = pytest_target
        captured["image"] = image_tag
        return 0

    monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)

    args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
    assert rc == 0
    assert captured["image"] == "ep-evaluator:local"
    assert captured["target"] == os.path.relpath(str(test_file), str(project))


def test_local_test_selector_single_test(tmp_path, monkeypatch):
    """Without --entry, the single discovered test is selected and run on the host."""
    lt, project, test_file = _setup_project(tmp_path, monkeypatch, "test_sel.py")

    # No entry; force discover + selector
    disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file))
    monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc])
    monkeypatch.setattr(lt, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])

    called = {"host": False}

    def _fake_host(target: str) -> int:
        called["host"] = True
        return 0

    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)

    args = SimpleNamespace(entry=None, ignore_docker=False, yes=True)
    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
    assert rc == 0
    assert called["host"] is True


def test_local_test_passes_docker_build_extra(tmp_path, monkeypatch):
    """--docker-build-extra is shell-split and handed to the build step in order."""
    lt, project, test_file = _setup_project(tmp_path, monkeypatch, "test_build_extra.py")

    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])

    captured = {"extras": None}

    def _fake_build(dockerfile, tag, build_extras=None):
        captured["extras"] = build_extras
        return True

    def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
        return 0

    monkeypatch.setattr(lt, "_build_docker_image", _fake_build)
    monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)

    # Extras string with multiple flags and equals-arg
    args = SimpleNamespace(
        entry=str(test_file),
        ignore_docker=False,
        yes=True,
        docker_build_extra="--no-cache --pull --progress=plain --build-arg KEY=VAL",
        docker_run_extra="",
    )
    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
    assert rc == 0
    # Expect split list preserving tokens order
    assert captured["extras"] == ["--no-cache", "--pull", "--progress=plain", "--build-arg", "KEY=VAL"]


def test_local_test_passes_docker_run_extra(tmp_path, monkeypatch):
    """--docker-run-extra is shell-split and handed to the run step in order."""
    lt, project, test_file = _setup_project(tmp_path, monkeypatch, "test_run_extra.py")

    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, build_extras=None: True)

    captured = {"extras": None}

    def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
        captured["extras"] = run_extras
        return 0

    monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)

    args = SimpleNamespace(
        entry=str(test_file),
        ignore_docker=False,
        yes=True,
        docker_build_extra="",
        docker_run_extra="--env-file .env --memory=8g --cpus=2 --add-host=host.docker.internal:host-gateway",
    )
    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
    assert rc == 0
    assert captured["extras"] == [
        "--env-file",
        ".env",
        "--memory=8g",
        "--cpus=2",
        "--add-host=host.docker.internal:host-gateway",
    ]


def test_local_test_normalizes_entry_with_selector(tmp_path, monkeypatch):
    """An absolute ``path::function`` entry becomes ``<relative path>::function``."""
    lt, project, test_file = _setup_project(tmp_path, monkeypatch, "test_sel_abs.py")

    abs_entry = f"{str(test_file)}::test_dummy"

    # Avoid Docker path
    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])

    captured = {"target": ""}

    def _fake_host(target: str) -> int:
        captured["target"] = target
        return 0

    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)

    args = SimpleNamespace(entry=abs_entry, ignore_docker=False, yes=True)
    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
    assert rc == 0
    # Expect project-relative path plus selector
    rel = os.path.relpath(str(test_file), str(project))
    assert captured["target"] == f"{rel}::test_dummy"