From cd9cc91c34f975482fe05b4bf3a60b4a0bcbd746 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 10 Nov 2025 17:41:27 -0800 Subject: [PATCH 01/11] local test command --- eval_protocol/cli.py | 25 ++++ eval_protocol/cli_commands/local_test.py | 140 ++++++++++++++++++++++ tests/test_cli_local_test.py | 145 +++++++++++++++++++++++ 3 files changed, 310 insertions(+) create mode 100644 eval_protocol/cli_commands/local_test.py create mode 100644 tests/test_cli_local_test.py diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py index 30ac1ad5..4e7ac8da 100644 --- a/eval_protocol/cli.py +++ b/eval_protocol/cli.py @@ -427,6 +427,27 @@ def parse_args(args=None): rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending") rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID") + # Local test command + local_test_parser = subparsers.add_parser( + "local-test", + help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.", + ) + local_test_parser.add_argument( + "--entry", + help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).", + ) + local_test_parser.add_argument( + "--ignore-docker", + action="store_true", + help="Ignore Dockerfile even if present; run pytest on host", + ) + local_test_parser.add_argument( + "--yes", + "-y", + action="store_true", + help="Non-interactive: if multiple tests exist and no --entry, fails with guidance", + ) + # Run command (for Hydra-based evaluations) # This subparser intentionally defines no arguments itself. # All arguments after 'run' will be passed to Hydra by parse_known_args. @@ -559,6 +580,10 @@ def _extract_flag_value(argv_list, flag_name): return create_rft_command(args) print("Error: missing subcommand for 'create'. Try: eval-protocol create rft") return 1 + elif args.command == "local-test": + from .cli_commands.local_test import local_test_command + + return local_test_command(args) elif args.command == "run": # For the 'run' command, Hydra takes over argument parsing. diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py new file mode 100644 index 00000000..a877be7d --- /dev/null +++ b/eval_protocol/cli_commands/local_test.py @@ -0,0 +1,140 @@ +import argparse +import os +import subprocess +import sys +from typing import List + +from .upload import _discover_tests, _prompt_select + + +def _find_dockerfiles(root: str) -> List[str]: + skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"} + dockerfiles: List[str] = [] + for dirpath, dirnames, filenames in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")] + for name in filenames: + if name == "Dockerfile": + dockerfiles.append(os.path.join(dirpath, name)) + return dockerfiles + + +def _run_pytest_host(pytest_target: str) -> int: + print(f"Running locally: pytest {pytest_target} -vs") + proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"]) + return proc.returncode + + +def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool: + context_dir = os.path.dirname(dockerfile_path) + print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...") + try: + proc = subprocess.run( + ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) + print(proc.stdout) + return proc.returncode == 0 + except FileNotFoundError: + print("Error: docker not found in PATH. Install Docker or use --ignore-docker.") + return False + + +def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int: + workdir = "/workspace" + # Mount read-only is safer; but tests may write artifacts. Use read-write. + cmd = [ + "docker", + "run", + "--rm", + "-v", + f"{project_root}:{workdir}", + "-w", + workdir, + image_tag, + "pytest", + pytest_target, + "-vs", + ] + print("Running in Docker:", " ".join(cmd)) + try: + proc = subprocess.run(cmd) + return proc.returncode + except FileNotFoundError: + print("Error: docker not found in PATH. Install Docker or use --ignore-docker.") + return 1 + + +def local_test_command(args: argparse.Namespace) -> int: + project_root = os.getcwd() + + # Selection and pytest target resolution + pytest_target: str = "" + entry = getattr(args, "entry", None) + if entry: + if "::" in entry: + file_part = entry.split("::", 1)[0] + file_path = ( + file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part)) + ) + pytest_target = entry + else: + file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry)) + # Use path relative to project_root when possible + try: + rel = os.path.relpath(file_path, project_root) + except Exception: + rel = file_path + pytest_target = rel + else: + tests = _discover_tests(project_root) + if not tests: + print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.") + return 1 + non_interactive = bool(getattr(args, "yes", False)) + selected = _prompt_select(tests, non_interactive=non_interactive) + if not selected: + print("No tests selected.") + return 1 + if len(selected) != 1: + print("Error: Please select exactly one evaluation test for 'local-test'.") + return 1 + chosen = selected[0] + abs_path = os.path.abspath(chosen.file_path) + try: + rel = os.path.relpath(abs_path, project_root) + except Exception: + rel = abs_path + pytest_target = rel + + ignore_docker = bool(getattr(args, "ignore_docker", False)) + if ignore_docker: + if not pytest_target: + print("Error: Failed to resolve a pytest target to run.") + return 1 + return _run_pytest_host(pytest_target) + + dockerfiles = _find_dockerfiles(project_root) + if len(dockerfiles) > 1: + print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.") + for df in dockerfiles: + print(f" - {df}") + print("Hint: use --ignore-docker to bypass Docker.") + return 1 + if len(dockerfiles) == 1: + image_tag = "ep-evaluator:local" + ok = _build_docker_image(dockerfiles[0], image_tag) + if not ok: + print("Docker build failed. See logs above.") + return 1 + if not pytest_target: + print("Error: Failed to resolve a pytest target to run.") + return 1 + return _run_pytest_in_docker(project_root, image_tag, pytest_target) + + # No Dockerfile: run on host + if not pytest_target: + print("Error: Failed to resolve a pytest target to run.") + return 1 + return _run_pytest_host(pytest_target) diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py new file mode 100644 index 00000000..0763135f --- /dev/null +++ b/tests/test_cli_local_test.py @@ -0,0 +1,145 @@ +import os +from types import SimpleNamespace + +import pytest + + +def test_local_test_runs_host_pytest_with_entry(tmp_path, monkeypatch): + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + # Create a dummy test file + test_file = project / "metric" / "test_one.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") + + # Import module under test + from eval_protocol.cli_commands import local_test as lt + + # Avoid Docker path + monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: []) + + captured = {"target": ""} + + def _fake_host(target: str) -> int: + captured["target"] = target + return 0 + + monkeypatch.setattr(lt, "_run_pytest_host", _fake_host) + + args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True) + rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] + assert rc == 0 + # Expect relative path target + assert captured["target"] == os.path.relpath(str(test_file), str(project)) + + +def test_local_test_ignores_docker_when_flag_set(tmp_path, monkeypatch): + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + test_file = project / "metric" / "test_two.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") + + from eval_protocol.cli_commands import local_test as lt + + # Pretend we have Dockerfile(s), but ignore_docker=True should skip + monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")]) + + called = {"host": False} + + def _fake_host(target: str) -> int: + called["host"] = True + return 0 + + monkeypatch.setattr(lt, "_run_pytest_host", _fake_host) + + args = SimpleNamespace(entry=str(test_file), ignore_docker=True, yes=True) + rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] + assert rc == 0 + assert called["host"] is True + + +def test_local_test_errors_on_multiple_dockerfiles(tmp_path, monkeypatch): + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + test_file = project / "metric" / "test_three.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") + + from eval_protocol.cli_commands import local_test as lt + + monkeypatch.setattr( + lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile"), str(project / "another" / "Dockerfile")] + ) + + args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True) + rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] + assert rc == 1 + + +def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch): + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + test_file = project / "metric" / "test_four.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") + + from eval_protocol.cli_commands import local_test as lt + + monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")]) + monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag: True) + + captured = {"target": "", "image": ""} + + def _fake_run_docker(root: str, image_tag: str, pytest_target: str) -> int: + captured["target"] = pytest_target + captured["image"] = image_tag + return 0 + + monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker) + + args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True) + rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] + assert rc == 0 + assert captured["image"] == "ep-evaluator:local" + assert captured["target"] == os.path.relpath(str(test_file), str(project)) + + +def test_local_test_selector_single_test(tmp_path, monkeypatch): + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + test_file = project / "metric" / "test_sel.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") + + from eval_protocol.cli_commands import local_test as lt + from eval_protocol.cli_commands import upload as up + + # No entry; force discover + selector + disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file)) + monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc]) + monkeypatch.setattr(up, "_prompt_select", lambda tests, non_interactive=False: tests[:1]) + monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: []) + + called = {"host": False} + + def _fake_host(target: str) -> int: + called["host"] = True + return 0 + + monkeypatch.setattr(lt, "_run_pytest_host", _fake_host) + + args = SimpleNamespace(entry=None, ignore_docker=False, yes=True) + rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] + assert rc == 0 + assert called["host"] is True From e7615d7ec75524b19ed38241d1c6165cf32dd79f Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 10 Nov 2025 18:00:39 -0800 Subject: [PATCH 02/11] mount for ep logs --- eval_protocol/cli_commands/local_test.py | 19 +++++++++++++++---- eval_protocol/cli_commands/upload.py | 4 ++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index a877be7d..5c0132c3 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -50,13 +50,19 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) "--rm", "-v", f"{project_root}:{workdir}", + "-e", + f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol", "-w", workdir, - image_tag, - "pytest", - pytest_target, - "-vs", ] + # Try to match host user to avoid permission problems on mounted volume + try: + uid = os.getuid() # type: ignore[attr-defined] + gid = os.getgid() # type: ignore[attr-defined] + cmd += ["--user", f"{uid}:{gid}"] + except Exception: + pass + cmd += [image_tag, "pytest", pytest_target, "-vs"] print("Running in Docker:", " ".join(cmd)) try: proc = subprocess.run(cmd) @@ -123,6 +129,11 @@ def local_test_command(args: argparse.Namespace) -> int: print("Hint: use --ignore-docker to bypass Docker.") return 1 if len(dockerfiles) == 1: + # Ensure shared logs directory exists on host so container writes are visible to host ep logs + try: + os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True) + except Exception: + pass image_tag = "ep-evaluator:local" ok = _build_docker_image(dockerfiles[0], image_tag) if not ok: diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py index 51283b23..8c6e7baf 100644 --- a/eval_protocol/cli_commands/upload.py +++ b/eval_protocol/cli_commands/upload.py @@ -437,7 +437,7 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe # Check if only one test - auto-select it if len(tests) == 1: print(f"\nFound 1 test: {_format_test_choice(tests[0], 1)}") - confirm = questionary.confirm("Upload this test?", default=True, style=custom_style).ask() + confirm = questionary.confirm("Select this test?", default=True, style=custom_style).ask() if confirm: return tests else: @@ -500,7 +500,7 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest] print("=" * 80) try: - choice = input("Enter the number to upload: ").strip() + choice = input("Enter the number to select: ").strip() except KeyboardInterrupt: print("\n\nUpload cancelled.") return [] From 72b9178c97f8666f3a77b9ad33316af7952bcc1b Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 10 Nov 2025 18:09:52 -0800 Subject: [PATCH 03/11] update --- eval_protocol/cli_commands/local_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index 5c0132c3..ad350389 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -51,6 +51,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) "-v", f"{project_root}:{workdir}", "-e", + f"HOME={workdir}", + "-e", f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol", "-w", workdir, From 2907cf8ed922bc35fb0592b37e05b6704aaa4199 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Mon, 10 Nov 2025 18:34:55 -0800 Subject: [PATCH 04/11] try to force linux/amd64 --- eval_protocol/cli_commands/local_test.py | 20 +++++++++++++++----- tests/test_cli_local_test.py | 4 ++-- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index ad350389..a5f1e04a 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -24,12 +24,16 @@ def _run_pytest_host(pytest_target: str) -> int: return proc.returncode -def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool: +def _build_docker_image(dockerfile_path: str, image_tag: str, platform: str | None = None) -> bool: context_dir = os.path.dirname(dockerfile_path) print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...") try: + cmd = ["docker", "build"] + if platform: + cmd += ["--platform", platform] + cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir] proc = subprocess.run( - ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir], + cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, @@ -41,7 +45,7 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool: return False -def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int: +def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str, platform: str | None = None) -> int: workdir = "/workspace" # Mount read-only is safer; but tests may write artifacts. Use read-write. cmd = [ @@ -57,6 +61,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) "-w", workdir, ] + if platform: + cmd += ["--platform", platform] # Try to match host user to avoid permission problems on mounted volume try: uid = os.getuid() # type: ignore[attr-defined] @@ -136,15 +142,19 @@ def local_test_command(args: argparse.Namespace) -> int: os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True) except Exception: pass + # Choose platform to emulate Linux host (default to amd64 on macOS, override with EP_DOCKER_PLATFORM) + selected_platform = os.environ.get("EP_DOCKER_PLATFORM") + if not selected_platform and sys.platform == "darwin": + selected_platform = "linux/amd64" image_tag = "ep-evaluator:local" - ok = _build_docker_image(dockerfiles[0], image_tag) + ok = _build_docker_image(dockerfiles[0], image_tag, platform=selected_platform) if not ok: print("Docker build failed. See logs above.") return 1 if not pytest_target: print("Error: Failed to resolve a pytest target to run.") return 1 - return _run_pytest_in_docker(project_root, image_tag, pytest_target) + return _run_pytest_in_docker(project_root, image_tag, pytest_target, platform=selected_platform) # No Dockerfile: run on host if not pytest_target: diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py index 0763135f..a2330a71 100644 --- a/tests/test_cli_local_test.py +++ b/tests/test_cli_local_test.py @@ -95,11 +95,11 @@ def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch): from eval_protocol.cli_commands import local_test as lt monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")]) - monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag: True) + monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, platform=None: True) captured = {"target": "", "image": ""} - def _fake_run_docker(root: str, image_tag: str, pytest_target: str) -> int: + def _fake_run_docker(root: str, image_tag: str, pytest_target: str, platform=None) -> int: captured["target"] = pytest_target captured["image"] = image_tag return 0 From 4f1ff85d281117c408d8fe139c86015e12f4b05e Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 11 Nov 2025 11:36:21 -0800 Subject: [PATCH 05/11] revert --- eval_protocol/cli_commands/local_test.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index a5f1e04a..5c0132c3 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -24,16 +24,12 @@ def _run_pytest_host(pytest_target: str) -> int: return proc.returncode -def _build_docker_image(dockerfile_path: str, image_tag: str, platform: str | None = None) -> bool: +def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool: context_dir = os.path.dirname(dockerfile_path) print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...") try: - cmd = ["docker", "build"] - if platform: - cmd += ["--platform", platform] - cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir] proc = subprocess.run( - cmd, + ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, @@ -45,7 +41,7 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, platform: str | No return False -def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str, platform: str | None = None) -> int: +def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int: workdir = "/workspace" # Mount read-only is safer; but tests may write artifacts. Use read-write. cmd = [ @@ -55,14 +51,10 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str, "-v", f"{project_root}:{workdir}", "-e", - f"HOME={workdir}", - "-e", f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol", "-w", workdir, ] - if platform: - cmd += ["--platform", platform] # Try to match host user to avoid permission problems on mounted volume try: uid = os.getuid() # type: ignore[attr-defined] @@ -142,19 +134,15 @@ def local_test_command(args: argparse.Namespace) -> int: os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True) except Exception: pass - # Choose platform to emulate Linux host (default to amd64 on macOS, override with EP_DOCKER_PLATFORM) - selected_platform = os.environ.get("EP_DOCKER_PLATFORM") - if not selected_platform and sys.platform == "darwin": - selected_platform = "linux/amd64" image_tag = "ep-evaluator:local" - ok = _build_docker_image(dockerfiles[0], image_tag, platform=selected_platform) + ok = _build_docker_image(dockerfiles[0], image_tag) if not ok: print("Docker build failed. See logs above.") return 1 if not pytest_target: print("Error: Failed to resolve a pytest target to run.") return 1 - return _run_pytest_in_docker(project_root, image_tag, pytest_target, platform=selected_platform) + return _run_pytest_in_docker(project_root, image_tag, pytest_target) # No Dockerfile: run on host if not pytest_target: From 99169abe8d28e3a399987c25f57f56f1f2c56da5 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 11 Nov 2025 11:43:48 -0800 Subject: [PATCH 06/11] set home --- eval_protocol/cli_commands/local_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index 5c0132c3..ad350389 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -51,6 +51,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) "-v", f"{project_root}:{workdir}", "-e", + f"HOME={workdir}", + "-e", f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol", "-w", workdir, From 75d4cb662b07405512702d3aeefacc6da7efeb4d Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 11 Nov 2025 13:06:53 -0800 Subject: [PATCH 07/11] try --- eval_protocol/cli_commands/local_test.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index ad350389..eeab7095 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -43,6 +43,13 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool: def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int: workdir = "/workspace" + # Host HOME logs directory to map into container + host_home = os.path.expanduser("~") + host_logs_dir = os.path.join(host_home, ".eval_protocol") + try: + os.makedirs(host_logs_dir, exist_ok=True) + except Exception: + pass # Mount read-only is safer; but tests may write artifacts. Use read-write. cmd = [ "docker", @@ -50,10 +57,12 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) "--rm", "-v", f"{project_root}:{workdir}", + "-v", + f"{host_logs_dir}:/container_home/.eval_protocol", "-e", - f"HOME={workdir}", + "HOME=/container_home", "-e", - f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol", + "EVAL_PROTOCOL_DIR=/container_home/.eval_protocol", "-w", workdir, ] @@ -131,9 +140,9 @@ def local_test_command(args: argparse.Namespace) -> int: print("Hint: use --ignore-docker to bypass Docker.") return 1 if len(dockerfiles) == 1: - # Ensure shared logs directory exists on host so container writes are visible to host ep logs + # Ensure host home logs directory exists so container writes are visible to host ep logs try: - os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True) + os.makedirs(os.path.join(os.path.expanduser("~"), ".eval_protocol"), exist_ok=True) except Exception: pass image_tag = "ep-evaluator:local" From 41b79daeafe6bcb53a8a3183738314596874696d Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 11 Nov 2025 15:13:03 -0800 Subject: [PATCH 08/11] store in utc --- eval_protocol/models.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/eval_protocol/models.py b/eval_protocol/models.py index 6ec94210..67d287ba 100644 --- a/eval_protocol/models.py +++ b/eval_protocol/models.py @@ -1,7 +1,7 @@ import os import logging import importlib -from datetime import datetime +from datetime import datetime, timezone from enum import Enum from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union @@ -825,7 +825,10 @@ class EvaluationRow(BaseModel): description="Metadata about the execution of the evaluation.", ) - created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the row was created.") + created_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), + description="The timestamp when the row was created (UTC).", + ) eval_metadata: Optional[EvalMetadata] = Field( default=None, description="Metadata about the evaluation that was run." From 5eb5fac9069b59e619d4bf683e8db185ef390c05 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 11 Nov 2025 15:43:12 -0800 Subject: [PATCH 09/11] tests --- eval_protocol/cli.py | 10 +++ eval_protocol/cli_commands/local_test.py | 28 +++++--- tests/test_cli_local_test.py | 83 +++++++++++++++++++++++- 3 files changed, 109 insertions(+), 12 deletions(-) diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py index 4e7ac8da..7fe78232 100644 --- a/eval_protocol/cli.py +++ b/eval_protocol/cli.py @@ -447,6 +447,16 @@ def parse_args(args=None): action="store_true", help="Non-interactive: if multiple tests exist and no --entry, fails with guidance", ) + local_test_parser.add_argument( + "--docker-build-extra", + default="", + help="Extra flags to pass to 'docker build' (quoted string, e.g. \"--no-cache --pull --progress=plain\")", + ) + local_test_parser.add_argument( + "--docker-run-extra", + default="", + help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")", + ) # Run command (for Hydra-based evaluations) # This subparser intentionally defines no arguments itself. diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index eeab7095..cfb52da0 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -2,6 +2,7 @@ import os import subprocess import sys +import shlex from typing import List from .upload import _discover_tests, _prompt_select @@ -24,16 +25,15 @@ def _run_pytest_host(pytest_target: str) -> int: return proc.returncode -def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool: +def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List[str] | None = None) -> bool: context_dir = os.path.dirname(dockerfile_path) print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...") try: - proc = subprocess.run( - ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - text=True, - ) + base_cmd = ["docker", "build"] + if build_extras: + base_cmd += build_extras + base_cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir] + proc = subprocess.run(base_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) print(proc.stdout) return proc.returncode == 0 except FileNotFoundError: @@ -41,7 +41,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool: return False -def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int: +def _run_pytest_in_docker( + project_root: str, image_tag: str, pytest_target: str, run_extras: List[str] | None = None +) -> int: workdir = "/workspace" # Host HOME logs directory to map into container host_home = os.path.expanduser("~") @@ -73,6 +75,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) cmd += ["--user", f"{uid}:{gid}"] except Exception: pass + if run_extras: + cmd += run_extras cmd += [image_tag, "pytest", pytest_target, "-vs"] print("Running in Docker:", " ".join(cmd)) try: @@ -126,6 +130,10 @@ def local_test_command(args: argparse.Namespace) -> int: pytest_target = rel ignore_docker = bool(getattr(args, "ignore_docker", False)) + build_extras_str = getattr(args, "docker_build_extra", "") or "" + run_extras_str = getattr(args, "docker_run_extra", "") or "" + build_extras = shlex.split(build_extras_str) if build_extras_str else [] + run_extras = shlex.split(run_extras_str) if run_extras_str else [] if ignore_docker: if not pytest_target: print("Error: Failed to resolve a pytest target to run.") @@ -146,14 +154,14 @@ def local_test_command(args: argparse.Namespace) -> int: except Exception: pass image_tag = "ep-evaluator:local" - ok = _build_docker_image(dockerfiles[0], image_tag) + ok = _build_docker_image(dockerfiles[0], image_tag, build_extras=build_extras) if not ok: print("Docker build failed. See logs above.") return 1 if not pytest_target: print("Error: Failed to resolve a pytest target to run.") return 1 - return _run_pytest_in_docker(project_root, image_tag, pytest_target) + return _run_pytest_in_docker(project_root, image_tag, pytest_target, run_extras=run_extras) # No Dockerfile: run on host if not pytest_target: diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py index a2330a71..9badea3f 100644 --- a/tests/test_cli_local_test.py +++ b/tests/test_cli_local_test.py @@ -95,11 +95,11 @@ def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch): from eval_protocol.cli_commands import local_test as lt monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")]) - monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, platform=None: True) + monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, build_extras=None: True) captured = {"target": "", "image": ""} - def _fake_run_docker(root: str, image_tag: str, pytest_target: str, platform=None) -> int: + def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int: captured["target"] = pytest_target captured["image"] = image_tag return 0 @@ -143,3 +143,82 @@ def _fake_host(target: str) -> int: rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] assert rc == 0 assert called["host"] is True + + +def test_local_test_passes_docker_build_extra(tmp_path, monkeypatch): + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + test_file = project / "metric" / "test_build_extra.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") + + from eval_protocol.cli_commands import local_test as lt + + monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")]) + + captured = {"extras": None} + + def _fake_build(dockerfile, tag, build_extras=None): + captured["extras"] = build_extras + return True + + def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int: + return 0 + + monkeypatch.setattr(lt, "_build_docker_image", _fake_build) + monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker) + + # Extras string with multiple flags and equals-arg + args = SimpleNamespace( + entry=str(test_file), + ignore_docker=False, + yes=True, + docker_build_extra="--no-cache --pull --progress=plain --build-arg KEY=VAL", + docker_run_extra="", + ) + rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] + assert rc == 0 + # Expect split list preserving tokens order + assert captured["extras"] == ["--no-cache", "--pull", "--progress=plain", "--build-arg", "KEY=VAL"] + + +def test_local_test_passes_docker_run_extra(tmp_path, monkeypatch): + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + test_file = project / "metric" / "test_run_extra.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") + + from eval_protocol.cli_commands import local_test as lt + + monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")]) + monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, build_extras=None: True) + + captured = {"extras": None} + + def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int: + captured["extras"] = run_extras + return 0 + + monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker) + + args = SimpleNamespace( + entry=str(test_file), + ignore_docker=False, + yes=True, + docker_build_extra="", + docker_run_extra="--env-file .env --memory=8g --cpus=2 --add-host=host.docker.internal:host-gateway", + ) + rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] + assert rc == 0 + assert captured["extras"] == [ + "--env-file", + ".env", + "--memory=8g", + "--cpus=2", + "--add-host=host.docker.internal:host-gateway", + ] From 4a17784775fceefc6a39722c33310036e0933b44 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 11 Nov 2025 15:47:52 -0800 Subject: [PATCH 10/11] fix bug --- eval_protocol/cli_commands/local_test.py | 9 +++++-- tests/test_cli_local_test.py | 33 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py index cfb52da0..49d34190 100644 --- a/eval_protocol/cli_commands/local_test.py +++ b/eval_protocol/cli_commands/local_test.py @@ -95,11 +95,16 @@ def local_test_command(args: argparse.Namespace) -> int: entry = getattr(args, "entry", None) if entry: if "::" in entry: - file_part = entry.split("::", 1)[0] + file_part, func_part = entry.split("::", 1) file_path = ( file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part)) ) - pytest_target = entry + # Convert to project-relative like the non-:: path + try: + rel = os.path.relpath(file_path, project_root) + except Exception: + rel = file_path + pytest_target = f"{rel}::{func_part}" else: file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry)) # Use path relative to project_root when possible diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py index 9badea3f..e223d4db 100644 --- a/tests/test_cli_local_test.py +++ b/tests/test_cli_local_test.py @@ -222,3 +222,36 @@ def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=N "--cpus=2", "--add-host=host.docker.internal:host-gateway", ] + + +def test_local_test_normalizes_entry_with_selector(tmp_path, monkeypatch): + project = tmp_path / "proj" + project.mkdir() + monkeypatch.chdir(project) + + # Create a dummy test file + test_file = project / "metric" / "test_sel_abs.py" + test_file.parent.mkdir(parents=True, exist_ok=True) + test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") + + abs_entry = f"{str(test_file)}::test_dummy" + + from eval_protocol.cli_commands import local_test as lt + + # Avoid Docker path + monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: []) + + captured = {"target": ""} + + def _fake_host(target: str) -> int: + captured["target"] = target + return 0 + + monkeypatch.setattr(lt, "_run_pytest_host", _fake_host) + + args = SimpleNamespace(entry=abs_entry, ignore_docker=False, yes=True) + rc = lt.local_test_command(args) # pyright: ignore[reportArgumentType] + assert rc == 0 + # Expect project-relative path plus selector + rel = os.path.relpath(str(test_file), str(project)) + assert captured["target"] == f"{rel}::test_dummy" From 9b476dca8320e138079f624cc4b1c4ddd6fe6b03 Mon Sep 17 00:00:00 2001 From: Derek Xu Date: Tue, 11 Nov 2025 15:56:57 -0800 Subject: [PATCH 11/11] test fix --- tests/test_cli_local_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py index e223d4db..6ab0b14e 100644 --- a/tests/test_cli_local_test.py +++ b/tests/test_cli_local_test.py @@ -123,12 +123,11 @@ def test_local_test_selector_single_test(tmp_path, monkeypatch): test_file.write_text("def test_dummy():\n assert True\n", encoding="utf-8") from eval_protocol.cli_commands import local_test as lt - from eval_protocol.cli_commands import upload as up # No entry; force discover + selector disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file)) monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc]) - monkeypatch.setattr(up, "_prompt_select", lambda tests, non_interactive=False: tests[:1]) + monkeypatch.setattr(lt, "_prompt_select", lambda tests, non_interactive=False: tests[:1]) monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: []) called = {"host": False}