From cd9cc91c34f975482fe05b4bf3a60b4a0bcbd746 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 10 Nov 2025 17:41:27 -0800
Subject: [PATCH 01/11] local test command

---
 eval_protocol/cli.py                     |  25 ++++
 eval_protocol/cli_commands/local_test.py | 140 ++++++++++++++++++++++
 tests/test_cli_local_test.py             | 145 +++++++++++++++++++++++
 3 files changed, 310 insertions(+)
 create mode 100644 eval_protocol/cli_commands/local_test.py
 create mode 100644 tests/test_cli_local_test.py

diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py
index 30ac1ad5..4e7ac8da 100644
--- a/eval_protocol/cli.py
+++ b/eval_protocol/cli.py
@@ -427,6 +427,27 @@ def parse_args(args=None):
     rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
     rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")
 
+    # Local test command
+    local_test_parser = subparsers.add_parser(
+        "local-test",
+        help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.",
+    )
+    local_test_parser.add_argument(
+        "--entry",
+        help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).",
+    )
+    local_test_parser.add_argument(
+        "--ignore-docker",
+        action="store_true",
+        help="Ignore Dockerfile even if present; run pytest on host",
+    )
+    local_test_parser.add_argument(
+        "--yes",
+        "-y",
+        action="store_true",
+        help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
+    )
+
     # Run command (for Hydra-based evaluations)
     # This subparser intentionally defines no arguments itself.
     # All arguments after 'run' will be passed to Hydra by parse_known_args.
@@ -559,6 +580,10 @@ def _extract_flag_value(argv_list, flag_name):
             return create_rft_command(args)
         print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
         return 1
+    elif args.command == "local-test":
+        from .cli_commands.local_test import local_test_command
+
+        return local_test_command(args)
     elif args.command == "run":
         # For the 'run' command, Hydra takes over argument parsing.
 
diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
new file mode 100644
index 00000000..a877be7d
--- /dev/null
+++ b/eval_protocol/cli_commands/local_test.py
@@ -0,0 +1,140 @@
+import argparse
+import os
+import subprocess
+import sys
+from typing import List
+
+from .upload import _discover_tests, _prompt_select
+
+
+def _find_dockerfiles(root: str) -> List[str]:
+    skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
+    dockerfiles: List[str] = []
+    for dirpath, dirnames, filenames in os.walk(root):
+        dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
+        for name in filenames:
+            if name == "Dockerfile":
+                dockerfiles.append(os.path.join(dirpath, name))
+    return dockerfiles
+
+
+def _run_pytest_host(pytest_target: str) -> int:
+    print(f"Running locally: pytest {pytest_target} -vs")
+    proc = subprocess.run([sys.executable, "-m", "pytest", pytest_target, "-vs"])
+    return proc.returncode
+
+
+def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
+    context_dir = os.path.dirname(dockerfile_path)
+    print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
+    try:
+        proc = subprocess.run(
+            ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+        print(proc.stdout)
+        return proc.returncode == 0
+    except FileNotFoundError:
+        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
+        return False
+
+
+def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
+    workdir = "/workspace"
+    # Mount read-only is safer; but tests may write artifacts. Use read-write.
+    cmd = [
+        "docker",
+        "run",
+        "--rm",
+        "-v",
+        f"{project_root}:{workdir}",
+        "-w",
+        workdir,
+        image_tag,
+        "pytest",
+        pytest_target,
+        "-vs",
+    ]
+    print("Running in Docker:", " ".join(cmd))
+    try:
+        proc = subprocess.run(cmd)
+        return proc.returncode
+    except FileNotFoundError:
+        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
+        return 1
+
+
+def local_test_command(args: argparse.Namespace) -> int:
+    project_root = os.getcwd()
+
+    # Selection and pytest target resolution
+    pytest_target: str = ""
+    entry = getattr(args, "entry", None)
+    if entry:
+        if "::" in entry:
+            file_part = entry.split("::", 1)[0]
+            file_path = (
+                file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
+            )
+            pytest_target = entry
+        else:
+            file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
+            # Use path relative to project_root when possible
+            try:
+                rel = os.path.relpath(file_path, project_root)
+            except Exception:
+                rel = file_path
+            pytest_target = rel
+    else:
+        tests = _discover_tests(project_root)
+        if not tests:
+            print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.")
+            return 1
+        non_interactive = bool(getattr(args, "yes", False))
+        selected = _prompt_select(tests, non_interactive=non_interactive)
+        if not selected:
+            print("No tests selected.")
+            return 1
+        if len(selected) != 1:
+            print("Error: Please select exactly one evaluation test for 'local-test'.")
+            return 1
+        chosen = selected[0]
+        abs_path = os.path.abspath(chosen.file_path)
+        try:
+            rel = os.path.relpath(abs_path, project_root)
+        except Exception:
+            rel = abs_path
+        pytest_target = rel
+
+    ignore_docker = bool(getattr(args, "ignore_docker", False))
+    if ignore_docker:
+        if not pytest_target:
+            print("Error: Failed to resolve a pytest target to run.")
+            return 1
+        return _run_pytest_host(pytest_target)
+
+    dockerfiles = _find_dockerfiles(project_root)
+    if len(dockerfiles) > 1:
+        print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.")
+        for df in dockerfiles:
+            print(f" - {df}")
+        print("Hint: use --ignore-docker to bypass Docker.")
+        return 1
+    if len(dockerfiles) == 1:
+        image_tag = "ep-evaluator:local"
+        ok = _build_docker_image(dockerfiles[0], image_tag)
+        if not ok:
+            print("Docker build failed. See logs above.")
+            return 1
+        if not pytest_target:
+            print("Error: Failed to resolve a pytest target to run.")
+            return 1
+        return _run_pytest_in_docker(project_root, image_tag, pytest_target)
+
+    # No Dockerfile: run on host
+    if not pytest_target:
+        print("Error: Failed to resolve a pytest target to run.")
+        return 1
+    return _run_pytest_host(pytest_target)
diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py
new file mode 100644
index 00000000..0763135f
--- /dev/null
+++ b/tests/test_cli_local_test.py
@@ -0,0 +1,145 @@
+import os
+from types import SimpleNamespace
+
+import pytest
+
+
+def test_local_test_runs_host_pytest_with_entry(tmp_path, monkeypatch):
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    # Create a dummy test file
+    test_file = project / "metric" / "test_one.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    # Import module under test
+    from eval_protocol.cli_commands import local_test as lt
+
+    # Avoid Docker path
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
+
+    captured = {"target": ""}
+
+    def _fake_host(target: str) -> int:
+        captured["target"] = target
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
+
+    args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 0
+    # Expect relative path target
+    assert captured["target"] == os.path.relpath(str(test_file), str(project))
+
+
+def test_local_test_ignores_docker_when_flag_set(tmp_path, monkeypatch):
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    test_file = project / "metric" / "test_two.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    from eval_protocol.cli_commands import local_test as lt
+
+    # Pretend we have Dockerfile(s), but ignore_docker=True should skip
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
+
+    called = {"host": False}
+
+    def _fake_host(target: str) -> int:
+        called["host"] = True
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
+
+    args = SimpleNamespace(entry=str(test_file), ignore_docker=True, yes=True)
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 0
+    assert called["host"] is True
+
+
+def test_local_test_errors_on_multiple_dockerfiles(tmp_path, monkeypatch):
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    test_file = project / "metric" / "test_three.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    from eval_protocol.cli_commands import local_test as lt
+
+    monkeypatch.setattr(
+        lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile"), str(project / "another" / "Dockerfile")]
+    )
+
+    args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 1
+
+
+def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    test_file = project / "metric" / "test_four.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    from eval_protocol.cli_commands import local_test as lt
+
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
+    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag: True)
+
+    captured = {"target": "", "image": ""}
+
+    def _fake_run_docker(root: str, image_tag: str, pytest_target: str) -> int:
+        captured["target"] = pytest_target
+        captured["image"] = image_tag
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)
+
+    args = SimpleNamespace(entry=str(test_file), ignore_docker=False, yes=True)
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 0
+    assert captured["image"] == "ep-evaluator:local"
+    assert captured["target"] == os.path.relpath(str(test_file), str(project))
+
+
+def test_local_test_selector_single_test(tmp_path, monkeypatch):
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    test_file = project / "metric" / "test_sel.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    from eval_protocol.cli_commands import local_test as lt
+    from eval_protocol.cli_commands import upload as up
+
+    # No entry; force discover + selector
+    disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file))
+    monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc])
+    monkeypatch.setattr(up, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
+
+    called = {"host": False}
+
+    def _fake_host(target: str) -> int:
+        called["host"] = True
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
+
+    args = SimpleNamespace(entry=None, ignore_docker=False, yes=True)
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 0
+    assert called["host"] is True

From e7615d7ec75524b19ed38241d1c6165cf32dd79f Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 10 Nov 2025 18:00:39 -0800
Subject: [PATCH 02/11] mount for ep logs

---
 eval_protocol/cli_commands/local_test.py | 19 +++++++++++++++----
 eval_protocol/cli_commands/upload.py     |  4 ++--
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index a877be7d..5c0132c3 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -50,13 +50,19 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
         "--rm",
         "-v",
         f"{project_root}:{workdir}",
+        "-e",
+        f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol",
         "-w",
         workdir,
-        image_tag,
-        "pytest",
-        pytest_target,
-        "-vs",
     ]
+    # Try to match host user to avoid permission problems on mounted volume
+    try:
+        uid = os.getuid()  # type: ignore[attr-defined]
+        gid = os.getgid()  # type: ignore[attr-defined]
+        cmd += ["--user", f"{uid}:{gid}"]
+    except Exception:
+        pass
+    cmd += [image_tag, "pytest", pytest_target, "-vs"]
     print("Running in Docker:", " ".join(cmd))
     try:
         proc = subprocess.run(cmd)
@@ -123,6 +129,11 @@ def local_test_command(args: argparse.Namespace) -> int:
         print("Hint: use --ignore-docker to bypass Docker.")
         return 1
     if len(dockerfiles) == 1:
+        # Ensure shared logs directory exists on host so container writes are visible to host ep logs
+        try:
+            os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True)
+        except Exception:
+            pass
         image_tag = "ep-evaluator:local"
         ok = _build_docker_image(dockerfiles[0], image_tag)
         if not ok:
diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py
index 51283b23..8c6e7baf 100644
--- a/eval_protocol/cli_commands/upload.py
+++ b/eval_protocol/cli_commands/upload.py
@@ -437,7 +437,7 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe
         # Check if only one test - auto-select it
         if len(tests) == 1:
             print(f"\nFound 1 test: {_format_test_choice(tests[0], 1)}")
-            confirm = questionary.confirm("Upload this test?", default=True, style=custom_style).ask()
+            confirm = questionary.confirm("Select this test?", default=True, style=custom_style).ask()
             if confirm:
                 return tests
             else:
@@ -500,7 +500,7 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]
 
     print("=" * 80)
     try:
-        choice = input("Enter the number to upload: ").strip()
+        choice = input("Enter the number to select: ").strip()
     except KeyboardInterrupt:
         print("\n\nUpload cancelled.")
         return []

From 72b9178c97f8666f3a77b9ad33316af7952bcc1b Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 10 Nov 2025 18:09:52 -0800
Subject: [PATCH 03/11] set HOME to the workspace dir in docker run

---
 eval_protocol/cli_commands/local_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index 5c0132c3..ad350389 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -51,6 +51,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
         "-v",
         f"{project_root}:{workdir}",
         "-e",
+        f"HOME={workdir}",
+        "-e",
         f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol",
         "-w",
         workdir,

From 2907cf8ed922bc35fb0592b37e05b6704aaa4199 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Mon, 10 Nov 2025 18:34:55 -0800
Subject: [PATCH 04/11] try to force linux/amd64

---
 eval_protocol/cli_commands/local_test.py | 20 +++++++++++++++-----
 tests/test_cli_local_test.py             |  4 ++--
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index ad350389..a5f1e04a 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -24,12 +24,16 @@ def _run_pytest_host(pytest_target: str) -> int:
     return proc.returncode
 
 
-def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
+def _build_docker_image(dockerfile_path: str, image_tag: str, platform: str | None = None) -> bool:
     context_dir = os.path.dirname(dockerfile_path)
     print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
     try:
+        cmd = ["docker", "build"]
+        if platform:
+            cmd += ["--platform", platform]
+        cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir]
         proc = subprocess.run(
-            ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
+            cmd,
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             text=True,
@@ -41,7 +45,7 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
         return False
 
 
-def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
+def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str, platform: str | None = None) -> int:
     workdir = "/workspace"
     # Mount read-only is safer; but tests may write artifacts. Use read-write.
     cmd = [
@@ -57,6 +61,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
         "-w",
         workdir,
     ]
+    if platform:
+        cmd += ["--platform", platform]
     # Try to match host user to avoid permission problems on mounted volume
     try:
         uid = os.getuid()  # type: ignore[attr-defined]
@@ -136,15 +142,19 @@ def local_test_command(args: argparse.Namespace) -> int:
             os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True)
         except Exception:
             pass
+        # Choose platform to emulate Linux host (default to amd64 on macOS, override with EP_DOCKER_PLATFORM)
+        selected_platform = os.environ.get("EP_DOCKER_PLATFORM")
+        if not selected_platform and sys.platform == "darwin":
+            selected_platform = "linux/amd64"
         image_tag = "ep-evaluator:local"
-        ok = _build_docker_image(dockerfiles[0], image_tag)
+        ok = _build_docker_image(dockerfiles[0], image_tag, platform=selected_platform)
         if not ok:
             print("Docker build failed. See logs above.")
             return 1
         if not pytest_target:
             print("Error: Failed to resolve a pytest target to run.")
             return 1
-        return _run_pytest_in_docker(project_root, image_tag, pytest_target)
+        return _run_pytest_in_docker(project_root, image_tag, pytest_target, platform=selected_platform)
 
     # No Dockerfile: run on host
     if not pytest_target:
diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py
index 0763135f..a2330a71 100644
--- a/tests/test_cli_local_test.py
+++ b/tests/test_cli_local_test.py
@@ -95,11 +95,11 @@ def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
     from eval_protocol.cli_commands import local_test as lt
 
     monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
-    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag: True)
+    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, platform=None: True)
 
     captured = {"target": "", "image": ""}
 
-    def _fake_run_docker(root: str, image_tag: str, pytest_target: str) -> int:
+    def _fake_run_docker(root: str, image_tag: str, pytest_target: str, platform=None) -> int:
         captured["target"] = pytest_target
         captured["image"] = image_tag
         return 0

From 4f1ff85d281117c408d8fe139c86015e12f4b05e Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 11 Nov 2025 11:36:21 -0800
Subject: [PATCH 05/11] revert forced linux/amd64 platform and HOME override

---
 eval_protocol/cli_commands/local_test.py | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index a5f1e04a..5c0132c3 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -24,16 +24,12 @@ def _run_pytest_host(pytest_target: str) -> int:
     return proc.returncode
 
 
-def _build_docker_image(dockerfile_path: str, image_tag: str, platform: str | None = None) -> bool:
+def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
     context_dir = os.path.dirname(dockerfile_path)
     print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
     try:
-        cmd = ["docker", "build"]
-        if platform:
-            cmd += ["--platform", platform]
-        cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir]
         proc = subprocess.run(
-            cmd,
+            ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
             stdout=subprocess.PIPE,
             stderr=subprocess.STDOUT,
             text=True,
@@ -45,7 +41,7 @@ def _build_docker_image(dockerfile_path: str, image_tag: str, platform: str | No
         return False
 
 
-def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str, platform: str | None = None) -> int:
+def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
     workdir = "/workspace"
     # Mount read-only is safer; but tests may write artifacts. Use read-write.
     cmd = [
@@ -55,14 +51,10 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str,
         "-v",
         f"{project_root}:{workdir}",
         "-e",
-        f"HOME={workdir}",
-        "-e",
         f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol",
         "-w",
         workdir,
     ]
-    if platform:
-        cmd += ["--platform", platform]
     # Try to match host user to avoid permission problems on mounted volume
     try:
         uid = os.getuid()  # type: ignore[attr-defined]
@@ -142,19 +134,15 @@ def local_test_command(args: argparse.Namespace) -> int:
             os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True)
         except Exception:
             pass
-        # Choose platform to emulate Linux host (default to amd64 on macOS, override with EP_DOCKER_PLATFORM)
-        selected_platform = os.environ.get("EP_DOCKER_PLATFORM")
-        if not selected_platform and sys.platform == "darwin":
-            selected_platform = "linux/amd64"
         image_tag = "ep-evaluator:local"
-        ok = _build_docker_image(dockerfiles[0], image_tag, platform=selected_platform)
+        ok = _build_docker_image(dockerfiles[0], image_tag)
         if not ok:
             print("Docker build failed. See logs above.")
             return 1
         if not pytest_target:
             print("Error: Failed to resolve a pytest target to run.")
             return 1
-        return _run_pytest_in_docker(project_root, image_tag, pytest_target, platform=selected_platform)
+        return _run_pytest_in_docker(project_root, image_tag, pytest_target)
 
     # No Dockerfile: run on host
     if not pytest_target:

From 99169abe8d28e3a399987c25f57f56f1f2c56da5 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 11 Nov 2025 11:43:48 -0800
Subject: [PATCH 06/11] set home

---
 eval_protocol/cli_commands/local_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index 5c0132c3..ad350389 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -51,6 +51,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
         "-v",
         f"{project_root}:{workdir}",
         "-e",
+        f"HOME={workdir}",
+        "-e",
         f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol",
         "-w",
         workdir,

From 75d4cb662b07405512702d3aeefacc6da7efeb4d Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 11 Nov 2025 13:06:53 -0800
Subject: [PATCH 07/11] mount host ~/.eval_protocol into container home

---
 eval_protocol/cli_commands/local_test.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index ad350389..eeab7095 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -43,6 +43,13 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
 
 def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
     workdir = "/workspace"
+    # Host HOME logs directory to map into container
+    host_home = os.path.expanduser("~")
+    host_logs_dir = os.path.join(host_home, ".eval_protocol")
+    try:
+        os.makedirs(host_logs_dir, exist_ok=True)
+    except Exception:
+        pass
     # Mount read-only is safer; but tests may write artifacts. Use read-write.
     cmd = [
         "docker",
@@ -50,10 +57,12 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
         "--rm",
         "-v",
         f"{project_root}:{workdir}",
+        "-v",
+        f"{host_logs_dir}:/container_home/.eval_protocol",
         "-e",
-        f"HOME={workdir}",
+        "HOME=/container_home",
         "-e",
-        f"EVAL_PROTOCOL_DIR={workdir}/.eval_protocol",
+        "EVAL_PROTOCOL_DIR=/container_home/.eval_protocol",
         "-w",
         workdir,
     ]
@@ -131,9 +140,9 @@ def local_test_command(args: argparse.Namespace) -> int:
         print("Hint: use --ignore-docker to bypass Docker.")
         return 1
     if len(dockerfiles) == 1:
-        # Ensure shared logs directory exists on host so container writes are visible to host ep logs
+        # Ensure host home logs directory exists so container writes are visible to host ep logs
         try:
-            os.makedirs(os.path.join(project_root, ".eval_protocol"), exist_ok=True)
+            os.makedirs(os.path.join(os.path.expanduser("~"), ".eval_protocol"), exist_ok=True)
         except Exception:
             pass
         image_tag = "ep-evaluator:local"

From 41b79daeafe6bcb53a8a3183738314596874696d Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 11 Nov 2025 15:13:03 -0800
Subject: [PATCH 08/11] store in utc

---
 eval_protocol/models.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/eval_protocol/models.py b/eval_protocol/models.py
index 6ec94210..67d287ba 100644
--- a/eval_protocol/models.py
+++ b/eval_protocol/models.py
@@ -1,7 +1,7 @@
 import os
 import logging
 import importlib
-from datetime import datetime
+from datetime import datetime, timezone
 from enum import Enum
 from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union
 
@@ -825,7 +825,10 @@ class EvaluationRow(BaseModel):
         description="Metadata about the execution of the evaluation.",
     )
 
-    created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the row was created.")
+    created_at: datetime = Field(
+        default_factory=lambda: datetime.now(timezone.utc),
+        description="The timestamp when the row was created (UTC).",
+    )
 
     eval_metadata: Optional[EvalMetadata] = Field(
         default=None, description="Metadata about the evaluation that was run."

From 5eb5fac9069b59e619d4bf683e8db185ef390c05 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 11 Nov 2025 15:43:12 -0800
Subject: [PATCH 09/11] add --docker-build-extra/--docker-run-extra flags and tests

---
 eval_protocol/cli.py                     | 10 +++
 eval_protocol/cli_commands/local_test.py | 28 +++++---
 tests/test_cli_local_test.py             | 83 +++++++++++++++++++++++-
 3 files changed, 109 insertions(+), 12 deletions(-)

diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py
index 4e7ac8da..7fe78232 100644
--- a/eval_protocol/cli.py
+++ b/eval_protocol/cli.py
@@ -447,6 +447,16 @@ def parse_args(args=None):
         action="store_true",
         help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
     )
+    local_test_parser.add_argument(
+        "--docker-build-extra",
+        default="",
+        help="Extra flags to pass to 'docker build' (quoted string, e.g. \"--no-cache --pull --progress=plain\")",
+    )
+    local_test_parser.add_argument(
+        "--docker-run-extra",
+        default="",
+        help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")",
+    )
 
     # Run command (for Hydra-based evaluations)
     # This subparser intentionally defines no arguments itself.
diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index eeab7095..cfb52da0 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -2,6 +2,7 @@
 import os
 import subprocess
 import sys
+import shlex
 from typing import List
 
 from .upload import _discover_tests, _prompt_select
@@ -24,16 +25,15 @@ def _run_pytest_host(pytest_target: str) -> int:
     return proc.returncode
 
 
-def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
+def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List[str] | None = None) -> bool:
     context_dir = os.path.dirname(dockerfile_path)
     print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
     try:
-        proc = subprocess.run(
-            ["docker", "build", "-t", image_tag, "-f", dockerfile_path, context_dir],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.STDOUT,
-            text=True,
-        )
+        base_cmd = ["docker", "build"]
+        if build_extras:
+            base_cmd += build_extras
+        base_cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir]
+        proc = subprocess.run(base_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
         print(proc.stdout)
         return proc.returncode == 0
     except FileNotFoundError:
@@ -41,7 +41,9 @@ def _build_docker_image(dockerfile_path: str, image_tag: str) -> bool:
         return False
 
 
-def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str) -> int:
+def _run_pytest_in_docker(
+    project_root: str, image_tag: str, pytest_target: str, run_extras: List[str] | None = None
+) -> int:
     workdir = "/workspace"
     # Host HOME logs directory to map into container
     host_home = os.path.expanduser("~")
@@ -73,6 +75,8 @@ def _run_pytest_in_docker(project_root: str, image_tag: str, pytest_target: str)
         cmd += ["--user", f"{uid}:{gid}"]
     except Exception:
         pass
+    if run_extras:
+        cmd += run_extras
     cmd += [image_tag, "pytest", pytest_target, "-vs"]
     print("Running in Docker:", " ".join(cmd))
     try:
@@ -126,6 +130,10 @@ def local_test_command(args: argparse.Namespace) -> int:
         pytest_target = rel
 
     ignore_docker = bool(getattr(args, "ignore_docker", False))
+    build_extras_str = getattr(args, "docker_build_extra", "") or ""
+    run_extras_str = getattr(args, "docker_run_extra", "") or ""
+    build_extras = shlex.split(build_extras_str) if build_extras_str else []
+    run_extras = shlex.split(run_extras_str) if run_extras_str else []
     if ignore_docker:
         if not pytest_target:
             print("Error: Failed to resolve a pytest target to run.")
@@ -146,14 +154,14 @@ def local_test_command(args: argparse.Namespace) -> int:
         except Exception:
             pass
         image_tag = "ep-evaluator:local"
-        ok = _build_docker_image(dockerfiles[0], image_tag)
+        ok = _build_docker_image(dockerfiles[0], image_tag, build_extras=build_extras)
         if not ok:
             print("Docker build failed. See logs above.")
             return 1
         if not pytest_target:
             print("Error: Failed to resolve a pytest target to run.")
             return 1
-        return _run_pytest_in_docker(project_root, image_tag, pytest_target)
+        return _run_pytest_in_docker(project_root, image_tag, pytest_target, run_extras=run_extras)
 
     # No Dockerfile: run on host
     if not pytest_target:
diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py
index a2330a71..9badea3f 100644
--- a/tests/test_cli_local_test.py
+++ b/tests/test_cli_local_test.py
@@ -95,11 +95,11 @@ def test_local_test_builds_and_runs_in_docker(tmp_path, monkeypatch):
     from eval_protocol.cli_commands import local_test as lt
 
     monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
-    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, platform=None: True)
+    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, build_extras=None: True)
 
     captured = {"target": "", "image": ""}
 
-    def _fake_run_docker(root: str, image_tag: str, pytest_target: str, platform=None) -> int:
+    def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
         captured["target"] = pytest_target
         captured["image"] = image_tag
         return 0
@@ -143,3 +143,82 @@ def _fake_host(target: str) -> int:
     rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
     assert rc == 0
     assert called["host"] is True
+
+
+def test_local_test_passes_docker_build_extra(tmp_path, monkeypatch):
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    test_file = project / "metric" / "test_build_extra.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    from eval_protocol.cli_commands import local_test as lt
+
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
+
+    captured = {"extras": None}
+
+    def _fake_build(dockerfile, tag, build_extras=None):
+        captured["extras"] = build_extras
+        return True
+
+    def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
+        return 0
+
+    monkeypatch.setattr(lt, "_build_docker_image", _fake_build)
+    monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)
+
+    # Extras string mixing bare flags, a --flag=value form, and a flag with a separate value
+    args = SimpleNamespace(
+        entry=str(test_file),
+        ignore_docker=False,
+        yes=True,
+        docker_build_extra="--no-cache --pull --progress=plain --build-arg KEY=VAL",
+        docker_run_extra="",
+    )
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 0
+    # Expect the extras string split into a list, with token order preserved
+    assert captured["extras"] == ["--no-cache", "--pull", "--progress=plain", "--build-arg", "KEY=VAL"]
+
+
+def test_local_test_passes_docker_run_extra(tmp_path, monkeypatch):
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    test_file = project / "metric" / "test_run_extra.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    from eval_protocol.cli_commands import local_test as lt
+
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [str(project / "Dockerfile")])
+    monkeypatch.setattr(lt, "_build_docker_image", lambda dockerfile, tag, build_extras=None: True)
+
+    captured = {"extras": None}
+
+    def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=None) -> int:
+        captured["extras"] = run_extras
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_in_docker", _fake_run_docker)
+
+    args = SimpleNamespace(
+        entry=str(test_file),
+        ignore_docker=False,
+        yes=True,
+        docker_build_extra="",
+        docker_run_extra="--env-file .env --memory=8g --cpus=2 --add-host=host.docker.internal:host-gateway",
+    )
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 0
+    assert captured["extras"] == [
+        "--env-file",
+        ".env",
+        "--memory=8g",
+        "--cpus=2",
+        "--add-host=host.docker.internal:host-gateway",
+    ]

From 4a17784775fceefc6a39722c33310036e0933b44 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 11 Nov 2025 15:47:52 -0800
Subject: [PATCH 10/11] local-test: make path::function entries project-relative

---
 eval_protocol/cli_commands/local_test.py |  9 +++++--
 tests/test_cli_local_test.py             | 33 ++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/eval_protocol/cli_commands/local_test.py b/eval_protocol/cli_commands/local_test.py
index cfb52da0..49d34190 100644
--- a/eval_protocol/cli_commands/local_test.py
+++ b/eval_protocol/cli_commands/local_test.py
@@ -95,11 +95,16 @@ def local_test_command(args: argparse.Namespace) -> int:
     entry = getattr(args, "entry", None)
     if entry:
         if "::" in entry:
-            file_part = entry.split("::", 1)[0]
+            file_part, func_part = entry.split("::", 1)
             file_path = (
                 file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
             )
-            pytest_target = entry
+            # Make the file part project-relative, as the branch for entries without "::" does
+            try:
+                rel = os.path.relpath(file_path, project_root)
+            except Exception:
+                rel = file_path
+            pytest_target = f"{rel}::{func_part}"
         else:
             file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
             # Use path relative to project_root when possible
diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py
index 9badea3f..e223d4db 100644
--- a/tests/test_cli_local_test.py
+++ b/tests/test_cli_local_test.py
@@ -222,3 +222,36 @@ def _fake_run_docker(root: str, image_tag: str, pytest_target: str, run_extras=N
         "--cpus=2",
         "--add-host=host.docker.internal:host-gateway",
     ]
+
+
+def test_local_test_normalizes_entry_with_selector(tmp_path, monkeypatch):
+    project = tmp_path / "proj"
+    project.mkdir()
+    monkeypatch.chdir(project)
+
+    # Create a dummy test file
+    test_file = project / "metric" / "test_sel_abs.py"
+    test_file.parent.mkdir(parents=True, exist_ok=True)
+    test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
+
+    abs_entry = f"{str(test_file)}::test_dummy"
+
+    from eval_protocol.cli_commands import local_test as lt
+
+    # Avoid Docker path
+    monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
+
+    captured = {"target": ""}
+
+    def _fake_host(target: str) -> int:
+        captured["target"] = target
+        return 0
+
+    monkeypatch.setattr(lt, "_run_pytest_host", _fake_host)
+
+    args = SimpleNamespace(entry=abs_entry, ignore_docker=False, yes=True)
+    rc = lt.local_test_command(args)  # pyright: ignore[reportArgumentType]
+    assert rc == 0
+    # Expect project-relative path plus selector
+    rel = os.path.relpath(str(test_file), str(project))
+    assert captured["target"] == f"{rel}::test_dummy"

From 9b476dca8320e138079f624cc4b1c4ddd6fe6b03 Mon Sep 17 00:00:00 2001
From: Derek Xu <xzrderek@gmail.com>
Date: Tue, 11 Nov 2025 15:56:57 -0800
Subject: [PATCH 11/11] tests: patch _prompt_select on local_test, not upload

---
 tests/test_cli_local_test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/test_cli_local_test.py b/tests/test_cli_local_test.py
index e223d4db..6ab0b14e 100644
--- a/tests/test_cli_local_test.py
+++ b/tests/test_cli_local_test.py
@@ -123,12 +123,11 @@ def test_local_test_selector_single_test(tmp_path, monkeypatch):
     test_file.write_text("def test_dummy():\n    assert True\n", encoding="utf-8")
 
     from eval_protocol.cli_commands import local_test as lt
-    from eval_protocol.cli_commands import upload as up
 
     # No entry; force discover + selector
     disc = SimpleNamespace(qualname="metric.test_sel", file_path=str(test_file))
     monkeypatch.setattr(lt, "_discover_tests", lambda root: [disc])
-    monkeypatch.setattr(up, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
+    monkeypatch.setattr(lt, "_prompt_select", lambda tests, non_interactive=False: tests[:1])
     monkeypatch.setattr(lt, "_find_dockerfiles", lambda root: [])
 
     called = {"host": False}