Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 35 additions & 0 deletions eval_protocol/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,6 +427,37 @@ def parse_args(args=None):
rft_parser.add_argument("--dry-run", action="store_true", help="Print planned REST calls without sending")
rft_parser.add_argument("--force", action="store_true", help="Overwrite existing evaluator with the same ID")

# Local test command
local_test_parser = subparsers.add_parser(
"local-test",
help="Select an evaluation test and run it locally. If a Dockerfile exists, build and run via Docker; otherwise run on host.",
)
local_test_parser.add_argument(
"--entry",
help="Entrypoint to run (path::function or path). If not provided, a selector will be shown (unless --yes).",
)
local_test_parser.add_argument(
"--ignore-docker",
action="store_true",
help="Ignore Dockerfile even if present; run pytest on host",
)
local_test_parser.add_argument(
"--yes",
"-y",
action="store_true",
help="Non-interactive: if multiple tests exist and no --entry, fails with guidance",
)
local_test_parser.add_argument(
"--docker-build-extra",
default="",
help="Extra flags to pass to 'docker build' (quoted string, e.g. \"--no-cache --pull --progress=plain\")",
)
local_test_parser.add_argument(
"--docker-run-extra",
default="",
help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")",
)

# Run command (for Hydra-based evaluations)
# This subparser intentionally defines no arguments itself.
# All arguments after 'run' will be passed to Hydra by parse_known_args.
Expand Down Expand Up @@ -559,6 +590,10 @@ def _extract_flag_value(argv_list, flag_name):
return create_rft_command(args)
print("Error: missing subcommand for 'create'. Try: eval-protocol create rft")
return 1
elif args.command == "local-test":
from .cli_commands.local_test import local_test_command

return local_test_command(args)
elif args.command == "run":
# For the 'run' command, Hydra takes over argument parsing.

Expand Down
175 changes: 175 additions & 0 deletions eval_protocol/cli_commands/local_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
import argparse
import os
import subprocess
import sys
import shlex
from typing import List

from .upload import _discover_tests, _prompt_select


def _find_dockerfiles(root: str) -> List[str]:
skip_dirs = {".venv", "venv", "node_modules", "dist", "build", "__pycache__", ".git", "vendor"}
dockerfiles: List[str] = []
for dirpath, dirnames, filenames in os.walk(root):
dirnames[:] = [d for d in dirnames if d not in skip_dirs and not d.startswith(".")]
for name in filenames:
if name == "Dockerfile":
dockerfiles.append(os.path.join(dirpath, name))
return dockerfiles


def _run_pytest_host(pytest_target: str) -> int:
    """Run pytest against *pytest_target* on the host interpreter.

    Returns the pytest process exit code (0 on success).
    """
    command = [sys.executable, "-m", "pytest", pytest_target, "-vs"]
    print(f"Running locally: pytest {pytest_target} -vs")
    return subprocess.run(command).returncode


def _build_docker_image(dockerfile_path: str, image_tag: str, build_extras: List[str] | None = None) -> bool:
    """Build a Docker image from *dockerfile_path* tagged as *image_tag*.

    The build context is the directory containing the Dockerfile.
    *build_extras* are extra CLI flags inserted before the standard
    ``-t``/``-f`` arguments (e.g. ``--no-cache``).

    Returns True when the build succeeds, False on build failure or when
    the ``docker`` binary is not installed.
    """
    context_dir = os.path.dirname(dockerfile_path)
    print(f"Building Docker image '{image_tag}' from {dockerfile_path} ...")
    cmd = ["docker", "build"]
    if build_extras:
        cmd += build_extras
    cmd += ["-t", image_tag, "-f", dockerfile_path, context_dir]
    try:
        # Stream build output directly to the terminal instead of capturing
        # it with PIPE: previously a long build printed nothing until it
        # finished (appearing hung) and held the entire log in memory.
        proc = subprocess.run(cmd)
    except FileNotFoundError:
        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
        return False
    return proc.returncode == 0


def _run_pytest_in_docker(
    project_root: str, image_tag: str, pytest_target: str, run_extras: List[str] | None = None
) -> int:
    """Run pytest for *pytest_target* inside a container of *image_tag*.

    The project root is bind-mounted at ``/workspace`` and the host's
    ``~/.eval_protocol`` directory is mapped into the container so logs
    written there remain visible on the host.

    Returns the container's exit code, or 1 when docker is not installed.
    """
    workdir = "/workspace"
    # Ensure the host log directory exists before bind-mounting it.
    logs_dir = os.path.join(os.path.expanduser("~"), ".eval_protocol")
    try:
        os.makedirs(logs_dir, exist_ok=True)
    except Exception:
        pass
    # Mounting read-only would be safer, but tests may write artifacts,
    # so the workspace is mounted read-write.
    docker_cmd = ["docker", "run", "--rm"]
    docker_cmd += ["-v", f"{project_root}:{workdir}"]
    docker_cmd += ["-v", f"{logs_dir}:/container_home/.eval_protocol"]
    docker_cmd += ["-e", "HOME=/container_home"]
    docker_cmd += ["-e", "EVAL_PROTOCOL_DIR=/container_home/.eval_protocol"]
    docker_cmd += ["-w", workdir]
    # Run as the host user when possible so files created on the mounted
    # volume are not root-owned (os.getuid/getgid are absent on Windows).
    try:
        docker_cmd += ["--user", f"{os.getuid()}:{os.getgid()}"]  # type: ignore[attr-defined]
    except Exception:
        pass
    if run_extras:
        docker_cmd += run_extras
    docker_cmd += [image_tag, "pytest", pytest_target, "-vs"]
    print("Running in Docker:", " ".join(docker_cmd))
    try:
        return subprocess.run(docker_cmd).returncode
    except FileNotFoundError:
        print("Error: docker not found in PATH. Install Docker or use --ignore-docker.")
        return 1


def _relative_to_root(path: str, project_root: str) -> str:
    """Return *path* relative to *project_root* when possible, else unchanged.

    os.path.relpath can raise (e.g. paths on different Windows drives);
    in that case the absolute path is used as-is.
    """
    try:
        return os.path.relpath(path, project_root)
    except Exception:
        return path


def local_test_command(args: argparse.Namespace) -> int:
    """Run a single evaluation test locally, via Docker when a Dockerfile exists.

    Resolves a pytest target from ``--entry`` (``path`` or ``path::function``)
    or by interactive selection, then runs it either on the host
    (``--ignore-docker`` or no Dockerfile) or inside a freshly built image.

    Returns a process-style exit code (0 on success, non-zero on failure).
    """
    project_root = os.getcwd()

    # --- Resolve the pytest target -------------------------------------
    pytest_target: str = ""
    entry = getattr(args, "entry", None)
    if entry:
        if "::" in entry:
            file_part, func_part = entry.split("::", 1)
            file_path = (
                file_part if os.path.isabs(file_part) else os.path.abspath(os.path.join(project_root, file_part))
            )
            pytest_target = f"{_relative_to_root(file_path, project_root)}::{func_part}"
        else:
            file_path = entry if os.path.isabs(entry) else os.path.abspath(os.path.join(project_root, entry))
            pytest_target = _relative_to_root(file_path, project_root)
    else:
        tests = _discover_tests(project_root)
        if not tests:
            print("No evaluation tests found.\nHint: Ensure @evaluation_test is applied.")
            return 1
        non_interactive = bool(getattr(args, "yes", False))
        # Bug fix: with --yes, _prompt_select returns ALL tests, so the
        # single-selection check below could never pass when more than one
        # test exists. Fail early with actionable guidance instead.
        if non_interactive and len(tests) > 1:
            print("Error: Multiple evaluation tests found and --yes was given.")
            print("Hint: pass --entry path::function to pick exactly one test non-interactively.")
            return 1
        selected = _prompt_select(tests, non_interactive=non_interactive)
        if not selected:
            print("No tests selected.")
            return 1
        if len(selected) != 1:
            print("Error: Please select exactly one evaluation test for 'local-test'.")
            print("Hint: pass --entry path::function to pick exactly one test.")
            return 1
        pytest_target = _relative_to_root(os.path.abspath(selected[0].file_path), project_root)

    # A target must have been produced by one of the branches above;
    # checked once here instead of before each execution path.
    if not pytest_target:
        print("Error: Failed to resolve a pytest target to run.")
        return 1

    # --- Execute -------------------------------------------------------
    ignore_docker = bool(getattr(args, "ignore_docker", False))
    # shlex.split("") == [], so empty/missing extras become no flags.
    build_extras = shlex.split(getattr(args, "docker_build_extra", "") or "")
    run_extras = shlex.split(getattr(args, "docker_run_extra", "") or "")
    if ignore_docker:
        return _run_pytest_host(pytest_target)

    dockerfiles = _find_dockerfiles(project_root)
    if len(dockerfiles) > 1:
        print("Error: Multiple Dockerfiles found. Only one Dockerfile is allowed for local-test.")
        for df in dockerfiles:
            print(f" - {df}")
        print("Hint: use --ignore-docker to bypass Docker.")
        return 1
    if len(dockerfiles) == 1:
        # Ensure host home logs directory exists so container writes are visible to host ep logs
        try:
            os.makedirs(os.path.join(os.path.expanduser("~"), ".eval_protocol"), exist_ok=True)
        except Exception:
            pass
        image_tag = "ep-evaluator:local"
        if not _build_docker_image(dockerfiles[0], image_tag, build_extras=build_extras):
            print("Docker build failed. See logs above.")
            return 1
        return _run_pytest_in_docker(project_root, image_tag, pytest_target, run_extras=run_extras)

    # No Dockerfile: run on host
    return _run_pytest_host(pytest_target)
4 changes: 2 additions & 2 deletions eval_protocol/cli_commands/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ def _prompt_select_interactive(tests: list[DiscoveredTest]) -> list[DiscoveredTe
# Check if only one test - auto-select it
if len(tests) == 1:
print(f"\nFound 1 test: {_format_test_choice(tests[0], 1)}")
confirm = questionary.confirm("Upload this test?", default=True, style=custom_style).ask()
confirm = questionary.confirm("Select this test?", default=True, style=custom_style).ask()
if confirm:
return tests
else:
Expand Down Expand Up @@ -500,7 +500,7 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]

print("=" * 80)
try:
choice = input("Enter the number to upload: ").strip()
choice = input("Enter the number to select: ").strip()
except KeyboardInterrupt:
print("\n\nUpload cancelled.")
return []
Expand Down
7 changes: 5 additions & 2 deletions eval_protocol/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
import logging
import importlib
from datetime import datetime
from datetime import datetime, timezone
from enum import Enum
from typing import Any, ClassVar, Dict, List, Literal, Optional, TypedDict, Union

Expand Down Expand Up @@ -825,7 +825,10 @@ class EvaluationRow(BaseModel):
description="Metadata about the execution of the evaluation.",
)

created_at: datetime = Field(default_factory=datetime.now, description="The timestamp when the row was created.")
created_at: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
description="The timestamp when the row was created (UTC).",
)

eval_metadata: Optional[EvalMetadata] = Field(
default=None, description="Metadata about the evaluation that was run."
Expand Down
Loading
Loading