From 3824bebd11d73927abfc777dde158cd254b17dba Mon Sep 17 00:00:00 2001 From: Alezander9 Date: Tue, 12 May 2026 15:25:23 -0700 Subject: [PATCH 1/2] Add public framework reverification runner --- .env.example | 10 + .gitignore | 5 +- README.md | 18 + browsers/__init__.py | 57 +- browsers/browser_use_cloud.py | 30 +- browsers/util.py | 16 + frameworks/__init__.py | 325 ++++++++++++ frameworks/bcode/run_task.py | 384 ++++++++++++++ frameworks/browser_use/__init__.py | 1 + frameworks/browser_use/run_task.py | 113 ++++ .../browser_use_cloud_api_v2/run_task.py | 201 +++++++ .../browser_use_cloud_api_v3/run_task.py | 286 ++++++++++ frameworks/browserbase_agent/executor.mjs | 181 +++++++ frameworks/browserbase_agent/package.json | 9 + frameworks/browserbase_agent/run_task.py | 216 ++++++++ frameworks/but/__init__.py | 1 + frameworks/but/run_task.py | 441 ++++++++++++++++ frameworks/but/system_prompt.md | 10 + frameworks/but_rust/__init__.py | 1 + frameworks/but_rust/run_task.py | 382 ++++++++++++++ frameworks/but_rust/system_prompt.md | 10 + frameworks/claude_code_harness/__init__.py | 1 + frameworks/claude_code_harness/run_task.py | 436 ++++++++++++++++ .../claude_code_harness/system_prompt.md | 13 + frameworks/claude_code_harness_ab/__init__.py | 1 + frameworks/claude_code_harness_ab/run_task.py | 460 ++++++++++++++++ .../claude_code_harness_ab/system_prompt.md | 29 ++ .../claude_code_harness_bu_cli/__init__.py | 1 + .../claude_code_harness_bu_cli/run_task.py | 461 +++++++++++++++++ .../system_prompt.md | 25 + frameworks/claude_code_harness_js/__init__.py | 1 + frameworks/claude_code_harness_js/run_task.py | 445 ++++++++++++++++ .../claude_code_harness_js/system_prompt.md | 21 + frameworks/claude_cua/__init__.py | 1 + frameworks/claude_cua/run_task.py | 97 ++++ frameworks/codex_harness/__init__.py | 1 + frameworks/codex_harness/run_task.py | 489 ++++++++++++++++++ frameworks/codex_harness/system_prompt.md | 15 + frameworks/pi_harness/__init__.py | 1 + 
frameworks/pi_harness/run_task.py | 447 ++++++++++++++++ frameworks/pi_harness/system_prompt.md | 13 + frameworks/pibt/__init__.py | 1 + frameworks/pibt/run_task.py | 466 +++++++++++++++++ frameworks/pibt/system_prompt.md | 13 + frameworks/stagehand/__init__.py | 1 + frameworks/stagehand/executor.js | 65 +++ frameworks/stagehand/package.json | 8 + frameworks/stagehand/run_task.py | 86 +++ laminar.py | 49 ++ lmnr.py | 38 ++ models.py | 50 ++ run_framework_eval.py | 295 +++++++++++ 52 files changed, 6688 insertions(+), 39 deletions(-) create mode 100644 browsers/util.py create mode 100644 frameworks/__init__.py create mode 100644 frameworks/bcode/run_task.py create mode 100644 frameworks/browser_use/__init__.py create mode 100644 frameworks/browser_use/run_task.py create mode 100644 frameworks/browser_use_cloud_api_v2/run_task.py create mode 100644 frameworks/browser_use_cloud_api_v3/run_task.py create mode 100644 frameworks/browserbase_agent/executor.mjs create mode 100644 frameworks/browserbase_agent/package.json create mode 100644 frameworks/browserbase_agent/run_task.py create mode 100644 frameworks/but/__init__.py create mode 100644 frameworks/but/run_task.py create mode 100644 frameworks/but/system_prompt.md create mode 100644 frameworks/but_rust/__init__.py create mode 100644 frameworks/but_rust/run_task.py create mode 100644 frameworks/but_rust/system_prompt.md create mode 100644 frameworks/claude_code_harness/__init__.py create mode 100644 frameworks/claude_code_harness/run_task.py create mode 100644 frameworks/claude_code_harness/system_prompt.md create mode 100644 frameworks/claude_code_harness_ab/__init__.py create mode 100644 frameworks/claude_code_harness_ab/run_task.py create mode 100644 frameworks/claude_code_harness_ab/system_prompt.md create mode 100644 frameworks/claude_code_harness_bu_cli/__init__.py create mode 100644 frameworks/claude_code_harness_bu_cli/run_task.py create mode 100644 frameworks/claude_code_harness_bu_cli/system_prompt.md create 
mode 100644 frameworks/claude_code_harness_js/__init__.py create mode 100644 frameworks/claude_code_harness_js/run_task.py create mode 100644 frameworks/claude_code_harness_js/system_prompt.md create mode 100644 frameworks/claude_cua/__init__.py create mode 100644 frameworks/claude_cua/run_task.py create mode 100644 frameworks/codex_harness/__init__.py create mode 100644 frameworks/codex_harness/run_task.py create mode 100644 frameworks/codex_harness/system_prompt.md create mode 100644 frameworks/pi_harness/__init__.py create mode 100644 frameworks/pi_harness/run_task.py create mode 100644 frameworks/pi_harness/system_prompt.md create mode 100644 frameworks/pibt/__init__.py create mode 100644 frameworks/pibt/run_task.py create mode 100644 frameworks/pibt/system_prompt.md create mode 100644 frameworks/stagehand/__init__.py create mode 100644 frameworks/stagehand/executor.js create mode 100644 frameworks/stagehand/package.json create mode 100644 frameworks/stagehand/run_task.py create mode 100644 laminar.py create mode 100644 lmnr.py create mode 100644 models.py create mode 100644 run_framework_eval.py diff --git a/.env.example b/.env.example index 8c8b2d3..7149172 100644 --- a/.env.example +++ b/.env.example @@ -1,5 +1,15 @@ BROWSER_USE_API_KEY= GOOGLE_API_KEY= +OPENAI_API_KEY= +ANTHROPIC_API_KEY= + +# Optional model providers used by some framework/model combinations +GOOGLE_GENERATIVE_AI_API_KEY= +FIREWORKS_API_KEY= +OPENROUTER_API_KEY= +DEEPSEEK_API_KEY= +MOONSHOT_API_KEY= +DASHSCOPE_API_KEY= # Optional: only needed if using --browser with the corresponding provider ANCHORBROWSER_API_KEY= diff --git a/.gitignore b/.gitignore index a2735bf..29f3c05 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,7 @@ __pycache__/ .env run_data/ DESIGN.MD -results/ \ No newline at end of file +results/ +BU_Bench_V1.json +Stealth_Bench_V1.json +benchmarks/*.json diff --git a/README.md b/README.md index 7104e28..efe54a2 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,24 @@ 
uv run python run_eval.py Results are saved to `results/` and detailed traces to `run_data/`. +### Re-verifying Framework Results + +Use `run_framework_eval.py` to rerun BU_Bench_V1 through a framework adapter. +It decrypts `BU_Bench_V1.enc` in memory and writes local outputs to ignored +`results/` and `run_data/`. + +```bash +uv run python run_framework_eval.py --list-frameworks +uv run python run_framework_eval.py --framework browser-use --browser browser-use-cloud --model bu-2-0 +``` + +See the comment at the top of `run_framework_eval.py` for framework-specific +setup, options, and examples. + +Important: `run_data/` traces include decrypted task text, ground truth, model +outputs, and screenshots. They are gitignored for local verification only. Do +not publish or commit them. + ### Swapping Models Edit `run_eval.py` to change the model: diff --git a/browsers/__init__.py b/browsers/__init__.py index 0862591..3784c1d 100644 --- a/browsers/__init__.py +++ b/browsers/__init__.py @@ -1,21 +1,21 @@ -"""Browser provider registry. +"""Browser provider registry.""" -Each provider module exports: - async def connect() -> str -- returns a CDP WebSocket URL - async def disconnect() -> None -- cleans up the session - -Usage: - from browsers import get_provider - provider = get_provider("anchor") - cdp_url = await provider.connect() - ... 
- await provider.disconnect() -""" - -import asyncio import importlib -import httpx +from browsers.util import retry_on_429 +from browsers import ( + anchor, + browser_use_cloud, + browserbase, + browserless, + driver, + hyperbrowser, + local_headful, + local_headless, + onkernel, + rebrowser, + steel, +) PROVIDERS = [ "anchor", @@ -30,22 +30,23 @@ async def disconnect() -> None -- cleans up the session "steel", ] +BROWSERS = { + "anchor": anchor, + "browser-use-cloud": browser_use_cloud, + "browserbase": browserbase, + "browserless": browserless, + "driver": driver, + "hyperbrowser": hyperbrowser, + "local_headful": local_headful, + "local_headless": local_headless, + "onkernel": onkernel, + "rebrowser": rebrowser, + "steel": steel, +} + def get_provider(name: str): """Import and return a browser provider module by name.""" if name not in PROVIDERS: raise ValueError(f"Unknown browser provider: {name}. Available: {PROVIDERS}") return importlib.import_module(f"browsers.{name}") - - -async def retry_on_429(fn, max_retries=10, max_wait=30): - """Call fn(), retrying with capped exponential backoff on 429 responses.""" - for attempt in range(max_retries + 1): - try: - return await fn() - except httpx.HTTPStatusError as e: - if e.response.status_code != 429 or attempt == max_retries: - raise - wait = min(2**attempt, max_wait) - print(f"[429] Rate limited, retry {attempt + 1}/{max_retries} in {wait}s") - await asyncio.sleep(wait) diff --git a/browsers/browser_use_cloud.py b/browsers/browser_use_cloud.py index c212fcf..336289e 100644 --- a/browsers/browser_use_cloud.py +++ b/browsers/browser_use_cloud.py @@ -1,25 +1,34 @@ -"""browser-use Cloud -- https://browser-use.com - -Requires: BROWSER_USE_API_KEY env var. 
-""" +"""browser-use cloud browser provider.""" import os import httpx -from browsers import retry_on_429 +from browsers.util import retry_on_429 + +MAX_CONCURRENT = 200 _session_id: str | None = None +def _api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v2") + return f"{base}/api/{version}" + + +def _api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + async def connect() -> str: global _session_id async def _create(): async with httpx.AsyncClient() as client: resp = await client.post( - "https://api.browser-use.com/api/v2/browsers", - headers={"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]}, + f"{_api_base()}/browsers", + headers={"X-Browser-Use-API-Key": _api_key()}, json={}, timeout=90, ) @@ -36,10 +45,11 @@ async def disconnect() -> None: if not _session_id: return async with httpx.AsyncClient() as client: - await client.patch( - f"https://api.browser-use.com/api/v2/browsers/{_session_id}", - headers={"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]}, + resp = await client.patch( + f"{_api_base()}/browsers/{_session_id}", + headers={"X-Browser-Use-API-Key": _api_key()}, json={"action": "stop"}, timeout=30, ) + resp.raise_for_status() _session_id = None diff --git a/browsers/util.py b/browsers/util.py new file mode 100644 index 0000000..e4ab610 --- /dev/null +++ b/browsers/util.py @@ -0,0 +1,16 @@ +import asyncio + +import httpx + + +async def retry_on_429(fn, max_retries=10, max_wait=30): + """Call fn(), retrying with capped exponential backoff on 429 responses.""" + for attempt in range(max_retries + 1): + try: + return await fn() + except httpx.HTTPStatusError as e: + if e.response.status_code != 429 or attempt == max_retries: + raise + wait = min(2**attempt, max_wait) + print(f"[429] Rate limited, retry {attempt + 1}/{max_retries} in {wait}s") + await asyncio.sleep(wait) diff 
--git a/frameworks/__init__.py b/frameworks/__init__.py new file mode 100644 index 0000000..18c6390 --- /dev/null +++ b/frameworks/__init__.py @@ -0,0 +1,325 @@ +"""Framework registry and shared local evaluation flow. + +This public runner intentionally avoids the remote dispatch/tracing stack. +It loads the encrypted benchmark file, executes one task with a framework +adapter, judges the trace, and writes local JSON artifacts under ignored paths. +""" + +import asyncio +import base64 +import hashlib +import json +import os +import traceback +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Awaitable, Callable + +from browser_use import ChatGoogle +from cryptography.fernet import Fernet + +from judge import JudgementResult, construct_judge_messages + +ROOT_DIR = Path(__file__).resolve().parent.parent +DEFAULT_TASK_TIMEOUT = 1800 + + +def _task_timeout() -> int: + raw = os.environ.get("TASK_TIMEOUT") + if not raw: + return DEFAULT_TASK_TIMEOUT + try: + return int(raw) + except ValueError: + return DEFAULT_TASK_TIMEOUT + + +def parse_params() -> dict[str, str]: + """Parse PARAMS env var. Format: key=value,key=value.""" + raw = os.environ.get("PARAMS", "") + if not raw: + return {} + params: dict[str, str] = {} + for part in raw.split(","): + if "=" not in part: + continue + key, value = part.split("=", 1) + key = key.strip() + if key: + params[key] = value.strip() + return params + + +def validate_params(params: dict[str, str], accepted: dict[str, str]) -> dict[str, str]: + unknown = set(params) - set(accepted) + if unknown: + accepted_keys = ", ".join(sorted(accepted)) or "none" + raise ValueError( + f"Unknown params: {', '.join(sorted(unknown))}. 
Accepted: {accepted_keys}" + ) + return params + + +@dataclass +class ExecutionResult: + final_result: str + steps: list[str] + screenshots_b64: list[str] + num_steps: int + duration_seconds: float + cost: float = 0.0 + + +@dataclass +class FrameworkInfo: + browsers: list[str] + repo: str | None = None + max_concurrent_override: int | None = None + notes: str = "" + + +FRAMEWORKS: dict[str, FrameworkInfo] = { + "browser-use": FrameworkInfo( + browsers=[ + "browser-use-cloud", + "anchor", + "browserbase", + "browserless", + "driver", + "hyperbrowser", + "local_headful", + "local_headless", + "onkernel", + "rebrowser", + "steel", + ], + repo="browser-use/browser-use", + ), + "browser-use-cloud-api-v2": FrameworkInfo(browsers=["integrated"]), + "browser-use-cloud-api-v3": FrameworkInfo(browsers=["integrated"]), + "bcode": FrameworkInfo(browsers=["browser-use-cloud"], repo="browser-use/browsercode"), + "bcode-v012": FrameworkInfo( + browsers=["browser-use-cloud"], + repo="browser-use/browsercode", + notes="Alias for the bcode adapter used with framework_ref v0.1.2.", + ), + "browserbase-agent": FrameworkInfo(browsers=["integrated"]), + "stagehand": FrameworkInfo( + browsers=["browserbase", "local_headless"], + repo="browserbase/stagehand", + notes="Adapter scaffold; executor must be completed before use.", + ), + "claude-code-harness": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-harness" + ), + "claude-code-harness-js": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-harness-js" + ), + "claude-code-harness-ab": FrameworkInfo( + browsers=["browser-use-cloud"], repo="vercel-labs/agent-browser" + ), + "claude-code-harness-bu-cli": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-use" + ), + "codex-harness": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-harness" + ), + "pi-harness": FrameworkInfo( + browsers=["browser-use-cloud"], 
repo="browser-use/browser-harness" + ), + "pibt": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/pi-agent-extensions" + ), + "but": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-use-terminal" + ), + "but-rust": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-use-terminal" + ), + "claude-cua": FrameworkInfo( + browsers=["integrated"], + notes="Adapter scaffold; not used for the published BU_Bench_V1 runs.", + ), +} + + +def framework_to_module(framework: str) -> str: + if framework == "bcode-v012": + return "bcode" + return framework.replace("-", "_") + + +def interleave(tasks: list[dict]) -> list[dict]: + """Reorder 100 tasks, 20 per section, matching the distributed runner.""" + if os.environ.get("NO_INTERLEAVE") == "1": + return tasks + if len(tasks) != 100: + return tasks + reordered = [] + for i in range(20): + for d in range(5): + reordered.append(tasks[d * 20 + i]) + return reordered + + +def _encrypted_task_file(benchmark: str) -> Path: + candidates = [ + ROOT_DIR / f"{benchmark}.enc", + ROOT_DIR / f"{benchmark.upper()}.enc", + ROOT_DIR / "benchmarks" / f"{benchmark}.enc", + ] + for path in candidates: + if path.exists(): + return path + raise FileNotFoundError( + f"Could not find encrypted task file for {benchmark}. 
Expected " + f"{ROOT_DIR / (benchmark + '.enc')}" + ) + + +def load_tasks(benchmark: str) -> list[dict]: + """Load tasks from the encrypted public artifact without writing plaintext.""" + task_file = _encrypted_task_file(benchmark) + key = base64.urlsafe_b64encode(hashlib.sha256(benchmark.encode()).digest()) + encrypted = base64.b64decode(task_file.read_text()) + return json.loads(Fernet(key).decrypt(encrypted)) + + +JUDGE_LLM = None + + +def _get_judge_llm(): + global JUDGE_LLM + if JUDGE_LLM is None: + JUDGE_LLM = ChatGoogle( + model=os.environ.get("JUDGE_MODEL", "gemini-2.5-flash"), + api_key=os.getenv("GOOGLE_API_KEY"), + ) + return JUDGE_LLM + + +async def _evaluate_task(judge_messages) -> JudgementResult: + response = await _get_judge_llm().ainvoke( + judge_messages, output_format=JudgementResult + ) + return response.completion + + +def _write_json(path: Path, data: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(data, indent=2), encoding="utf-8") + + +def _maybe_write_task_artifact( + task: dict, + result: ExecutionResult | None, + judgement: dict[str, Any] | None, + score: int, + error: str | None = None, + tb: str | None = None, +) -> None: + run_data_dir_raw = os.environ.get("RUN_DATA_DIR") + if not run_data_dir_raw: + return + task_id = task.get("task_id", f"task_{task.get('_index', 'unknown')}") + payload: dict[str, Any] = { + "task_id": task_id, + "score": score, + "judgement": judgement, + } + if result is not None: + payload["agent_trace"] = { + "agent_task": task.get("confirmed_task"), + "final_result": result.final_result, + "agent_steps": result.steps, + "ground_truth": task.get("answer"), + "screenshots_b64": result.screenshots_b64, + } + payload["metrics"] = { + "steps": result.num_steps, + "duration": result.duration_seconds, + "cost": result.cost, + } + if error: + payload["error"] = error + if tb: + payload["traceback"] = tb + _write_json(Path(run_data_dir_raw) / f"{task_id}.json", payload) + + 
+def _maybe_write_local_result(data: dict[str, Any]) -> None: + output = os.environ.get("LOCAL_RESULT_FILE") + if output: + _write_json(Path(output), data) + + +async def run_and_judge( + task: dict, + execute_fn: Callable[[str], Awaitable[ExecutionResult]], +) -> dict[str, Any]: + """Execute one task, judge it, and return a task-level result dict.""" + task_id = task.get("task_id", "unknown") + print(f"Running task: {task_id}") + + try: + result = await asyncio.wait_for( + execute_fn(task["confirmed_task"]), timeout=_task_timeout() + ) + judge_messages = construct_judge_messages( + task=task["confirmed_task"], + final_result=result.final_result, + agent_steps=result.steps, + ground_truth=task.get("answer"), + screenshots_b64=result.screenshots_b64, + ) + judgement = await _evaluate_task(judge_messages) + judgement_data = judgement.model_dump() + score = 1 if judgement.verdict else 0 + print(f"Task {task_id} completed: score={score}") + + data = { + "task_id": task_id, + "task_index": task.get("_index"), + "score": score, + "steps": result.num_steps, + "duration": result.duration_seconds, + "cost": result.cost, + "judgement": judgement_data, + } + _maybe_write_task_artifact(task, result, judgement_data, score) + _maybe_write_local_result(data) + return data + + except asyncio.TimeoutError: + error_msg = f"Timed out after {_task_timeout()}s" + print(f"Task {task_id} timed out after {_task_timeout()}s") + data = { + "task_id": task_id, + "task_index": task.get("_index"), + "score": 0, + "steps": 0, + "duration": _task_timeout(), + "cost": 0, + "error": error_msg, + } + _maybe_write_task_artifact(task, None, None, 0, error=error_msg) + _maybe_write_local_result(data) + return data + + except BaseException as e: + error_msg = f"{type(e).__name__}: {e}" + tb = traceback.format_exc() + print(f"Task {task_id} failed: {error_msg}") + data = { + "task_id": task_id, + "task_index": task.get("_index"), + "score": 0, + "steps": 0, + "duration": 0, + "cost": 0, + "error": 
error_msg, + "traceback": tb, + } + _maybe_write_task_artifact(task, None, None, 0, error=error_msg, tb=tb) + _maybe_write_local_result(data) + return data diff --git a/frameworks/bcode/run_task.py b/frameworks/bcode/run_task.py new file mode 100644 index 0000000..434fb55 --- /dev/null +++ b/frameworks/bcode/run_task.py @@ -0,0 +1,384 @@ +"""Run a single benchmark task using bcode (browsercode). + +bcode is a coding agent (opencode fork) with a built-in browser harness. +We pre-provision a browser-use-cloud session and pass its CDP URL through +`BU_CDP_WS`, which the in-process CDP `Session.connect()` reads as a +default endpoint when the agent calls `session.connect()` with no args +(v0.1.1+). bcode then runs headlessly: + + bcode run --model --format json -- "" + +Stdout is one JSON event per line (tool_use, step_start, step_finish, text, +reasoning, error). We extract steps, final answer, and cost from these +events. + +Screenshots (v0.1.2+): the bcode browser-execute hook taps every +`Page.captureScreenshot` CDP call and (a) auto-attaches the image to the +agent's next assistant turn so the model sees it inline, and (b) when +`BCODE_SCREENSHOT_DIR=` is set, writes the same PNG to disk for the +eval-judge. Files are named `--.` so +sort-by-name is sort-by-time. We point the dump dir at a per-task subdir, +read the PNGs back as base64, and hand them to the judge -- matching the +v0.0.x `/tmp/shots/` flow. Pin `framework_ref >= v0.1.2` to use this hook. + +v0.1.0 vs v0.1.1 vs v0.1.2: v0.1.0 ported the harness from Python (uv + +helpers.py + daemon) to in-process TypeScript (the agent writes JS that +drives a CDP `Session` directly). v0.1.0 dropped honoring `BU_CDP_WS`; +v0.1.1 restored it as a default in `Session.connect()`. v0.1.2 added the +`Page.captureScreenshot` tap (auto-attach + `BCODE_SCREENSHOT_DIR` disk +dump). Pin `framework_ref >= v0.1.2` for screenshot-judging. 
+""" + +import asyncio +import base64 +import json +import os +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from lmnr import Laminar +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", + "fetch_use": "Enable/disable the Browser-Use fetch-use proxy for bcode's webfetch tool (true/false, default: true on v0.1.3+ when BROWSER_USE_API_KEY is set). Setting false injects {\"experimental\":{\"fetch_use\":false}} via OPENCODE_CONFIG_CONTENT so webfetch uses the native HttpClient instead of the proxy. Use for A/B isolating v0.1.3's fetch-use rewrite from other v0.1.3 changes.", +} + +PRE_PROMPT = ( + "You are a coding agent with browser access working fully autonomously. " + "A browser is preconfigured for you: calling `await session.connect()` " + "(no args) inside a `browser_execute` snippet attaches to it. " + "Calling `session.Page.captureScreenshot()` returns the image and the " + "harness auto-attaches it to your next turn so you can see it inline. " + "Take screenshots whenever you need to verify page state. Your final " + "assistant message is what the judge will read as your answer to the " + "task.\n\n" + "Work to complete the following task: {task}" +) + +# bcode is installed via the official curl installer (eval.yaml) which drops +# the binary at $HOME/.bcode/bin/bcode. Resolve once at import time. +BCODE_BIN = str(Path(os.environ["HOME"]) / ".bcode" / "bin" / "bcode") +# Where bcode v0.1.2+ writes Page.captureScreenshot dumps when +# BCODE_SCREENSHOT_DIR is set. Per-task: reset before run, drained after. 
+SHOTS_DIR = Path("/tmp/bcode_shots") + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG/JPEG bcode wrote during this task as base64. + + File naming (v0.1.2): `--.`. Sort + by name to recover capture order across parallel browser_execute calls + (in practice opencode serializes tool calls within one assistant + message, so this is just a stable order). + """ + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.iterdir() if p.is_file() and p.suffix in (".png", ".jpeg", ".jpg")) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _bu_api_base() -> str: + """Resolve the Browser-Use Cloud API base. Default prod, override via env.""" + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + +# Map a benchmark model alias to opencode's `provider/model` slug by checking +# substrings. Avoids a per-model lookup table; new models pass through as long +# as the provider keyphrase is present in the alias. +PROVIDER_KEYPHRASES = ( + ("claude", "anthropic"), + ("gemini", "google"), + ("gemma", "google"), + ("gpt", "openai"), + ("codex", "openai"), +) + + +def _resolve_model_slug(model_name: str) -> str: + if "/" in model_name: + return model_name + lower = model_name.lower() + for key, provider in PROVIDER_KEYPHRASES: + if key in lower: + return f"{provider}/{model_name}" + raise ValueError( + f"bcode: cannot infer provider for model {model_name!r}. " + f"Pass an explicit `provider/model` slug as --model, or add a keyphrase " + f"to PROVIDER_KEYPHRASES in frameworks/bcode/run_task.py." 
+ ) + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser() -> tuple[str, str]: + """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +def _format_step(event: dict) -> str | None: + """Turn one --format=json event into a step string (or None to skip).""" + etype = event.get("type") + part = event.get("part") or {} + if etype == "tool_use": + tool = part.get("tool", "?") + inp = (part.get("state") or {}).get("input") or {} + if tool == "bash": + return f"bash: {(inp.get('command') or '').strip()[:2000]}" + if tool in ("read", "write", "edit"): + return f"{tool}: {inp.get('filePath') or inp.get('path') or ''}" + if tool in ("browser-execute", "browser_execute"): + # v0.1.0+ renamed the snippet field python -> code; v0.0.x used + # python. Read both so this runner works against either harness. 
+ snippet = (inp.get("code") or inp.get("python") or "").strip() + return f"browser_execute: {snippet[:2000]}" + if tool == "webfetch": + return f"webfetch: {inp.get('url') or ''}" + if tool in ("glob", "grep", "codesearch", "websearch"): + return f"{tool}: {inp.get('pattern') or inp.get('query') or ''}" + try: + return f"{tool}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return tool + if etype == "text": + text = (part.get("text") or "").strip() + return f"text: {text[:2000]}" if text else None + if etype == "reasoning": + text = (part.get("text") or "").strip() + return f"thinking: {text[:2000]}" if text else None + return None + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while line := await proc.stderr.readline(): + s = line.decode("utf-8", errors="replace").rstrip("\n") + buf.append(s) + print(f"[bcode-stderr] {s}", flush=True) + + +async def _iter_lines(stream: asyncio.StreamReader): + """Yield one line at a time, tolerant of arbitrarily long lines.""" + buf = bytearray() + while chunk := await stream.read(1 << 16): + buf.extend(chunk) + while (nl := buf.find(b"\n")) >= 0: + yield bytes(buf[:nl]) + del buf[: nl + 1] + if buf: + yield bytes(buf) + + +async def execute(task_description: str) -> ExecutionResult: + params = parse_params() + validate_params(params, ACCEPTED_PARAMS) + model_slug = _resolve_model_slug(os.environ["MODEL"]) + + # browser-use-cloud session via direct API. BU_CDP_WS is read by bcode's + # in-process CDP `Session.connect()` (v0.1.1+) as a default endpoint when + # the agent calls `session.connect()` with no args. v0.0.x's Python + # harness daemon read the same env var. Single env-var keeps the runner + # compatible with both harness eras. + browser_id, cdp_ws = _start_browser() + + parent_span_context = Laminar.serialize_span_context() + # Reset and route the screenshot dump dir BEFORE bcode starts. 
v0.1.2+ + # writes every Page.captureScreenshot result here (in addition to the + # auto-attach to the agent's next turn -- same tap, two consumers). + _reset_shots_dir() + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "BCODE_SCREENSHOT_DIR": str(SHOTS_DIR), + } + if parent_span_context: + env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context + # fetch_use=false -> inject opencode.json-equivalent config disabling the + # fetch-use proxy. OPENCODE_CONFIG_CONTENT is merged with local-scope + # precedence at startup (see packages/opencode/src/config/config.ts:593), + # so this overrides any default bcode would have applied. Schema: + # experimental.fetch_use: bool (v0.1.3+). When BROWSER_USE_API_KEY is set + # AND this flag is true (default), webfetch routes via fetch.browser-use.com; + # setting false falls back to native HttpClient. No-op on 3}] {s[:500]}", flush=True) + + if event.get("type") == "text": + if t := ((event.get("part") or {}).get("text") or "").strip(): + final_text = t + elif event.get("type") == "step_finish": + total_cost += float((event.get("part") or {}).get("cost") or 0.0) + elif event.get("type") == "error": + err = event.get("error") + errors.append(err if isinstance(err, str) else json.dumps(err)) + print(f"[bcode-error] {errors[-1][:500]}", flush=True) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_id) + + duration = time.time() - start + + if proc.returncode not in (0, None) and not final_text and not steps: + raise RuntimeError( + f"bcode exited with code {proc.returncode} before producing output. 
" + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + answer = (final_text or "").strip() + if errors and not answer: + final_result = f"[bcode_error] {errors[0][:500]}" + elif errors: + final_result = f"[bcode_error_recovered] {answer}" + else: + final_result = answer or "[bcode_no_output]" + + return ExecutionResult( + final_result=final_result, + steps=steps, + # v0.1.2+ taps Page.captureScreenshot and writes PNGs to + # BCODE_SCREENSHOT_DIR (set above to SHOTS_DIR). Drain them now so + # the judge sees the same visual signal as on v0.0.x. + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + # Propagate task_timeout param to run_and_judge before it wraps execute(). + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/browser_use/__init__.py b/frameworks/browser_use/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/browser_use/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/browser_use/run_task.py b/frameworks/browser_use/run_task.py new file mode 100644 index 0000000..ac521ab --- /dev/null +++ b/frameworks/browser_use/run_task.py @@ -0,0 +1,113 @@ +"""Run a single benchmark task using the browser-use agent framework.""" + +import os +import sys +import asyncio +import base64 +from pathlib import Path +from functools import partial + +# Add project 
root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from browser_use import Agent, Browser +from lmnr import observe +from browsers import BROWSERS +from models import MODELS +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "use_vision": "Enable/disable vision (screenshots) for the agent (true/false, default: true)", + "framework_repo": "Override GitHub repo for browser-use install (e.g. Alezander9/alex-browser-use). Consumed by the workflow install step, not the runner.", +} + + +def encode_screenshots(paths: list[str]) -> list[str]: + result = [] + for p in paths: + path = Path(p) + if path.exists(): + result.append(base64.b64encode(path.read_bytes()).decode()) + return result + + +@observe(span_type="EXECUTOR") +async def execute( + task_description: str, llm, browser_name: str, use_vision: bool = True +) -> ExecutionResult: + """Run a browser-use agent on the task and return a standardized result.""" + provider = BROWSERS[browser_name] + cdp_url = await provider.connect() + if cdp_url: + browser = Browser(cdp_url=cdp_url) + else: + headless = getattr(provider, "HEADLESS", True) + browser = Browser(headless=headless) + + agent = Agent( + task=task_description, + llm=llm, + browser=browser, + use_judge=False, + use_vision=use_vision, + ) + try: + history = await agent.run() + finally: + try: + await browser.kill() + except Exception: + pass + await provider.disconnect() + + return ExecutionResult( + final_result=history.final_result() or "Agent did not return a result", + steps=history.agent_steps(), + screenshots_b64=encode_screenshots( + [p for p in history.screenshot_paths() if p is not None] + ), + num_steps=history.number_of_steps(), + duration_seconds=history.total_duration_seconds(), + 
cost=history.usage.total_cost if history.usage else 0, + ) + + +async def main(): + params = validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + model_name = os.environ["MODEL"] + eval_id = os.environ["EVAL_ID"] + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + use_vision = params.get("use_vision", "true").lower() != "false" + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + llm = MODELS[model_name]() + execute_fn = partial( + execute, llm=llm, browser_name=browser_name, use_vision=use_vision + ) + await run_and_judge(task, execute_fn) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/browser_use_cloud_api_v2/run_task.py b/frameworks/browser_use_cloud_api_v2/run_task.py new file mode 100644 index 0000000..31c294b --- /dev/null +++ b/frameworks/browser_use_cloud_api_v2/run_task.py @@ -0,0 +1,201 @@ +"""Run a single benchmark task using the Browser Use Cloud API v2. + +Dispatches a task via POST /api/v2/tasks, polls GET /api/v2/tasks/{id} +until completion, then maps the response into ExecutionResult for the judge. +""" + +import asyncio +import base64 +import os +import sys +from functools import partial +from pathlib import Path + +import httpx + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = {} + +API_BASE = "https://api.browser-use.com/api/v2" +POLL_INTERVAL = 5 +TERMINAL_STATUSES = {"finished", "stopped"} + +# V2 SupportedLLMs mapped from our model registry names. 
+# Only models the v2 API actually supports are listed here. +V2_MODEL_MAP = { + "bu-2-0": "browser-use-2.0", + "bu-1-0": "browser-use-llm", + "gpt-4.1": "gpt-4.1", + "gpt-4.1-mini": "gpt-4.1-mini", + "o4-mini": "o4-mini", + "o3": "o3", + "gemini-2.5-flash": "gemini-2.5-flash", + "gemini-2.5-pro": "gemini-2.5-pro", + "gemini-3-pro-preview": "gemini-3-pro-preview", + "gemini-3-flash-preview": "gemini-3-flash-preview", + "gpt-4o": "gpt-4o", + "gpt-4o-mini": "gpt-4o-mini", + "claude-sonnet-4-5": "claude-sonnet-4-5-20250929", + "claude-sonnet-4-6": "claude-sonnet-4-6", + "claude-opus-4-5": "claude-opus-4-5-20251101", + "claude-3-7-sonnet": "claude-3-7-sonnet-20250219", +} + + +def _headers() -> dict: + return {"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]} + + +def _create_task(task_description: str, model: str) -> dict: + """Create a v2 task and return the response (id, sessionId).""" + api_model = V2_MODEL_MAP.get(model, model) + resp = httpx.post( + f"{API_BASE}/tasks", + headers=_headers(), + json={"task": task_description, "llm": api_model}, + timeout=30, + ) + resp.raise_for_status() + return resp.json() + + +def _get_task(task_id: str) -> dict: + """Poll task status.""" + resp = httpx.get( + f"{API_BASE}/tasks/{task_id}", + headers=_headers(), + timeout=30, + ) + resp.raise_for_status() + return resp.json() + + +def _fetch_screenshot_b64(url: str) -> str | None: + """Download a screenshot URL and return base64-encoded bytes.""" + try: + resp = httpx.get(url, timeout=30) + resp.raise_for_status() + return base64.b64encode(resp.content).decode() + except Exception: + return None + + +def _format_step(step: dict) -> str: + """Format a v2 TaskStepView to match browser-use agent_steps() format. 
+ + Ground truth format: + Step N: + Actions: [json array with indent=1] + Result M: (not available from v2 API) + """ + import json as _json + + step_text = f"Step {step.get('number', '?')}:\n" + + actions_raw = step.get("actions", []) + if actions_raw: + parsed = [] + for a in actions_raw: + try: + parsed.append(_json.loads(a)) + except (_json.JSONDecodeError, TypeError): + parsed.append(a) + step_text += f"Actions: {_json.dumps(parsed, indent=1)}\n" + + return step_text + + +def _duration_seconds(task_data: dict) -> float: + """Compute duration from startedAt/finishedAt timestamps.""" + from datetime import datetime + + started = task_data.get("startedAt") + finished = task_data.get("finishedAt") + if not started or not finished: + return 0.0 + try: + t0 = datetime.fromisoformat(started.replace("Z", "+00:00")) + t1 = datetime.fromisoformat(finished.replace("Z", "+00:00")) + return max((t1 - t0).total_seconds(), 0.0) + except Exception: + return 0.0 + + +async def execute(task_description: str, model_name: str) -> ExecutionResult: + """Create a v2 task, poll until done, return ExecutionResult.""" + created = _create_task(task_description, model_name) + task_id = created["id"] + print(f"V2 task created: {task_id}") + + # Poll until terminal + while True: + await asyncio.sleep(POLL_INTERVAL) + task_data = _get_task(task_id) + status = task_data.get("status", "") + if status in TERMINAL_STATUSES: + break + print(f" V2 task {task_id} status: {status}") + + steps = task_data.get("steps", []) + agent_steps = [_format_step(s) for s in steps] + + # Collect screenshots from step URLs + screenshots_b64 = [] + for step in steps: + url = step.get("screenshotUrl") + if url: + img = _fetch_screenshot_b64(url) + if img: + screenshots_b64.append(img) + + output = task_data.get("output") or "No output returned" + cost_str = task_data.get("cost") or "0" + cost = float(cost_str) + duration = _duration_seconds(task_data) + + return ExecutionResult( + final_result=output, + 
steps=agent_steps, + screenshots_b64=screenshots_b64, + num_steps=len(steps), + duration_seconds=duration, + cost=cost, + ) + + +async def main(): + validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + model_name = os.environ["MODEL"] + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + execute_fn = partial(execute, model_name=model_name) + await run_and_judge(task, execute_fn) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/browser_use_cloud_api_v3/run_task.py b/frameworks/browser_use_cloud_api_v3/run_task.py new file mode 100644 index 0000000..53f2c6f --- /dev/null +++ b/frameworks/browser_use_cloud_api_v3/run_task.py @@ -0,0 +1,286 @@ +"""Run a single benchmark task using the Browser Use Cloud API v3 (BU Agent). + +Dispatches a task via POST /api/v3/sessions, polls GET /api/v3/sessions/{id} +until completion, fetches session messages to reconstruct step data, then maps +into ExecutionResult for the judge. + +Step data is reconstructed from the messages endpoint to match the browser-use +agent_steps() ground truth format as closely as possible. Screenshots are not +available from this API. 
+""" + +import asyncio +import json +import os +import sys +from functools import partial +from pathlib import Path + +import httpx + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "skills": "Enable/disable skill memory (true/false, default: true)", +} + +API_BASE = "https://api.browser-use.com/api/v3" +POLL_INTERVAL = 5 +TERMINAL_STATUSES = {"stopped", "error", "timed_out"} + +V3_MODEL_MAP = { + "bu-mini": "bu-mini", + "bu-max": "bu-max", + "bu-ultra": "bu-ultra", +} + +# Map V3 tool names to browser-use action names for ground-truth-like formatting. +TOOL_NAME_MAP = { + "browser_navigate": "navigate", + "browser_type_text": "input", + "browser_wait": "wait", + "browser_click": "click", + "browser_scroll": "scroll", + "browser_go_back": "go_back", + "browser_search_google": "search_google", + "browser_analyze_state": "analyze_state", + "done_autonomous": "done", +} + + +def _headers() -> dict: + return {"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]} + + +def _create_session(task_description: str, model: str, skills: bool = True) -> dict: + api_model = V3_MODEL_MAP.get(model, model) + resp = httpx.post( + f"{API_BASE}/sessions", + headers=_headers(), + json={"task": task_description, "model": api_model, "skills": skills}, + timeout=30, + ) + resp.raise_for_status() + return resp.json() + + +def _get_session(session_id: str) -> dict: + resp = httpx.get( + f"{API_BASE}/sessions/{session_id}", + headers=_headers(), + timeout=30, + ) + resp.raise_for_status() + return resp.json() + + +def _get_all_messages(session_id: str) -> list[dict]: + """Paginate through all messages for a session.""" + all_msgs = [] + after = None + while True: + params = {"limit": 100} + if after: + 
params["after"] = after + resp = httpx.get( + f"{API_BASE}/sessions/{session_id}/messages", + headers=_headers(), + params=params, + timeout=30, + ) + resp.raise_for_status() + data = resp.json() + msgs = data.get("messages", []) + all_msgs.extend(msgs) + if not data.get("hasMore") or not msgs: + break + after = msgs[-1]["id"] + return all_msgs + + +def _parse_messages_to_steps(messages: list[dict]) -> list[str]: + """Convert V3 session messages into ground-truth-style step strings. + + Groups assistant tool_calls with their corresponding tool results. + Formats each group as: + Step N: + Actions: [json array, indent=1] + Result M: + Error M: + """ + # Parse data fields + parsed = [] + for msg in messages: + data_str = msg.get("data", "{}") + try: + data = json.loads(data_str) + except (json.JSONDecodeError, TypeError): + continue + data["_role"] = msg.get("role", data.get("role", "")) + parsed.append(data) + + # Index tool results by tool_call_id + tool_results: dict[str, dict] = {} + for m in parsed: + if m["_role"] == "tool": + tcid = m.get("tool_call_id") + if tcid: + tool_results[tcid] = m + + # Collect all tool_call_ids claimed by assistant messages + claimed_ids: set[str] = set() + + # Build steps from assistant messages that have tool_calls + steps = [] + step_num = 0 + for m in parsed: + if m["_role"] != "assistant": + continue + tool_calls = m.get("tool_calls") + if not tool_calls: + continue + + step_num += 1 + step_text = f"Step {step_num}:\n" + + # Build actions list matching ground truth format + actions = [] + for tc in tool_calls: + claimed_ids.add(tc.get("id", "")) + func = tc.get("function", {}) + raw_name = func.get("name", "unknown") + action_name = TOOL_NAME_MAP.get(raw_name, raw_name) + try: + args = json.loads(func.get("arguments", "{}")) + except (json.JSONDecodeError, TypeError): + args = {} + actions.append({action_name: args}) + + step_text += f"Actions: {json.dumps(actions, indent=1)}\n" + + # Append results/errors from tool messages + 
for j, tc in enumerate(tool_calls): + tcid = tc.get("id") + tr = tool_results.get(tcid) + if not tr: + continue + content = tr.get("content", "") + is_error = tr.get("is_error", False) + if is_error and content: + step_text += f"Error {j + 1}: {content}\n" + elif content: + step_text += f"Result {j + 1}: {content}\n" + + steps.append(step_text) + + # Handle orphaned tool results (e.g. done_autonomous whose assistant + # message was not returned by the API) + for tcid, tr in tool_results.items(): + if tcid in claimed_ids: + continue + tool_name = tr.get("tool_name", "") + action_name = TOOL_NAME_MAP.get(tool_name, tool_name) + content = tr.get("content", "") + if not content: + continue + step_num += 1 + step_text = f"Step {step_num}:\n" + action_obj = [{action_name: {}}] + step_text += f"Actions: {json.dumps(action_obj, indent=1)}\n" + step_text += f"Result 1: {content}\n" + steps.append(step_text) + + return steps + + +def _duration_seconds(session_data: dict) -> float: + from datetime import datetime + + created = session_data.get("createdAt") + updated = session_data.get("updatedAt") + if not created or not updated: + return 0.0 + try: + t0 = datetime.fromisoformat(created.replace("Z", "+00:00")) + t1 = datetime.fromisoformat(updated.replace("Z", "+00:00")) + return max((t1 - t0).total_seconds(), 0.0) + except Exception: + return 0.0 + + +async def execute( + task_description: str, model_name: str, skills: bool = True +) -> ExecutionResult: + """Create a v3 session, poll until done, fetch messages, return ExecutionResult.""" + session_data = _create_session(task_description, model_name, skills=skills) + session_id = session_data["id"] + print(f"V3 session created: {session_id}") + + while True: + await asyncio.sleep(POLL_INTERVAL) + session_data = _get_session(session_id) + status = session_data.get("status", "") + if status in TERMINAL_STATUSES: + break + print(f" V3 session {session_id} status: {status}") + + output = session_data.get("output") + if 
isinstance(output, dict): + output = json.dumps(output) + output = output or "No output returned" + + cost_str = session_data.get("totalCostUsd") or "0" + cost = float(cost_str) + duration = _duration_seconds(session_data) + + # Fetch messages and reconstruct steps + messages = _get_all_messages(session_id) + agent_steps = _parse_messages_to_steps(messages) + + return ExecutionResult( + final_result=output, + steps=agent_steps, + screenshots_b64=[], # Not available from V3 API + num_steps=len(agent_steps), + duration_seconds=duration, + cost=cost, + ) + + +async def main(): + params = validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + model_name = os.environ["MODEL"] + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + skills = params.get("skills", "true").lower() != "false" + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + execute_fn = partial(execute, model_name=model_name, skills=skills) + await run_and_judge(task, execute_fn) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/browserbase_agent/executor.mjs b/frameworks/browserbase_agent/executor.mjs new file mode 100644 index 0000000..dc500f3 --- /dev/null +++ b/frameworks/browserbase_agent/executor.mjs @@ -0,0 +1,181 @@ +/** + * Browserbase Stagehand agent executor (client-side SDK path). + * + * Why client-side and not the hosted REST API: + * `api.stagehand.browserbase.com` is an alpha hosted endpoint running a + * Stagehand server build that predates the opus-4-7 temperature fix + * (Stagehand PRs #2006/#2018, shipped in stagehand-server-v3 v3.6.5 on + * May 6 2026). That endpoint silently rejects opus-4-7 with the + * "`temperature` is deprecated for this model" error from inside the + * Stagehand `fillForm` tool. 
Running the client SDK locally gives us + * whichever Stagehand version we pin in package.json, fix included. + * + * This is also the path Browserbase tells customers to use for + * production (https://docs.stagehand.dev/v3/best-practices/deployments): + * embed the SDK in your backend, point it at Browserbase. The REST API + * is marketed for their Python SDK transport, not for scale-out. + * + * Joint system benchmarked: (Stagehand agent SDK + Browserbase cloud + * browser + model). Same surface as the original .mjs example we built + * for the .bcode workspace, just dispatched programmatically. + * + * Model routing: defaults to Browserbase Model Gateway (Stagehand + * auto-routes through the gateway when only `apiKey` is set on the + * constructor and no provider env key is present). The runner unsets + * provider env keys before spawning this script when STAGEHAND_USE_GATEWAY + * is "1" (default) so the SDK doesn't grab them out of process env. + * + * Env input (read at startup, all required unless noted): + * TASK_DESCRIPTION the task string to run + * STAGEHAND_MODEL gateway slug e.g. anthropic/claude-opus-4-7 + * MAX_STEPS int, default 25 + * BROWSERBASE_API_KEY required (for browser + gateway) + * BROWSERBASE_PROJECT_ID required (Stagehand SDK still wants it) + * STAGEHAND_VERBOSE int 0/1/2, default 1 + * + * Stdout: exactly one JSON object -- the ExecutionResult-shaped dict the + * Python wrapper reads. All progress / logs go to stderr. + */ + +import { Stagehand } from "@browserbasehq/stagehand"; + +const MODEL = process.env.STAGEHAND_MODEL || "anthropic/claude-sonnet-4-6"; +const MAX_STEPS = parseInt(process.env.MAX_STEPS || "25", 10); +const VERBOSE = parseInt(process.env.STAGEHAND_VERBOSE || "1", 10); + +const SYSTEM_PROMPT = + "You are a browser agent running inside an evaluation harness. 
" + + "Solve the user's task by navigating and interacting with the live web.\n\n" + + "When you finish, your final message MUST contain the concrete answer " + + "to the task -- the actual names, numbers, list items, or values you " + + "found. Do not paraphrase the answer as 'I extracted X' or 'I found the " + + "data' -- write the data itself. For lists, write items one per line."; + +function fail(msg, extra = {}) { + // Emit an ExecutionResult-shaped object so the Python side records the + // failure on the datapoint instead of raising -- matches the + // "[browserbase_incomplete] ..." convention from the REST runner. + const out = { + final_result: `[browserbase_incomplete] ${msg}`, + steps: [], + screenshots_b64: [], + num_steps: 0, + duration_seconds: 0, + cost: 0, + error: msg, + ...extra, + }; + process.stdout.write(JSON.stringify(out)); + process.exit(0); +} + +function formatStep(act, i) { + // Stagehand 3.x agent action shape: { type, action?, reasoning?, + // instruction?, pageUrl?, taskCompleted? }. We render one judge-readable + // step per action, same as the REST runner's _format_steps. 
+ const parts = [`Step ${i}:`]; + if (act?.type) parts.push(`Type: ${act.type}`); + if (act?.instruction) parts.push(`Instruction: ${act.instruction}`); + if (act?.action) parts.push(`Action: ${act.action}`); + if (act?.reasoning) parts.push(`Reasoning: ${act.reasoning}`); + if (act?.pageUrl) parts.push(`URL: ${act.pageUrl}`); + if (act?.taskCompleted) parts.push("TaskCompleted: true"); + return parts.join("\n"); +} + +async function main() { + const task = process.env.TASK_DESCRIPTION; + if (!task) fail("TASK_DESCRIPTION env var is required"); + for (const k of ["BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID"]) { + if (!process.env[k]) fail(`missing env var: ${k}`); + } + + process.stderr.write( + `[browserbase-agent] model=${MODEL} maxSteps=${MAX_STEPS}\n` + ); + + const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY, + projectId: process.env.BROWSERBASE_PROJECT_ID, + // With `model` on the constructor and no provider key on env, Stagehand + // routes inference through the Browserbase Model Gateway. The Python + // wrapper scrubs provider keys before spawning us when gateway mode is + // requested (the default). + model: MODEL, + verbose: VERBOSE, + disablePino: true, + logger: (line) => { + if ((line?.level ?? 1) > VERBOSE) return; + const tag = line.level === 0 ? "ERR" : line.level === 2 ? "DBG" : "INF"; + const cat = line.category ? 
`[${line.category}] ` : ""; + process.stderr.write(`[stagehand:${tag}] ${cat}${line.message}\n`); + }, + }); + + const t0 = Date.now(); + try { + await stagehand.init(); + } catch (err) { + fail(`stagehand.init failed: ${err?.message || err}`); + } + + const sessionId = stagehand.browserbaseSessionID; + const recordingUrl = `https://browserbase.com/sessions/${sessionId}`; + process.stderr.write(`[browserbase-agent] session=${sessionId}\n`); + process.stderr.write(`[browserbase-agent] watch=${recordingUrl}\n`); + + const agent = stagehand.agent({ systemPrompt: SYSTEM_PROMPT }); + + let result; + let agentError = null; + try { + result = await agent.execute({ + instruction: task, + maxSteps: MAX_STEPS, + }); + } catch (err) { + agentError = err?.stack || String(err); + process.stderr.write(`[browserbase-agent] agent error: ${agentError}\n`); + } finally { + await stagehand.close().catch(() => {}); + } + + const durationSeconds = (Date.now() - t0) / 1000; + + if (agentError && !result) { + fail(`agent.execute threw: ${agentError}`, { + duration_seconds: durationSeconds, + session_id: sessionId, + recording_url: recordingUrl, + }); + } + + const actions = Array.isArray(result?.actions) ? result.actions : []; + const message = result?.message || "[browserbase_no_output]"; + const completed = !!result?.completed; + const finalResult = + completed || message.startsWith("[browserbase_") + ? message + : `[browserbase_incomplete] ${message}`; + + const out = { + final_result: finalResult, + steps: actions.map((a, i) => formatStep(a, i + 1)), + screenshots_b64: [], // Stagehand agent.execute doesn't surface shots directly. + num_steps: actions.length, + duration_seconds: durationSeconds, + // Token counts are in result.usage but Browserbase gateway pricing + // isn't exposed per-token. Leave at 0 (matches the REST runner) until + // we wire static prices through. 
+ cost: 0, + session_id: sessionId, + recording_url: recordingUrl, + }; + process.stdout.write(JSON.stringify(out)); +} + +main().catch((err) => { + process.stderr.write(`[browserbase-agent] fatal: ${err?.stack || err}\n`); + fail(`fatal: ${err?.message || err}`); +}); diff --git a/frameworks/browserbase_agent/package.json b/frameworks/browserbase_agent/package.json new file mode 100644 index 0000000..58d1e87 --- /dev/null +++ b/frameworks/browserbase_agent/package.json @@ -0,0 +1,9 @@ +{ + "name": "benchmark-browserbase-agent-executor", + "private": true, + "type": "module", + "description": "Node executor for the browserbase-agent eval framework: drives Stagehand SDK against Browserbase cloud. Pinned to a Stagehand version that has the opus-4-7 temperature fix (PRs #2006/#2018, shipped in client 3.4.0).", + "dependencies": { + "@browserbasehq/stagehand": "^3.4.0" + } +} diff --git a/frameworks/browserbase_agent/run_task.py b/frameworks/browserbase_agent/run_task.py new file mode 100644 index 0000000..2e5f1d3 --- /dev/null +++ b/frameworks/browserbase_agent/run_task.py @@ -0,0 +1,216 @@ +"""Run a single benchmark task using the Stagehand agent SDK (client-side) on +Browserbase cloud. + +We used to dispatch against the hosted Stagehand REST API at +`api.stagehand.browserbase.com/v1` (no Node deps, pure Python HTTP). That +endpoint is alpha and pinned to an old Stagehand server build that predates +the opus-4-7 temperature fix (Stagehand PRs #2006/#2018, shipped in +stagehand-server-v3 v3.6.5 on May 6 2026), so opus-4-7 + the hosted API +silently dies inside the Stagehand `fillForm` tool. Out of our control. + +Instead this runner shells out to a Node executor (`executor.mjs`) that +imports `@browserbasehq/stagehand` directly, pinned in package.json to a +client release that has the fix. Same approach Browserbase tells customers +to use for production (deploy the SDK in your own runtime). 
Joint system +benchmarked is unchanged: (Stagehand agent + Browserbase cloud browser + +model). + +Model routing: by default Stagehand auto-routes through the Browserbase +Model Gateway when only the Browserbase API key is set on the constructor +and no provider env key is present. We scrub provider keys +(ANTHROPIC/OPENAI/GOOGLE/GOOGLE_GENERATIVE_AI/GEMINI) from the spawn env +when `use_gateway` (default true) so the SDK picks the gateway path even +though our workflow secrets normally inject them globally. Set +`use_gateway=false` via params to fall back to direct-provider billing +(useful for models the gateway hasn't onboarded yet). + +Concurrency: limited by the Browserbase plan, NOT by our infra. The +framework registry sets `max_concurrent_override` to match +`browsers/browserbase.py` (currently 20). +""" + +import asyncio +import json +import os +import sys +import time +from functools import partial +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_steps": "Max Stagehand agent steps per task (default: 25).", + "use_gateway": ( + "Route inference via Browserbase Model Gateway (true/false, " + "default: true). When true, the runner scrubs provider env keys " + "from the Node subprocess so Stagehand auto-routes via gateway. " + "When false, provider env keys pass through and the SDK bills the " + "provider directly. Use false for models the gateway hasn't " + "onboarded yet." + ), +} + +# Map benchmark model aliases to Stagehand gateway slugs. Slugs +# already containing '/' pass through verbatim. 
+MODEL_MAP = { + "claude-sonnet-4-6": "anthropic/claude-sonnet-4-6", + "claude-sonnet-4-5": "anthropic/claude-sonnet-4-5", + "claude-opus-4-5": "anthropic/claude-opus-4-5", + "claude-opus-4-6": "anthropic/claude-opus-4-6", + "claude-opus-4-7": "anthropic/claude-opus-4-7", + "gpt-5": "openai/gpt-5", + "gpt-5-mini": "openai/gpt-5-mini", + "gemini-2.5-flash": "google/gemini-2.5-flash", + "gemini-2.5-pro": "google/gemini-2.5-pro", +} + +EXECUTOR_DIR = Path(__file__).resolve().parent +EXECUTOR_SCRIPT = EXECUTOR_DIR / "executor.mjs" + +# Provider env keys to scrub when running in gateway mode. Anything Stagehand +# might autoload (per https://docs.stagehand.dev/v3/configuration/models -- +# "Error: API key not found" section). +_PROVIDER_ENV_KEYS = ( + "ANTHROPIC_API_KEY", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GOOGLE_GENERATIVE_AI_API_KEY", + "GEMINI_API_KEY", +) + + +def _resolve_model(model_name: str) -> str: + if "/" in model_name: + return model_name + if model_name in MODEL_MAP: + return MODEL_MAP[model_name] + raise ValueError( + f"Model '{model_name}' is not in MODEL_MAP and is not an explicit " + f"`provider/model` slug. Extend MODEL_MAP or pass an explicit slug." + ) + + +def _build_env(model_slug: str, max_steps: int, use_gateway: bool) -> dict: + """Construct the env dict for the Node subprocess. + + Forwards Browserbase creds + task config. If `use_gateway`, strips + provider keys so Stagehand auto-routes via the Model Gateway. + """ + env = dict(os.environ) + env["STAGEHAND_MODEL"] = model_slug + env["MAX_STEPS"] = str(max_steps) + # Pass through BROWSERBASE_* unchanged (required by SDK). 
+ if use_gateway: + for k in _PROVIDER_ENV_KEYS: + env.pop(k, None) + return env + + +async def execute( + task_description: str, model_name: str, max_steps: int, use_gateway: bool +) -> ExecutionResult: + """Spawn the Node executor, parse its single-JSON stdout into ExecutionResult.""" + model_slug = _resolve_model(model_name) + print( + f"Browserbase Stagehand SDK model_slug={model_slug} " + f"max_steps={max_steps} use_gateway={use_gateway}" + ) + + env = _build_env(model_slug, max_steps, use_gateway) + env["TASK_DESCRIPTION"] = task_description + + t0 = time.time() + # Use asyncio subprocess so run_and_judge's outer asyncio.wait_for can + # cancel us cleanly on timeout. + proc = await asyncio.create_subprocess_exec( + "node", + str(EXECUTOR_SCRIPT), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=env, + cwd=str(EXECUTOR_DIR), + ) + stdout, stderr = await proc.communicate() + duration = time.time() - t0 + + stderr_text = stderr.decode("utf-8", errors="replace") + if stderr_text: + # Stream Stagehand's logger output to our stdout for runner-log + # debugging. Each line is already prefixed by the executor. + print(stderr_text, end="") + + if proc.returncode != 0: + # The executor's `fail()` path always exits 0 with a valid JSON + # payload, so a non-zero return is a true crash (e.g. Node missing, + # uncaught throw outside main, OOM). Surface as a failed datapoint + # via the run_and_judge exception path. + raise RuntimeError( + f"executor.mjs crashed: returncode={proc.returncode}, " + f"stderr_tail={stderr_text[-500:]!r}" + ) + + try: + data = json.loads(stdout.decode("utf-8", errors="replace")) + except json.JSONDecodeError as e: + raise RuntimeError( + f"executor.mjs produced invalid JSON: {e}; " + f"stdout_head={stdout[:500]!r}" + ) + + # Prefer the executor's measured duration if it set one. 
+ duration_seconds = float(data.get("duration_seconds") or duration) + + return ExecutionResult( + final_result=data.get("final_result", ""), + steps=data.get("steps") or [], + screenshots_b64=data.get("screenshots_b64") or [], + num_steps=int(data.get("num_steps") or 0), + duration_seconds=duration_seconds, + cost=float(data.get("cost") or 0.0), + ) + + +async def main(): + params = validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + model_name = os.environ["MODEL"] + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + max_steps = int(params.get("max_steps", "25")) + use_gateway = params.get("use_gateway", "true").lower() != "false" + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + execute_fn = partial( + execute, + model_name=model_name, + max_steps=max_steps, + use_gateway=use_gateway, + ) + await run_and_judge(task, execute_fn) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/but/__init__.py b/frameworks/but/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/but/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/but/run_task.py b/frameworks/but/run_task.py new file mode 100644 index 0000000..30066cb --- /dev/null +++ b/frameworks/but/run_task.py @@ -0,0 +1,441 @@ +"""Run a single benchmark task using browser-use-terminal (`but`). + +`but` is a browser-specific LLM agent harness: it owns its own agent loop, +provides an editable Python REPL tool with raw CDP helpers +(`goto_url`, `js`, `capture_screenshot`, `click_at_xy`, `fill_input`, ...), +streams screenshots inline to the model, and persists a JSONL event log +per session. See https://github.com/browser-use/browser-use-terminal. 
+ +Browser wiring: we pre-provision a `browser-use-cloud` session via the +v3 API (same pattern as bcode/cch) and hand `but` the WebSocket CDP URL +via `--browser cdp --cdp-ws ` (also exported as `BU_CDP_WS` env; +`but`'s `_first_env("BU_CDP_WS", ...)` honors it as a fallback). `but` +attaches to our pre-allocated browser instead of provisioning one. + +Invocation: + uv run browser-use-terminal run \\ + --state-dir \\ + --provider

--model \\ + --browser cdp --cdp-ws \\ + --max-turns 80 \\ + "\\n\\n" + +`but run` is synchronous: it blocks until the agent calls `done` or hits +`--max-turns`, prints a session metadata JSON to stdout, then exits. The +agent's per-turn signals (tool calls, model usage, screenshots, final +result) live in `/sessions//events.jsonl`. We +parse that file to extract steps, cost, and the final result, and walk +`/browser/screenshots/` to feed PNGs to the judge. + +Provider resolution: benchmark aliases get an explicit `--provider` +chosen by substring (claude->anthropic, gpt->openai, glm->zai, qwen->qwen). +The `openai` provider in `but` reads `OPENAI_API_KEY` (already a +workflow secret); the `codex` provider needs Codex subscription auth +that we do not have on CI, so it is NOT auto-selected. +""" + +import asyncio +import base64 +import json +import os +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from lmnr import Laminar +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", + "max_turns": "Maximum model/tool turns before failing the run (default: 80, passed to `but run --max-turns`).", + "framework_repo": "Override the GitHub repo for browser-use-terminal install (default: browser-use/browser-use-terminal). Consumed by the workflow install step.", + "agent_mode": "Override the agent instruction mode for `but` (auto|browser|codex, default: leave unset -> `but` picks).", +} + +# `but` is installed as a uv-managed Python package at /tmp/but in the +# workflow install step. 
We invoke it via `uv run --project /tmp/but +# browser-use-terminal run ...` from that workdir so the project's +# console_scripts entry point resolves. +BUT_PROJECT_DIR = "/tmp/but" + +# system_prompt.md sits next to this file. +SYSTEM_PROMPT_PATH = Path(__file__).resolve().parent / "system_prompt.md" + +# State dir + screenshot scan path. One per task to avoid cross-talk. +STATE_ROOT = Path("/tmp/but_state") + +# Map benchmark model alias to (provider, model). Order matters: claude +# before gpt because "gpt" is a common prefix and we want claude to win +# on `claude-*` slugs. +_PROVIDER_KEYPHRASES: tuple[tuple[str, str], ...] = ( + ("claude", "anthropic"), + ("gpt", "openai"), + ("o1", "openai"), + ("o3", "openai"), + ("o4", "openai"), + ("glm", "zai"), + ("qwen", "qwen"), +) + + +def _resolve_provider(model_name: str) -> str: + lower = model_name.lower() + for key, provider in _PROVIDER_KEYPHRASES: + if key in lower: + return provider + raise ValueError( + f"but: cannot infer provider for model {model_name!r}. " + f"Add a keyphrase to _PROVIDER_KEYPHRASES in frameworks/but/run_task.py." + ) + + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser() -> tuple[str, str]: + """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +def _format_step_from_event(event: dict) -> str | None: + """Turn one events.jsonl entry into a short step string (or None).""" + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "tool.started": + name = payload.get("name") or "?" + args = payload.get("arguments") or {} + # Python REPL: dump the code field (sometimes named 'code' or 'source'). + if name in ("python", "python_browser"): + code = (args.get("code") or args.get("source") or "").strip() + return f"python: {code[:2000]}" + if name in ("bash", "shell", "shell_start"): + cmd = (args.get("command") or args.get("script") or "").strip() + return f"{name}: {cmd[:2000]}" + if name in ("read", "write", "edit"): + path = args.get("path") or args.get("filePath") or "" + return f"{name}: {path}" + if name == "done": + result = (args.get("result") or "").strip() + return f"done: {result[:2000]}" + try: + return f"{name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return name + if etype == "assistant.message" or etype == "message.assistant": + text = (payload.get("text") or payload.get("content") or "").strip() + return f"text: {text[:2000]}" if text else None + if etype == "reasoning" or etype == "assistant.reasoning": + text = (payload.get("text") or payload.get("content") or "").strip() + return f"thinking: {text[:2000]}" if text else None + return None + + +def _read_events(events_path: Path) -> list[dict]: + if not events_path.exists(): + return [] + events: list[dict] = [] + with 
events_path.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + return events + + +def _collect_screenshots(artifact_dir: Path) -> list[str]: + """Read every PNG/JPEG `but` wrote to /browser/screenshots/.""" + shots_dir = artifact_dir / "browser" / "screenshots" + if not shots_dir.exists(): + return [] + paths = sorted(p for p in shots_dir.iterdir() if p.is_file() and p.suffix.lower() in (".png", ".jpeg", ".jpg")) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _find_session_dir(state_dir: Path, session_id: str | None) -> Path | None: + """Resolve the session dir from state_dir. If session_id is unknown, + pick the most recently modified session subdir.""" + sessions_root = state_dir / "sessions" + if not sessions_root.exists(): + return None + if session_id: + candidate = sessions_root / session_id + if candidate.exists(): + return candidate + subdirs = [p for p in sessions_root.iterdir() if p.is_dir()] + if not subdirs: + return None + return max(subdirs, key=lambda p: p.stat().st_mtime) + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while line := await proc.stderr.readline(): + s = line.decode("utf-8", errors="replace").rstrip("\n") + buf.append(s) + print(f"[but-stderr] {s}", flush=True) + + +async def _iter_lines(stream: asyncio.StreamReader): + buf = bytearray() + while chunk := await stream.read(1 << 16): + buf.extend(chunk) + while (nl := buf.find(b"\n")) >= 0: + yield bytes(buf[:nl]) + del buf[: nl + 1] + if buf: + yield bytes(buf) + + +async def execute(task_description: str) -> ExecutionResult: + params = parse_params() + validate_params(params, ACCEPTED_PARAMS) + model = os.environ["MODEL"] + provider = _resolve_provider(model) + max_turns = int(params.get("max_turns") or 80) + task_idx = os.environ.get("TASK_INDEX", 
"0") + + # Pre-provision the browser. `but` honors `BU_CDP_WS` natively AND we + # pass `--cdp-ws` explicitly with `--browser cdp` to make the attach + # deterministic and visible in the spawn cmdline. + browser_id, cdp_ws = _start_browser() + + # Isolate state dir per task so concurrent runs in the same workflow + # don't collide on the JSONL event log or screenshot dir. + state_dir = STATE_ROOT / f"task-{task_idx}-{int(time.time() * 1000)}" + if state_dir.exists(): + shutil.rmtree(state_dir) + state_dir.mkdir(parents=True) + + # Laminar parent-span: same pattern as bcode. `but` does not (yet) + # honor LMNR_PARENT_SPAN_CONTEXT, so this is a forward-compat hook -- + # passing the env var costs nothing on the current version. + parent_span_context = Laminar.serialize_span_context() + + system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + # Default state-dir for `but`. Explicitly passed below too. + "LLM_BROWSER_STATE_DIR": str(state_dir), + } + if parent_span_context: + env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context + + # NOTE: `--state-dir` is a TOP-LEVEL arg on browser-use-terminal -- it + # must come BEFORE the `run` subcommand, otherwise argparse rejects it + # as an unrecognized argument on `run`. Same for `--config`. 
+ cmd = [ + "uv", "run", "--project", BUT_PROJECT_DIR, "--no-sync", + "browser-use-terminal", + "--state-dir", str(state_dir), + "run", + "--provider", provider, + "--model", model, + "--browser", "cdp", + "--cdp-ws", cdp_ws, + "--max-turns", str(max_turns), + ] + agent_mode = (params.get("agent_mode") or "").strip().lower() + if agent_mode: + cmd.extend(["--agent-mode", agent_mode]) + cmd.append(full_task) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + errors: list[str] = [] + stderr_buf: list[str] = [] + stdout_chunks: list[str] = [] + session_id: str | None = None + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=BUT_PROJECT_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + try: + async for raw in _iter_lines(proc.stdout): + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if line: + stdout_chunks.append(line) + print(f"[but-stdout] {line[:500]}", flush=True) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_id) + + duration = time.time() - start + + # Parse the trailing JSON metadata `but run` prints (session.to_dict()). + # Even if we fail to parse, we can still recover from the events.jsonl. + try: + joined = "\n".join(stdout_chunks).strip() + # Take the last balanced {...} block -- `but run` prints exactly one. 
+ last_brace_open = joined.rfind("{") + if last_brace_open != -1: + meta = json.loads(joined[last_brace_open:]) + session_id = str(meta.get("id") or "") or None + except Exception: + session_id = None + + session_dir = _find_session_dir(state_dir, session_id) + events: list[dict] = [] + artifact_dir: Path | None = None + if session_dir is not None: + events_path = session_dir / "events.jsonl" + events = _read_events(events_path) + artifact_dir = session_dir / "artifacts" + + for event in events: + if (s := _format_step_from_event(event)): + steps.append(s) + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "session.done": + done_result = (payload.get("result") or "").strip() + if done_result: + final_text = done_result + elif etype == "model.usage": + cost_usd = payload.get("cost_usd") + if cost_usd is not None: + try: + total_cost += float(cost_usd) + except (TypeError, ValueError): + pass + elif etype in ("tool.failed", "error", "session.failed"): + err = payload.get("error") or payload.get("message") or "" + if err: + errors.append(str(err)) + print(f"[but-error] {str(err)[:500]}", flush=True) + + # Fallback: scrape the last assistant message if `done` was never called. + if not final_text: + for event in reversed(events): + if (event.get("type") or "") in ("assistant.message", "message.assistant"): + text = ((event.get("payload") or {}).get("text") or "").strip() + if text: + final_text = text + break + + if proc.returncode not in (0, None) and not final_text and not steps: + raise RuntimeError( + f"but exited with code {proc.returncode} before producing output. 
" + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + answer = (final_text or "").strip() + if errors and not answer: + final_result = f"[but_error] {errors[0][:500]}" + elif errors: + final_result = f"[but_error_recovered] {answer}" + else: + final_result = answer or "[but_no_output]" + + screenshots = _collect_screenshots(artifact_dir) if artifact_dir is not None else [] + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=screenshots, + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/but/system_prompt.md b/frameworks/but/system_prompt.md new file mode 100644 index 0000000..2a38214 --- /dev/null +++ b/frameworks/but/system_prompt.md @@ -0,0 +1,10 @@ +You are evaluating a benchmark task by driving a real browser via browser-use-terminal (`but`). + +Hard rules: +- A live remote browser is pre-attached to your session via the explicit CDP backend (`--browser cdp`). Do NOT call `cdp_connect` with a different URL, do NOT spawn a new browser, do NOT launch a local Chromium. Just use the browser that is already attached. +- Drive the browser through the Python REPL tool. 
Useful built-ins: `goto_url(url)`, `js(expr)`, `wait_for_load()`, `wait_for_network_idle()`, `capture_screenshot(path=None, attach=True)`, `click_at_xy(x, y)`, `fill_input(selector, text)`, `type_text(text)`, `press_key(key)`, `scroll()`, `recent_console()`, `recent_network_failures()`, and raw `cdp("Method", {...})`. +- Take screenshots whenever you need to verify page state. Calling `capture_screenshot(attach=True)` attaches the image to your next turn so you can see it inline. Screenshots are also saved to disk for the judge. +- Do not ask clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Work fully autonomously. Do not stop early to summarize partial progress -- keep driving the browser until the task is genuinely complete (or you have hit a dead end). +- When the task is complete, call the `done` tool with your final answer as the `result` argument. The judge reads the `result` you pass to `done` as your final answer to the task. +- If the task has no textual answer (e.g. "book a flight"), pass `result="done"` to the `done` tool and describe what you did in your preceding text. diff --git a/frameworks/but_rust/__init__.py b/frameworks/but_rust/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/but_rust/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/but_rust/run_task.py b/frameworks/but_rust/run_task.py new file mode 100644 index 0000000..f54b9b1 --- /dev/null +++ b/frameworks/but_rust/run_task.py @@ -0,0 +1,382 @@ +"""Run a single benchmark task using the Rust browser-use-terminal (`but-rust`). + +This is the rust-rewrite branch of `browser-use/browser-use-terminal`. The +old Python `but` framework wraps `main`; this one wraps `rust-rewrite`. +Completely independent install path + invocation, gated on +`inputs.framework == 'but-rust'` in the workflow so it cannot affect any +other framework. 
+ +Architecture differences vs Python `but`: +- Cargo workspace; the CLI is a Rust binary at + `/target/release/browser-use-terminal`. +- Subcommand-per-provider: `run-openai --model `, plus + `run-codex`, `run-anthropic`, `run-openrouter`. No `--provider` flag. +- No `--browser` flag at all. Browser ops live in a Python worker + process (`python/llm_browser_worker/worker.py`) spawned by Rust; that + worker honors `BU_CDP_URL`/`BU_CDP_WS` and connects through the + browser-harness Python package. We pre-provision a browser-use-cloud + CDP WS the same way `but`/`bcode` do and pass it via `BU_CDP_WS`. +- State lives in SQLite at `/state.db`; events are read out + via `events ` (JSON lines). +Browser harness needs to be importable in the worker venv as +`browser_harness`. The workflow's install step `uv pip install` the +browser-harness repo at `BUT_RUST_HARNESS_REF` (default: main) into the +project venv at `/tmp/but-rust/.venv` so `import browser_harness.admin` +in the worker resolves. + +The runner shells out twice per task: +1. `run-openai/run-codex/...` -- agent loop, prints session_id on stdout. +2. `events ` -- JSON-lines event dump, parsed into steps + + final result + cost + screenshot paths. 
+""" + +import asyncio +import base64 +import json +import os +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from lmnr import Laminar +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", + "max_turns": "Maximum model/tool turns before failing (currently informational -- but-rust does not expose this on the run-* subcommand; the dataset-run subcommand has it but we are not using that path).", + "framework_repo": "Override the GitHub repo for browser-use-terminal install (default: browser-use/browser-use-terminal). Consumed by the workflow install step.", + "harness_repo": "Override the browser-harness GitHub repo installed into the worker venv (default: browser-use/browser-harness). Consumed by the workflow install step.", + "harness_ref": "Override the browser-harness ref/branch/commit (default: main). Consumed by the workflow install step.", +} + +# Workflow install step builds the binary here. +BUT_RUST_REPO_DIR = "/tmp/but-rust" +BUT_RUST_BIN = f"{BUT_RUST_REPO_DIR}/target/release/browser-use-terminal" + +SYSTEM_PROMPT_PATH = Path(__file__).resolve().parent / "system_prompt.md" + +STATE_ROOT = Path("/tmp/but_rust_state") + +# Map benchmark model alias to (rust subcommand, model arg). Order matters. +_PROVIDER_SUBCMDS: tuple[tuple[str, str], ...] = ( + ("claude", "run-anthropic"), + ("gpt", "run-openai"), + ("o1", "run-openai"), + ("o3", "run-openai"), + ("o4", "run-openai"), + # No native zai/qwen in but-rust; route via OpenRouter when the model + # name carries an OpenRouter-compatible provider/model slug. 
+ ("glm", "run-openrouter"), + ("qwen", "run-openrouter"), +) + + +def _resolve_subcommand(model_name: str) -> str: + lower = model_name.lower() + for key, subcmd in _PROVIDER_SUBCMDS: + if key in lower: + return subcmd + raise ValueError( + f"but-rust: cannot infer subcommand for model {model_name!r}. " + f"Add a keyphrase to _PROVIDER_SUBCMDS in frameworks/but_rust/run_task.py." + ) + + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser() -> tuple[str, str]: + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +def _format_step_from_event(event: dict) -> str | None: + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "tool.started": + name = payload.get("name") or "?" 
+ args = payload.get("arguments") or {} + if name == "python": + code = (args.get("code") or args.get("source") or "").strip() + return f"python: {code[:2000]}" + if name in ("bash", "shell"): + cmd = (args.get("command") or args.get("script") or "").strip() + return f"{name}: {cmd[:2000]}" + if name in ("read", "write", "edit"): + path = args.get("path") or args.get("filePath") or "" + return f"{name}: {path}" + if name == "done": + result = (args.get("result") or "").strip() + return f"done: {result[:2000]}" + try: + return f"{name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return name + if etype in ("assistant.message", "message.assistant"): + text = (payload.get("text") or payload.get("content") or "").strip() + return f"text: {text[:2000]}" if text else None + if etype in ("reasoning", "assistant.reasoning"): + text = (payload.get("text") or payload.get("content") or "").strip() + return f"thinking: {text[:2000]}" if text else None + return None + + +async def _read_stream(stream: asyncio.StreamReader, label: str, buf: list[str], echo: bool = True) -> None: + while line := await stream.readline(): + s = line.decode("utf-8", errors="replace").rstrip("\n") + buf.append(s) + if echo: + print(f"[{label}] {s[:500]}", flush=True) + + +def _collect_screenshots(state_dir: Path, session_id: str) -> list[str]: + """Read PNGs/JPEGs from `/artifacts//images/`.""" + images_dir = state_dir / "artifacts" / session_id / "images" + if not images_dir.exists(): + return [] + paths = sorted(p for p in images_dir.iterdir() if p.is_file() and p.suffix.lower() in (".png", ".jpeg", ".jpg")) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +async def execute(task_description: str) -> ExecutionResult: + params = parse_params() + validate_params(params, ACCEPTED_PARAMS) + model = os.environ["MODEL"] + subcommand = _resolve_subcommand(model) + task_idx = os.environ.get("TASK_INDEX", "0") + + browser_id, cdp_ws = _start_browser() + + 
state_dir = STATE_ROOT / f"task-{task_idx}-{int(time.time() * 1000)}" + if state_dir.exists(): + shutil.rmtree(state_dir) + state_dir.mkdir(parents=True) + + parent_span_context = Laminar.serialize_span_context() + + system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + + env = { + **os.environ, + # The Python worker (spawned by the Rust agent loop) honors BU_CDP_WS + # directly via `_ensure_managed_chrome`/`_ensure_cloud_browser` short + # circuits. Pass both URL forms for robustness. + "BU_CDP_WS": cdp_ws, + # Force flush on one-shot CLI runs so OTLP spans actually leave the + # process before exit (see docs/README on this branch). + "LLM_BROWSER_LAMINAR_FLUSH_ON_FINISH": "1", + } + if parent_span_context: + # Forward-compat: but-rust telemetry doesn't honor this yet, but it + # doesn't error on unknown env either. + env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context + + # `--state-dir` is a TOP-LEVEL arg on the Rust CLI -- must come BEFORE + # the subcommand. 
+ cmd_run = [ + BUT_RUST_BIN, + "--state-dir", str(state_dir), + subcommand, + full_task, + "--model", model, + ] + + start = time.time() + stdout_buf: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd_run, + cwd=BUT_RUST_REPO_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stdout_task = asyncio.create_task(_read_stream(proc.stdout, "but-rust-stdout", stdout_buf)) + stderr_task = asyncio.create_task(_read_stream(proc.stderr, "but-rust-stderr", stderr_buf)) + + try: + await proc.wait() + await asyncio.wait_for(stdout_task, timeout=10) + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + for t in (stdout_task, stderr_task): + if not t.done(): + t.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + + # `run-openai`/etc print the session_id as the final non-empty stdout line. + session_id = "" + for line in reversed(stdout_buf): + line = line.strip() + if line and not line.startswith("{"): + session_id = line + break + + if not session_id: + _stop_browser(browser_id) + raise RuntimeError( + f"but-rust: no session_id captured from stdout (exit={proc.returncode}). " + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + # Dump events for this session. 
+ cmd_events = [BUT_RUST_BIN, "--state-dir", str(state_dir), "events", session_id] + events_proc = await asyncio.create_subprocess_exec( + *cmd_events, + cwd=BUT_RUST_REPO_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + events_stdout, events_stderr = await events_proc.communicate() + _stop_browser(browser_id) + duration = time.time() - start + + events: list[dict] = [] + for line in events_stdout.decode("utf-8", errors="replace").splitlines(): + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + errors: list[str] = [] + + for event in events: + if (s := _format_step_from_event(event)): + steps.append(s) + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "session.done": + done_result = (payload.get("result") or "").strip() + if done_result: + final_text = done_result + elif etype in ("model.usage", "llm.usage"): + cost_usd = payload.get("cost_usd") or payload.get("cost") + if cost_usd is not None: + try: + total_cost += float(cost_usd) + except (TypeError, ValueError): + pass + elif etype in ("tool.failed", "session.failed", "error"): + err = payload.get("error") or payload.get("message") or "" + if err: + errors.append(str(err)) + print(f"[but-rust-error] {str(err)[:500]}", flush=True) + + if not final_text: + for event in reversed(events): + if (event.get("type") or "") in ("assistant.message", "message.assistant"): + text = ((event.get("payload") or {}).get("text") or "").strip() + if text: + final_text = text + break + + if proc.returncode not in (0, None) and not final_text and not steps: + raise RuntimeError( + f"but-rust exited with code {proc.returncode} before producing output. 
" + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + answer = (final_text or "").strip() + if errors and not answer: + final_result = f"[but_rust_error] {errors[0][:500]}" + elif errors: + final_result = f"[but_rust_error_recovered] {answer}" + else: + final_result = answer or "[but_rust_no_output]" + + screenshots = _collect_screenshots(state_dir, session_id) + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=screenshots, + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/but_rust/system_prompt.md b/frameworks/but_rust/system_prompt.md new file mode 100644 index 0000000..dca9a86 --- /dev/null +++ b/frameworks/but_rust/system_prompt.md @@ -0,0 +1,10 @@ +You are evaluating a benchmark task by driving a real browser via the Rust browser-use-terminal (`but-rust`). + +Hard rules: +- A live remote browser is pre-attached for you. The Python worker that owns browser ops reads `BU_CDP_WS` from its env and connects through browser-harness, so do NOT spawn a new browser, do NOT change the CDP endpoint. +- Drive the browser through the Python tool. 
Useful browser-harness helpers exposed in the Python namespace include `goto_url(url)`, `js(expr)`, `wait_for_load()`, `wait_for_network_idle()`, `capture_screenshot(path=None, attach=True)`, `click_at_xy(x, y)`, `fill_input(selector, text)`, `type_text(text)`, `press_key(key)`, `scroll()`, `recent_console()`, `recent_network_failures()`, and raw `cdp("Method", {...})`. +- Take screenshots whenever you need to verify page state. Calling `capture_screenshot(attach=True)` attaches the image to your next turn so you can see it inline. Screenshots are also saved to the session artifact dir for the judge. +- Do not ask clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Work fully autonomously. Do not stop early to summarize partial progress -- keep driving the browser until the task is genuinely complete (or you have hit a dead end). +- When the task is complete, call the `done` tool with your final answer as the `result` argument. The judge reads the `result` you pass to `done` as your final answer to the task. +- If the task has no textual answer (e.g. "book a flight"), pass `result="done"` to the `done` tool and describe what you did in your preceding text. diff --git a/frameworks/claude_code_harness/__init__.py b/frameworks/claude_code_harness/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/claude_code_harness/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/claude_code_harness/run_task.py b/frameworks/claude_code_harness/run_task.py new file mode 100644 index 0000000..dc0e18e --- /dev/null +++ b/frameworks/claude_code_harness/run_task.py @@ -0,0 +1,436 @@ +"""Run a single benchmark task using Claude Code driving browser-harness. 
+ +This framework wraps Claude Code (the CLI coding agent) around the browser-harness +repo: Claude Code owns the agent loop, we just hand it a task and a workdir +pre-loaded with the harness + a live browser daemon, then stream-parse its output. + +The joint system being benchmarked is (Claude Code + browser-harness + Claude model). +Pin `claude_code_version` and `framework_ref` for reproducible comparisons. +""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +# Harness is installed via `uv pip install /tmp/browser-harness` in the workflow, +# which exposes `admin`, `helpers`, `run`, `daemon` as top-level modules. +HARNESS_DIR = "/tmp/browser-harness" + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_turns": "Max Claude Code agentic turns (default: 100)", + "max_budget_usd": "Per-task API budget cap in USD (default: 10)", + "claude_code_version": "Claude Code npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-harness install (e.g. fork/browser-harness). Consumed by the workflow install step.", + "use_bare": "Pass --bare to claude to skip hook/MCP/plugin auto-discovery (true/false, default: true)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# Subtypes Claude Code emits in the terminal `result` event. 
Anything other than +# 'success' means the agent did not complete the task (usually a limit was hit). +# See: https://docs.claude.com/en/docs/claude-code/headless (stream-json spec) +RESULT_SUCCESS = "success" +LIMIT_SUBTYPES = { + "error_max_turns", + "error_max_tokens", + "error_max_budget_usd", + "error_during_execution", + "error_api_error", +} + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models (Claude Code requires them).""" + if not model_name.startswith("claude-"): + raise ValueError( + f"claude-code-harness requires a Claude model. Got: {model_name!r}. " + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _start_browser(browser_name: str, bu_name: str) -> dict: + """Provision a browser for the harness to attach to. 
Returns the cloud browser dict.""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for claude-code-harness: {browser_name}") + sys.path.insert(0, HARNESS_DIR) + from admin import start_remote_daemon # type: ignore + + return start_remote_daemon(name=bu_name) + + +def _stop_browser(browser_name: str, bu_name: str) -> None: + try: + sys.path.insert(0, HARNESS_DIR) + from admin import stop_remote_daemon # type: ignore + + if browser_name == "browser-use-cloud": + stop_remote_daemon(name=bu_name) + except Exception as e: + print(f"Warning: failed to stop harness daemon: {e}") + + +def _build_claude_cmd( + task_description: str, + model_name: str, + max_turns: int, + max_budget_usd: float, + use_bare: bool, +) -> list[str]: + cmd = [ + "claude", + "-p", + task_description, + "--model", + model_name, + "--dangerously-skip-permissions", + "--output-format", + "stream-json", + "--verbose", + "--max-turns", + str(max_turns), + "--max-budget-usd", + str(max_budget_usd), + "--append-system-prompt-file", + str(SYSTEM_PROMPT_FILE), + "--no-session-persistence", + ] + if use_bare: + cmd.append("--bare") + return cmd + + +def _format_assistant_block(block: dict) -> str | None: + """Turn a single assistant message content block into a step string.""" + btype = block.get("type") + if btype == "tool_use": + name = block.get("name", "?") + inp = block.get("input", {}) or {} + if name == "Bash": + return f"Bash: {(inp.get('command') or '').strip()[:2000]}" + if name in ("Edit", "Write", "Read"): + path = inp.get("file_path") or inp.get("path") or "" + return f"{name}: {path}" + try: + return f"{name}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return name + if btype == "text": + text = (block.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if btype == "thinking": + text = (block.get("thinking") or "").strip() + if not text: + return None + return f"thinking: {text[:2000]}" + return 
None + + +def _format_tool_result_block(block: dict) -> str | None: + """Turn a user message tool_result block into a step string.""" + if block.get("type") != "tool_result": + return None + content = block.get("content") + is_error = bool(block.get("is_error")) + prefix = "tool_error" if is_error else "tool_result" + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + if c.get("type") == "text": + parts.append(c.get("text", "")) + elif c.get("type") == "image": + parts.append("") + content = "\n".join(parts) + if not isinstance(content, str): + try: + content = json.dumps(content, default=str) + except Exception: + content = str(content) + content = content.strip() + if not content: + return None + # Cap per-step size so Laminar payloads stay reasonable. + return f"{prefix}: {content[:2000]}" + + +def _format_event_steps(event: dict) -> list[str]: + """Extract step strings from any stream-json event. Empty list = not a step.""" + etype = event.get("type") + if etype == "assistant": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_assistant_block(block) + if s: + steps.append(s) + return steps + if etype == "user": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_tool_result_block(block) + if s: + steps.append(s) + return steps + return [] + + +def _summarize_result_event(event: dict) -> tuple[str, bool, list[str]]: + """Parse the terminal `result` event. 
Returns (subtype, is_error, errors).""" + subtype = event.get("subtype") or RESULT_SUCCESS + is_error = bool(event.get("is_error")) + errors_raw = event.get("errors") or [] + errors = [str(e) for e in errors_raw] if isinstance(errors_raw, list) else [str(errors_raw)] + return subtype, is_error, errors + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + """Read stderr line-by-line, echo to our stdout, and buffer for later reporting.""" + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + # Surface to GitHub Actions log in real time. + print(f"[claude-stderr] {s}", flush=True) + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + task_index = os.environ.get("TASK_INDEX", "0") + max_turns = int(params.get("max_turns", "100")) + max_budget_usd = float(params.get("max_budget_usd", "10")) + use_bare = params.get("use_bare", "true").lower() != "false" + # task_timeout is consumed in main() before run_and_judge wraps execute. + + bu_name = f"eval-{task_index}" + _reset_shots_dir() + + # Pre-provision the browser so Claude starts with a live CDP attach. 
+ _start_browser(browser_name, bu_name) + + env = { + **os.environ, + "BU_NAME": bu_name, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + result_subtype: str | None = None + result_is_error = False + result_errors: list[str] = [] + stderr_buf: list[str] = [] + + # claude stream-json lines can be huge (tool_result blocks with full page + # HTML/text, assistant messages with signed thinking blocks). Default + # asyncio StreamReader line buffer is 64 KiB which raises ValueError on + # long lines, and even a larger limit has a ceiling. Read raw chunks and + # split on newlines ourselves. + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + """Yield one stream-json line at a time, regardless of line length.""" + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 # 64 KiB + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + # Emit every complete line in the buffer. 
+ while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + # Non-JSON line from claude (shouldn't happen in stream-json, but be safe) + print(f"[claude-stdout-raw] {line}", flush=True) + continue + + new_steps = _format_event_steps(event) + for s in new_steps: + steps.append(s) + # Echo each step so GitHub Actions log shows live progress. + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + # Terminal event + if event.get("type") == "result": + final_text = event.get("result") or "" + total_cost = float(event.get("total_cost_usd") or 0.0) + result_subtype, result_is_error, result_errors = _summarize_result_event(event) + print( + f"[claude-result] subtype={result_subtype} is_error={result_is_error} " + f"cost=${total_cost:.4f} errors={result_errors}", + flush=True, + ) + + # Wait for the process (stdout closed implies near-exit) + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[claude-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + # Drain remaining stderr + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_name, bu_name) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + # If we never saw a `result` event AND claude exited non-zero, that is a true + # hard error (e.g. CLI startup failure, killed by OS). 
Surface it. + if result_subtype is None and proc.returncode not in (0, None): + raise RuntimeError( + f"claude exited with code {proc.returncode} before emitting a result event. " + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + # Determine final_result text. + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + if result_subtype and result_subtype != RESULT_SUCCESS: + # Agent hit a limit or errored but Claude Code reported it cleanly. + # Preserve the datapoint: tag the final_result with the subtype and let the + # judge score whatever was accomplished. + err_suffix = f" errors={result_errors}" if result_errors else "" + if answer: + final_result = f"[{result_subtype}] {answer}{err_suffix}" + else: + final_result = f"[{result_subtype}] Agent did not complete task.{err_suffix}" + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + # Propagate task_timeout param to run_and_judge before it wraps execute(). 
+ early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/claude_code_harness/system_prompt.md b/frameworks/claude_code_harness/system_prompt.md new file mode 100644 index 0000000..d1566de --- /dev/null +++ b/frameworks/claude_code_harness/system_prompt.md @@ -0,0 +1,13 @@ +You are evaluating a benchmark task by driving a real browser through the browser-harness in the current working directory. + +Hard rules: +- Use the harness. Read `SKILL.md` and `helpers.py` first. Drive the browser via `browser-harness <<'PY' ... PY` heredocs -- do not install other browser tools, do not use Playwright directly, do not open a different repo. +- A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. +- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Do not edit files outside the current working directory. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. 
The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. diff --git a/frameworks/claude_code_harness_ab/__init__.py b/frameworks/claude_code_harness_ab/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/claude_code_harness_ab/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/claude_code_harness_ab/run_task.py b/frameworks/claude_code_harness_ab/run_task.py new file mode 100644 index 0000000..baddc77 --- /dev/null +++ b/frameworks/claude_code_harness_ab/run_task.py @@ -0,0 +1,460 @@ +"""Run a single benchmark task using Claude Code driving the vercel-labs/agent-browser CLI. + +This is the `agent-browser`-CDP variant of `claude-code-harness`. Claude Code owns +the agent loop; the agent drives a remote Chrome via the `agent-browser` CLI +(native Rust, single-process daemon). We pre-provision a browser-use-cloud session +and pass its WebSocket CDP URL via `BU_CDP_WS`; the agent connects with +`agent-browser --cdp "$BU_CDP_WS" open ` and the daemon auto-reattaches on +subsequent calls. + +The joint system being benchmarked is (Claude Code + agent-browser + Claude +model). Pin `claude_code_version`, `agent_browser_version`, and `framework_ref` +for reproducible comparisons against the Python `claude-code-harness` and +`claude-code-harness-js` frameworks. 
+""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_turns": "Max Claude Code agentic turns (default: 100)", + "max_budget_usd": "Per-task API budget cap in USD (default: 10)", + "claude_code_version": "Claude Code npm version; consumed by the workflow install step (default: latest)", + "agent_browser_version": "agent-browser npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for agent-browser install (e.g. fork/agent-browser). Consumed by the workflow install step.", + "use_bare": "Pass --bare to claude to skip hook/MCP/plugin auto-discovery (true/false, default: true)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +WORK_DIR = Path("/tmp/cch-ab-workdir") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# Subtypes Claude Code emits in the terminal `result` event. Anything other than +# 'success' means the agent did not complete the task (usually a limit was hit). 
+RESULT_SUCCESS = "success" +LIMIT_SUBTYPES = { + "error_max_turns", + "error_max_tokens", + "error_max_budget_usd", + "error_during_execution", + "error_api_error", +} + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models (Claude Code requires them).""" + if not model_name.startswith("claude-"): + raise ValueError( + f"claude-code-harness-ab requires a Claude model. Got: {model_name!r}. " + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_dir(p: Path) -> None: + if p.exists(): + shutil.rmtree(p) + p.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +# ---- Browser-Use Cloud session provisioning (mirrors cch-js / bcode runners) ---- + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser(browser_name: str) -> tuple[str, str]: + """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for claude-code-harness-ab: {browser_name}") + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +# ---- Claude Code invocation (identical to CCH/CCH-JS except for cwd) ---- + +def _build_claude_cmd( + task_description: str, + model_name: str, + max_turns: int, + max_budget_usd: float, + use_bare: bool, +) -> list[str]: + cmd = [ + "claude", + "-p", + task_description, + "--model", + model_name, + "--dangerously-skip-permissions", + "--output-format", + "stream-json", + "--verbose", + "--max-turns", + str(max_turns), + "--max-budget-usd", + str(max_budget_usd), + "--append-system-prompt-file", + str(SYSTEM_PROMPT_FILE), + "--no-session-persistence", + ] + if use_bare: + cmd.append("--bare") + return cmd + + +def _format_assistant_block(block: dict) -> str | None: + btype = block.get("type") + if btype == "tool_use": + name = block.get("name", "?") + inp = block.get("input", {}) or {} + if name == "Bash": + return f"Bash: {(inp.get('command') or '').strip()[:2000]}" + if name in ("Edit", "Write", "Read"): + path = inp.get("file_path") or inp.get("path") or "" + return f"{name}: {path}" + try: + return f"{name}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return name + if btype == "text": + text = (block.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if btype == "thinking": + text = (block.get("thinking") or "").strip() + if not text: + return None + return f"thinking: {text[:2000]}" + return None + + 
+def _format_tool_result_block(block: dict) -> str | None: + if block.get("type") != "tool_result": + return None + content = block.get("content") + is_error = bool(block.get("is_error")) + prefix = "tool_error" if is_error else "tool_result" + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + if c.get("type") == "text": + parts.append(c.get("text", "")) + elif c.get("type") == "image": + parts.append("") + content = "\n".join(parts) + if not isinstance(content, str): + try: + content = json.dumps(content, default=str) + except Exception: + content = str(content) + content = content.strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +def _format_event_steps(event: dict) -> list[str]: + etype = event.get("type") + if etype == "assistant": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_assistant_block(block) + if s: + steps.append(s) + return steps + if etype == "user": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_tool_result_block(block) + if s: + steps.append(s) + return steps + return [] + + +def _summarize_result_event(event: dict) -> tuple[str, bool, list[str]]: + subtype = event.get("subtype") or RESULT_SUCCESS + is_error = bool(event.get("is_error")) + errors_raw = event.get("errors") or [] + errors = [str(e) for e in errors_raw] if isinstance(errors_raw, list) else [str(errors_raw)] + return subtype, is_error, errors + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + print(f"[claude-stderr] {s}", flush=True) + + +async def _close_agent_browser_sessions() -> None: + """Best-effort: tell agent-browser 
to shut down all daemons. + + agent-browser spawns a per-session background daemon (one per + `--session` name). `close --all` quits every active session so a + leaked daemon does not survive across tasks on the same runner. + """ + try: + stop_proc = await asyncio.create_subprocess_exec( + "agent-browser", "close", "--all", + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await asyncio.wait_for(stop_proc.wait(), timeout=10) + except Exception: + pass + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + max_turns = int(params.get("max_turns", "100")) + max_budget_usd = float(params.get("max_budget_usd", "10")) + use_bare = params.get("use_bare", "true").lower() != "false" + + _reset_dir(SHOTS_DIR) + _reset_dir(WORK_DIR) + + # Pre-provision a remote browser; pass its WS URL to the agent via env. + # The agent runs: agent-browser --cdp "$BU_CDP_WS" open + browser_id, cdp_ws = _start_browser(browser_name) + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + # Don't let agent-browser try to auto-download Chrome at task time -- the + # workflow already ran `agent-browser install` and a remote browser is + # attached via --cdp anyway. 
+ "AGENT_BROWSER_SKIP_INSTALL": "1", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + result_subtype: str | None = None + result_is_error = False + result_errors: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[claude-stdout-raw] {line}", flush=True) + continue + + new_steps = _format_event_steps(event) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + if event.get("type") == "result": + final_text = event.get("result") or "" + total_cost = float(event.get("total_cost_usd") or 0.0) + result_subtype, result_is_error, result_errors = _summarize_result_event(event) + print( + f"[claude-result] subtype={result_subtype} is_error={result_is_error} " + f"cost=${total_cost:.4f} errors={result_errors}", + flush=True, + ) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[claude-runner] proc did not exit within 60s of stdout close; killing", 
flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + # Best-effort: close any agent-browser daemon(s) the agent left running + # so they don't leak across tasks on the same runner. + await _close_agent_browser_sessions() + _stop_browser(browser_id) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + if result_subtype is None and proc.returncode not in (0, None): + raise RuntimeError( + f"claude exited with code {proc.returncode} before emitting a result event. " + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + if result_subtype and result_subtype != RESULT_SUCCESS: + err_suffix = f" errors={result_errors}" if result_errors else "" + if answer: + final_result = f"[{result_subtype}] {answer}{err_suffix}" + else: + final_result = f"[{result_subtype}] Agent did not complete task.{err_suffix}" + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + 
task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/claude_code_harness_ab/system_prompt.md b/frameworks/claude_code_harness_ab/system_prompt.md new file mode 100644 index 0000000..eec2715 --- /dev/null +++ b/frameworks/claude_code_harness_ab/system_prompt.md @@ -0,0 +1,29 @@ +You are evaluating a benchmark task by driving a real browser through the `agent-browser` CLI from `vercel-labs/agent-browser`. + +Hard rules: +- Use the `agent-browser` CLI for every browser interaction. It is on your PATH. Do NOT install other browser tools, do NOT use Playwright/Puppeteer directly, do NOT call any built-in WebFetch -- drive the live browser via `agent-browser` only. +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from your environment and running: + ``` + agent-browser --cdp "$BU_CDP_WS" open + ``` + All subsequent `agent-browser ` calls automatically reuse this daemon -- you do NOT need to pass `--cdp` again, and you should NOT call `agent-browser open` a second time without a URL. Just issue the next verb (`snapshot`, `click @e2`, `screenshot`, etc.). +- Before issuing your first command, read the bundled skill so you know the full command surface and current best-practice workflow: + ``` + agent-browser skills get core + ``` + Use `agent-browser skills get core --full` for the complete command reference. The CLI also accepts `--help` on any subcommand. +- Prefer the accessibility-tree workflow: `agent-browser snapshot -i` to list interactive elements with stable `@eN` refs, then `agent-browser click @eN` / `agent-browser fill @eN ""` to interact. Fall back to CSS selectors or `find role --name "..."` semantic locators when refs are insufficient. 
Save every screenshot to `/tmp/shots/step_<N>.png`
We pre-provision a +browser-use-cloud session and pass its WebSocket CDP URL via `BU_CDP_WS`; the +agent connects with `browser-use --cdp-url "$BU_CDP_WS" open ` and the +per-session daemon reuses that attachment for all subsequent commands. + +The joint system being benchmarked is (Claude Code + browser-use CLI + Claude +model). Pin `claude_code_version` and `framework_ref` (the `browser-use/browser-use` +ref) for reproducible comparisons against `claude-code-harness`, +`claude-code-harness-js`, and `claude-code-harness-ab`. +""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_turns": "Max Claude Code agentic turns (default: 100)", + "max_budget_usd": "Per-task API budget cap in USD (default: 10)", + "claude_code_version": "Claude Code npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-use install (e.g. fork/browser-use). Consumed by the workflow install step.", + "use_bare": "Pass --bare to claude to skip hook/MCP/plugin auto-discovery (true/false, default: true)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +WORK_DIR = Path("/tmp/cch-bu-cli-workdir") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# Subtypes Claude Code emits in the terminal `result` event. 
Anything other than +# 'success' means the agent did not complete the task (usually a limit was hit). +RESULT_SUCCESS = "success" +LIMIT_SUBTYPES = { + "error_max_turns", + "error_max_tokens", + "error_max_budget_usd", + "error_during_execution", + "error_api_error", +} + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models (Claude Code requires them).""" + if not model_name.startswith("claude-"): + raise ValueError( + f"claude-code-harness-bu-cli requires a Claude model. Got: {model_name!r}. " + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_dir(p: Path) -> None: + if p.exists(): + shutil.rmtree(p) + p.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +# ---- Browser-Use Cloud session provisioning (mirrors cch-js / cch-ab / bcode) ---- + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser(browser_name: str) -> tuple[str, str]: + """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for claude-code-harness-bu-cli: {browser_name}") + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +# ---- Claude Code invocation (identical to CCH/CCH-JS/CCH-AB except for cwd) ---- + +def _build_claude_cmd( + task_description: str, + model_name: str, + max_turns: int, + max_budget_usd: float, + use_bare: bool, +) -> list[str]: + cmd = [ + "claude", + "-p", + task_description, + "--model", + model_name, + "--dangerously-skip-permissions", + "--output-format", + "stream-json", + "--verbose", + "--max-turns", + str(max_turns), + "--max-budget-usd", + str(max_budget_usd), + "--append-system-prompt-file", + str(SYSTEM_PROMPT_FILE), + "--no-session-persistence", + ] + if use_bare: + cmd.append("--bare") + return cmd + + +def _format_assistant_block(block: dict) -> str | None: + btype = block.get("type") + if btype == "tool_use": + name = block.get("name", "?") + inp = block.get("input", {}) or {} + if name == "Bash": + return f"Bash: {(inp.get('command') or '').strip()[:2000]}" + if name in ("Edit", "Write", "Read"): + path = inp.get("file_path") or inp.get("path") or "" + return f"{name}: {path}" + try: + return f"{name}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return name + if btype == "text": + text = (block.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if btype == "thinking": + text = (block.get("thinking") or "").strip() + if not text: + return None + return f"thinking: {text[:2000]}" + return 
None + + +def _format_tool_result_block(block: dict) -> str | None: + if block.get("type") != "tool_result": + return None + content = block.get("content") + is_error = bool(block.get("is_error")) + prefix = "tool_error" if is_error else "tool_result" + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + if c.get("type") == "text": + parts.append(c.get("text", "")) + elif c.get("type") == "image": + parts.append("") + content = "\n".join(parts) + if not isinstance(content, str): + try: + content = json.dumps(content, default=str) + except Exception: + content = str(content) + content = content.strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +def _format_event_steps(event: dict) -> list[str]: + etype = event.get("type") + if etype == "assistant": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_assistant_block(block) + if s: + steps.append(s) + return steps + if etype == "user": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_tool_result_block(block) + if s: + steps.append(s) + return steps + return [] + + +def _summarize_result_event(event: dict) -> tuple[str, bool, list[str]]: + subtype = event.get("subtype") or RESULT_SUCCESS + is_error = bool(event.get("is_error")) + errors_raw = event.get("errors") or [] + errors = [str(e) for e in errors_raw] if isinstance(errors_raw, list) else [str(errors_raw)] + return subtype, is_error, errors + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + print(f"[claude-stderr] {s}", flush=True) + + +async def _close_browser_use_sessions() -> None: + """Best-effort: tell 
browser-use to shut down all daemons. + + The browser-use CLI spawns a per-session background daemon (one per + `--session` name; default is "default"). `close --all` quits every + active session so a leaked daemon does not survive across tasks on the + same runner. + """ + try: + stop_proc = await asyncio.create_subprocess_exec( + "browser-use", "close", "--all", + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await asyncio.wait_for(stop_proc.wait(), timeout=15) + except Exception: + pass + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + max_turns = int(params.get("max_turns", "100")) + max_budget_usd = float(params.get("max_budget_usd", "10")) + use_bare = params.get("use_bare", "true").lower() != "false" + + _reset_dir(SHOTS_DIR) + _reset_dir(WORK_DIR) + + # Pre-provision a remote browser; pass its WS URL to the agent via env. + # The agent runs: browser-use --cdp-url "$BU_CDP_WS" open + browser_id, cdp_ws = _start_browser(browser_name) + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + # browser-use CLI: don't auto-launch the setup wizard / installer on + # first call. The workflow already ran `browser-use install` and + # `browser-use doctor`, and we're attaching via --cdp-url so the + # local Chromium is not used to drive the page. 
+ "BROWSER_USE_SETUP_LOGGING": "false", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + result_subtype: str | None = None + result_is_error = False + result_errors: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[claude-stdout-raw] {line}", flush=True) + continue + + new_steps = _format_event_steps(event) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + if event.get("type") == "result": + final_text = event.get("result") or "" + total_cost = float(event.get("total_cost_usd") or 0.0) + result_subtype, result_is_error, result_errors = _summarize_result_event(event) + print( + f"[claude-result] subtype={result_subtype} is_error={result_is_error} " + f"cost=${total_cost:.4f} errors={result_errors}", + flush=True, + ) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[claude-runner] proc did not exit within 60s of stdout close; killing", 
flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + # Best-effort: close any browser-use daemon(s) the agent left running + # so they don't leak across tasks on the same runner. + await _close_browser_use_sessions() + _stop_browser(browser_id) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + if result_subtype is None and proc.returncode not in (0, None): + raise RuntimeError( + f"claude exited with code {proc.returncode} before emitting a result event. " + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + if result_subtype and result_subtype != RESULT_SUCCESS: + err_suffix = f" errors={result_errors}" if result_errors else "" + if answer: + final_result = f"[{result_subtype}] {answer}{err_suffix}" + else: + final_result = f"[{result_subtype}] Agent did not complete task.{err_suffix}" + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + 
task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/claude_code_harness_bu_cli/system_prompt.md b/frameworks/claude_code_harness_bu_cli/system_prompt.md new file mode 100644 index 0000000..fa9789d --- /dev/null +++ b/frameworks/claude_code_harness_bu_cli/system_prompt.md @@ -0,0 +1,25 @@ +You are evaluating a benchmark task by driving a real browser through the `browser-use` CLI from `browser-use/browser-use`. + +Hard rules: +- Use the `browser-use` CLI for every browser interaction. It is on your PATH (aliases: `bu`, `browser`, `browseruse` all work). Do NOT install other browser tools, do NOT use Playwright/Puppeteer directly, do NOT call any built-in WebFetch -- drive the live browser via `browser-use` only. +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from your environment and running: + ``` + browser-use --cdp-url "$BU_CDP_WS" open + ``` + All subsequent `browser-use ` calls automatically reuse the running daemon over the same CDP attachment -- you do NOT need to pass `--cdp-url` again, and you should NOT call `browser-use open` a second time without a URL. Just issue the next verb (`state`, `click 5`, `input 3 "text"`, `screenshot`, etc.). +- Before issuing your first interaction command, read the bundled SKILL.md so you know the full command surface, common workflows, and troubleshooting tips. It is at `~/.claude/skills/browser-use/SKILL.md`. If you have a Read tool, read that file. Otherwise: `cat ~/.claude/skills/browser-use/SKILL.md`. 
+- Standard workflow per the SKILL: (1) `browser-use --cdp-url "$BU_CDP_WS" open ` to attach + navigate, (2) `browser-use state` to see clickable elements with indices, (3) `browser-use click ` / `browser-use input "text"` to interact, (4) `browser-use state` or `browser-use screenshot` to verify, (5) repeat. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot. Pass an explicit path to `browser-use screenshot`: + ``` + browser-use screenshot /tmp/shots/step_001.png + browser-use screenshot /tmp/shots/step_002.png + ``` + Never overwrite a previous screenshot path. +- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Do not edit files outside the current working directory. +- Do not spawn or kill any browser processes; the remote Chrome is managed by the eval harness. Do not call `browser-use cloud connect` or `browser-use connect` -- the browser is already provisioned and attached via `--cdp-url`. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. 
diff --git a/frameworks/claude_code_harness_js/__init__.py b/frameworks/claude_code_harness_js/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/claude_code_harness_js/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/claude_code_harness_js/run_task.py b/frameworks/claude_code_harness_js/run_task.py new file mode 100644 index 0000000..8141989 --- /dev/null +++ b/frameworks/claude_code_harness_js/run_task.py @@ -0,0 +1,445 @@ +"""Run a single benchmark task using Claude Code driving browser-harness-js. + +This is the JavaScript-CDP variant of `claude-code-harness`. Claude Code owns the +agent loop; the agent drives a remote Chrome via the `browser-harness-js` CLI +(typed CDP wrappers exposed as a single-process bun REPL). We pre-provision a +browser-use-cloud session and pass its WebSocket CDP URL via `BU_CDP_WS`; the +agent calls `session.connect({ wsUrl: process.env.BU_CDP_WS })` to attach. + +The joint system being benchmarked is (Claude Code + browser-harness-js + Claude +model). Pin `claude_code_version` and `framework_ref` for reproducible +comparisons against the Python `claude-code-harness` framework. 
+""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_turns": "Max Claude Code agentic turns (default: 100)", + "max_budget_usd": "Per-task API budget cap in USD (default: 10)", + "claude_code_version": "Claude Code npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-harness-js install (e.g. fork/browser-harness-js). Consumed by the workflow install step.", + "use_bare": "Pass --bare to claude to skip hook/MCP/plugin auto-discovery (true/false, default: true)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +WORK_DIR = Path("/tmp/cch-js-workdir") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# Subtypes Claude Code emits in the terminal `result` event. Anything other than +# 'success' means the agent did not complete the task (usually a limit was hit). +RESULT_SUCCESS = "success" +LIMIT_SUBTYPES = { + "error_max_turns", + "error_max_tokens", + "error_max_budget_usd", + "error_during_execution", + "error_api_error", +} + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models (Claude Code requires them).""" + if not model_name.startswith("claude-"): + raise ValueError( + f"claude-code-harness-js requires a Claude model. Got: {model_name!r}. 
" + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_dir(p: Path) -> None: + if p.exists(): + shutil.rmtree(p) + p.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +# ---- Browser-Use Cloud session provisioning (mirrors bcode runner) ---- + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser(browser_name: str) -> tuple[str, str]: + """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for claude-code-harness-js: {browser_name}") + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +# ---- Claude Code invocation (identical to CCH except for cwd) ---- + +def _build_claude_cmd( + task_description: str, + model_name: str, + max_turns: int, + max_budget_usd: float, + use_bare: bool, +) -> list[str]: + cmd = [ + "claude", + "-p", + task_description, + "--model", + model_name, + "--dangerously-skip-permissions", + "--output-format", + "stream-json", + "--verbose", + "--max-turns", + str(max_turns), + "--max-budget-usd", + str(max_budget_usd), + "--append-system-prompt-file", + str(SYSTEM_PROMPT_FILE), + "--no-session-persistence", + ] + if use_bare: + cmd.append("--bare") + return cmd + + +def _format_assistant_block(block: dict) -> str | None: + btype = block.get("type") + if btype == "tool_use": + name = block.get("name", "?") + inp = block.get("input", {}) or {} + if name == "Bash": + return f"Bash: {(inp.get('command') or '').strip()[:2000]}" + if name in ("Edit", "Write", "Read"): + path = inp.get("file_path") or inp.get("path") or "" + return f"{name}: {path}" + try: + return f"{name}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return name + if btype == "text": + text = (block.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if btype == "thinking": + text = (block.get("thinking") or "").strip() + if not text: + return None + return f"thinking: {text[:2000]}" + return None + + +def 
_format_tool_result_block(block: dict) -> str | None: + if block.get("type") != "tool_result": + return None + content = block.get("content") + is_error = bool(block.get("is_error")) + prefix = "tool_error" if is_error else "tool_result" + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + if c.get("type") == "text": + parts.append(c.get("text", "")) + elif c.get("type") == "image": + parts.append("") + content = "\n".join(parts) + if not isinstance(content, str): + try: + content = json.dumps(content, default=str) + except Exception: + content = str(content) + content = content.strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +def _format_event_steps(event: dict) -> list[str]: + etype = event.get("type") + if etype == "assistant": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_assistant_block(block) + if s: + steps.append(s) + return steps + if etype == "user": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_tool_result_block(block) + if s: + steps.append(s) + return steps + return [] + + +def _summarize_result_event(event: dict) -> tuple[str, bool, list[str]]: + subtype = event.get("subtype") or RESULT_SUCCESS + is_error = bool(event.get("is_error")) + errors_raw = event.get("errors") or [] + errors = [str(e) for e in errors_raw] if isinstance(errors_raw, list) else [str(errors_raw)] + return subtype, is_error, errors + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + print(f"[claude-stderr] {s}", flush=True) + + +async def execute(task_description: str) -> ExecutionResult: + params = 
validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + max_turns = int(params.get("max_turns", "100")) + max_budget_usd = float(params.get("max_budget_usd", "10")) + use_bare = params.get("use_bare", "true").lower() != "false" + + _reset_dir(SHOTS_DIR) + _reset_dir(WORK_DIR) + + # Pre-provision a remote browser; pass its WS URL to the agent via env. + # The agent connects with `session.connect({ wsUrl: process.env.BU_CDP_WS })`. + browser_id, cdp_ws = _start_browser(browser_name) + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + # browser-harness-js auto-installs bun on first run if missing; we + # pre-installed bun in the workflow, so opt out of any check-in noise. + "BROWSER_HARNESS_SKIP_BUN_INSTALL": "1", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + result_subtype: str | None = None + result_is_error = False + result_errors: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + 
continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[claude-stdout-raw] {line}", flush=True) + continue + + new_steps = _format_event_steps(event) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + if event.get("type") == "result": + final_text = event.get("result") or "" + total_cost = float(event.get("total_cost_usd") or 0.0) + result_subtype, result_is_error, result_errors = _summarize_result_event(event) + print( + f"[claude-result] subtype={result_subtype} is_error={result_is_error} " + f"cost=${total_cost:.4f} errors={result_errors}", + flush=True, + ) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[claude-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + # Best-effort: stop the bun REPL server so it doesn't leak across tasks. + try: + stop_proc = await asyncio.create_subprocess_exec( + "browser-harness-js", "--stop", + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await asyncio.wait_for(stop_proc.wait(), timeout=10) + except Exception: + pass + _stop_browser(browser_id) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + if result_subtype is None and proc.returncode not in (0, None): + raise RuntimeError( + f"claude exited with code {proc.returncode} before emitting a result event. 
" + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + if result_subtype and result_subtype != RESULT_SUCCESS: + err_suffix = f" errors={result_errors}" if result_errors else "" + if answer: + final_result = f"[{result_subtype}] {answer}{err_suffix}" + else: + final_result = f"[{result_subtype}] Agent did not complete task.{err_suffix}" + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/claude_code_harness_js/system_prompt.md b/frameworks/claude_code_harness_js/system_prompt.md new file mode 100644 index 0000000..b5d375d --- /dev/null +++ b/frameworks/claude_code_harness_js/system_prompt.md @@ -0,0 +1,21 @@ +You are evaluating a benchmark task by driving a real browser through the browser-harness-js CDP skill. + +Hard rules: +- Use the harness. Read `SKILL.md` first (under `~/.claude/skills/cdp/SKILL.md`). Drive the browser by running `browser-harness-js ''` on the shell, or by piping multi-line snippets via heredoc. 
Do not install other browser tools, do not use Playwright directly, do not open a different repo. +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from the environment and calling `await session.connect({ wsUrl: process.env.BU_CDP_WS })`. Do NOT call `session.connect()` with no arguments (no local Chrome to auto-detect). Do NOT spawn or kill any browser processes. +- After connecting, list page targets with `await listPageTargets()` and call `await session.use(targetInfo.targetId)` to bind to a tab before issuing Page/DOM/Runtime/Network calls. Globals (`session`, `globalThis.*`) persist across `browser-harness-js` invocations because the CLI auto-spawns a single long-lived bun server. Reuse them. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot. Decode the base64 returned by `Page.captureScreenshot` and write it to disk yourself; never overwrite a previous screenshot path. Example: + ``` + browser-harness-js <<'JS' + const { data } = await session.Page.captureScreenshot({ format: 'png' }); + require('fs').writeFileSync('/tmp/shots/step_001.png', Buffer.from(data, 'base64')); + return 'ok'; + JS + ``` +- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Do not edit files outside the current working directory. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. 
diff --git a/frameworks/claude_cua/__init__.py b/frameworks/claude_cua/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/claude_cua/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/claude_cua/run_task.py b/frameworks/claude_cua/run_task.py new file mode 100644 index 0000000..737d231 --- /dev/null +++ b/frameworks/claude_cua/run_task.py @@ -0,0 +1,97 @@ +"""Run a single benchmark task using Claude Computer Use Agent. + +CUA controls its own desktop environment (Xvfb + browser). The browser parameter +is meaningless for this framework -- it uses "integrated" as a placeholder. + +The agent loop: +1. Launch Xvfb virtual display + browser +2. Send task to Claude with the computer tool +3. Loop: Claude emits actions -> execute on desktop -> screenshot -> send back +4. Collect steps and final result into ExecutionResult +""" + +import asyncio +import os +import sys +import time +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = {} + + +async def execute(task_description: str) -> ExecutionResult: + """Run the Claude CUA agent loop on a task. + + TODO: Implement the full CUA agent loop: + 1. Start Xvfb + browser via subprocess + 2. Take initial screenshot + 3. Send to Anthropic Messages API with computer_20251124 tool + 4. Loop: parse tool_use blocks, execute actions, screenshot, send tool_result + 5. 
Collect all steps and final text response + """ + start = time.time() + + # import anthropic + # client = anthropic.Anthropic() + # + # tools = [{"type": "computer_20251124", "name": "computer", + # "display_width_px": 1920, "display_height_px": 1080}] + # messages = [{"role": "user", "content": task_description}] + # + # steps = [] + # screenshots_b64 = [] + # for _ in range(50): # max iterations + # response = client.beta.messages.create( + # model="claude-sonnet-4-20250514", max_tokens=4096, + # tools=tools, messages=messages, betas=["computer-use-2025-11-24"], + # ) + # ... execute actions, collect screenshots, break on end_turn ... + + duration = time.time() - start + + return ExecutionResult( + final_result="NOT IMPLEMENTED", + steps=[], + screenshots_b64=[], + num_steps=0, + duration_seconds=duration, + cost=0, + ) + + +async def main(): + validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/codex_harness/__init__.py b/frameworks/codex_harness/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/codex_harness/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/codex_harness/run_task.py b/frameworks/codex_harness/run_task.py new file mode 100644 index 0000000..5838e3c --- /dev/null +++ b/frameworks/codex_harness/run_task.py @@ -0,0 +1,489 @@ +"""Run a single benchmark task using OpenAI Codex CLI driving browser-harness. 
+ +This framework wraps OpenAI's Codex CLI (the coding agent) around the +browser-harness repo: Codex owns the agent loop, we hand it a task and a +workdir pre-loaded with the harness + a live browser daemon, then stream-parse +its `codex exec --json` JSONL output. + +The joint system being benchmarked is (Codex CLI + browser-harness + OpenAI +model). Pin `codex_version` and `framework_ref` for reproducible comparisons. + +Mirrors `frameworks/claude_code_harness/run_task.py` -- same browser +provisioning (admin.start_remote_daemon under BU_NAME), same /tmp/shots +screenshot drain, same FINAL ANSWER convention -- swapping out the agent CLI +and its event schema. + +Codex JSON event schema (see https://developers.openai.com/codex/noninteractive): +- `thread.started` {thread_id} +- `turn.started` +- `turn.completed` {usage: {input_tokens, cached_input_tokens, output_tokens, + reasoning_output_tokens}} +- `turn.failed` {error: {...}} +- `item.started` {item: {id, type, ...}} +- `item.updated` {item: {...}} +- `item.completed` {item: {id, type, ...}} + item.type in {agent_message, reasoning, command_execution, file_change, + mcp_tool_call, web_search, plan_update, todo_list, ...} +- `error` {message} + +Codex does NOT emit a per-turn cost field; we compute cost from token counts +via a small static price map (see _MODEL_PRICES). Models not in the map +report cost=0. +""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +# Harness is installed via `uv pip install /tmp/browser-harness` in the workflow, +# which exposes `admin`, `helpers`, `run`, `daemon` as top-level modules. 
+HARNESS_DIR = "/tmp/browser-harness" + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "codex_version": "Codex CLI npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-harness install (e.g. fork/browser-harness). Consumed by the workflow install step.", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", + "sandbox": "Codex sandbox policy (read-only | workspace-write | danger-full-access; default: danger-full-access).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# USD/token prices for cost estimation. Codex does not surface per-turn cost; +# we compute total_cost = input * input_price + output * output_price. Cached +# input tokens are charged at the cached rate when known, otherwise full rate. +# Update as OpenAI publishes new prices. Models absent here report cost=0. +_MODEL_PRICES: dict[str, dict[str, float]] = { + "gpt-5": {"input": 1.25e-6, "cached_input": 0.125e-6, "output": 10e-6}, +} + + +def _model_price(model_name: str) -> dict[str, float] | None: + if model_name in _MODEL_PRICES: + return _MODEL_PRICES[model_name] + # Best-effort: strip common dated suffixes. 
+ for key in _MODEL_PRICES: + if model_name.startswith(key): + return _MODEL_PRICES[key] + return None + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _start_browser(browser_name: str, bu_name: str) -> dict: + """Provision a browser for the harness to attach to.""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for codex-harness: {browser_name}") + sys.path.insert(0, HARNESS_DIR) + from admin import start_remote_daemon # type: ignore + + return start_remote_daemon(name=bu_name) + + +def _stop_browser(browser_name: str, bu_name: str) -> None: + try: + sys.path.insert(0, HARNESS_DIR) + from admin import stop_remote_daemon # type: ignore + + if browser_name == "browser-use-cloud": + stop_remote_daemon(name=bu_name) + except Exception as e: + print(f"Warning: failed to stop harness daemon: {e}") + + +def _build_codex_cmd(model_name: str, sandbox: str) -> list[str]: + """Build the `codex exec` command. Prompt is passed via stdin. + + Notes on flags: + - `--ask-for-approval` is NOT accepted at `codex exec` level in published + builds (despite docs suggesting global flags propagate). `codex exec` + is non-interactive by construction; no approval gating runs anyway. + - `--sandbox`, `--skip-git-repo-check`, `--ignore-user-config`, and + `--json` are exec-level flags. + - The prompt comes on stdin via `-` so we don't have to worry about + shell-escaping multi-MB prompts. 
+ """ + return [ + "codex", + "exec", + "--json", + "--model", + model_name, + "--sandbox", + sandbox, + "--skip-git-repo-check", + "--ignore-user-config", + "-", # read prompt from stdin + ] + + +def _format_item(item: dict) -> str | None: + """Turn a single Codex `item.completed` payload into a step string.""" + itype = item.get("type") + if itype == "agent_message": + text = (item.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if itype == "reasoning": + # Codex emits a short summary; can also be in a `summary` array. + text = (item.get("text") or item.get("summary") or "").strip() if isinstance( + item.get("text") or item.get("summary"), str + ) else "" + if not text: + # Sometimes reasoning has a list of summary strings. + summary = item.get("summary") + if isinstance(summary, list): + text = " ".join(s for s in summary if isinstance(s, str)).strip() + if not text: + return "reasoning" + return f"reasoning: {text[:2000]}" + if itype == "command_execution": + cmd = item.get("command") or "" + if isinstance(cmd, list): + cmd = " ".join(str(c) for c in cmd) + cmd = (cmd or "").strip() + status = item.get("status") or "" + exit_code = item.get("exit_code") + out = item.get("aggregated_output") or item.get("output") or "" + if isinstance(out, dict): + out = out.get("text") or json.dumps(out, default=str) + out = (out or "").strip() + # Two-step: emit command itself, then result tag for visibility. + # We compact into a single step entry so step counts stay reasonable. 
+ head = f"Bash: {cmd[:1500]}" + tail = "" + if status and status != "completed": + tail = f" [{status}]" + if exit_code is not None and exit_code != 0: + tail += f" exit={exit_code}" + if out: + tail += f"\n-> {out[:500]}" + return (head + tail)[:2000] + if itype == "file_change": + path = item.get("path") or "" + action = item.get("action") or "write" + return f"{action}: {path}" + if itype == "mcp_tool_call": + name = item.get("name") or item.get("tool") or "mcp" + args = item.get("arguments") or item.get("input") or {} + try: + blob = json.dumps(args, separators=(",", ":"))[:1500] + except Exception: + blob = str(args)[:1500] + return f"mcp:{name}: {blob}" + if itype == "web_search": + q = item.get("query") or "" + return f"web_search: {q[:500]}" + if itype == "plan_update" or itype == "todo_list": + try: + return f"{itype}: {json.dumps(item, default=str)[:1500]}" + except Exception: + return itype + if itype: + # Unknown but well-formed item type -- keep a breadcrumb. + try: + return f"{itype}: {json.dumps(item, default=str)[:1500]}" + except Exception: + return itype + return None + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + """Read stderr line-by-line, echo to our stdout, and buffer for later reporting.""" + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + # Surface to GitHub Actions log in real time. + print(f"[codex-stderr] {s}", flush=True) + + +def _compose_prompt(task_description: str) -> str: + """Combine our system prompt with the task. Codex CLI doesn't have + `--append-system-prompt-file`; we prepend the system prompt to the user + prompt instead. 
Codex also auto-loads `AGENTS.md` from the workdir, but + we put the rules in the prompt to be explicit + version-pinned.""" + system = SYSTEM_PROMPT_FILE.read_text() + return f"{system}\n\n---\n\nTask:\n{task_description}\n" + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = os.environ["MODEL"] + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + task_index = os.environ.get("TASK_INDEX", "0") + sandbox = params.get("sandbox", "danger-full-access") + # task_timeout is consumed in main() before run_and_judge wraps execute. + + bu_name = f"eval-{task_index}" + _reset_shots_dir() + + # Pre-provision the browser so Codex starts with a live CDP attach. + _start_browser(browser_name, bu_name) + + # Codex CLI auth: `codex exec` reuses saved auth (~/.codex/auth.json) by + # default but accepts `CODEX_API_KEY` env explicitly (the only auth env + # supported by `codex exec` per docs). `OPENAI_API_KEY` alone is NOT read + # by codex (it's for the OpenAI Python SDK). We mirror the workflow's + # OPENAI_API_KEY into CODEX_API_KEY here so the same repo secret unlocks + # both bcode (which uses OPENAI_API_KEY directly) and codex-harness. + # + # PATH: `uv pip install /tmp/browser-harness` puts the `browser-harness` + # console_script at /tmp/browser-harness/.venv/bin/browser-harness, but + # codex subprocess doesn't inherit the `uv run` PATH boost. Smoke #4 + # showed the agent self-recovered by prepending the venv dir, but that + # cost ~4 steps. Prepend explicitly so the bare `browser-harness` heredoc + # in our system prompt + SKILL.md works on the first try. 
+ harness_venv_bin = f"{HARNESS_DIR}/.venv/bin" + existing_path = os.environ.get("PATH", "") + env = { + **os.environ, + "BU_NAME": bu_name, + "CODEX_API_KEY": os.environ.get("CODEX_API_KEY") or os.environ.get("OPENAI_API_KEY", ""), + "PATH": f"{harness_venv_bin}:{existing_path}" if existing_path else harness_venv_bin, + } + + cmd = _build_codex_cmd(model_name, sandbox) + prompt = _compose_prompt(task_description) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_input_tokens = 0 + total_cached_input_tokens = 0 + total_output_tokens = 0 + total_reasoning_tokens = 0 + turn_failed_error: str | None = None + error_events: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap on line buffer + ) + + # Pipe the prompt in on stdin and close. + assert proc.stdin is not None + proc.stdin.write(prompt.encode("utf-8")) + await proc.stdin.drain() + proc.stdin.close() + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + """Yield one JSONL line at a time. 
Codex item.completed payloads for + command_execution events can include large aggregated_output blobs -- + read raw chunks and split on newlines, no per-line cap.""" + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[codex-stdout-raw] {line}", flush=True) + continue + + etype = event.get("type") + if etype == "item.completed": + item = event.get("item") or {} + s = _format_item(item) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + # Track latest agent_message for FINAL ANSWER extraction. 
+ if item.get("type") == "agent_message": + text = (item.get("text") or "").strip() + if text: + final_text = text + elif etype == "turn.completed": + usage = event.get("usage") or {} + total_input_tokens += int(usage.get("input_tokens") or 0) + total_cached_input_tokens += int(usage.get("cached_input_tokens") or 0) + total_output_tokens += int(usage.get("output_tokens") or 0) + total_reasoning_tokens += int(usage.get("reasoning_output_tokens") or 0) + print( + f"[codex-turn] in={usage.get('input_tokens')} " + f"cached={usage.get('cached_input_tokens')} " + f"out={usage.get('output_tokens')} " + f"reasoning={usage.get('reasoning_output_tokens')}", + flush=True, + ) + elif etype == "turn.failed": + err = event.get("error") or {} + turn_failed_error = json.dumps(err, default=str)[:500] + print(f"[codex-turn-failed] {turn_failed_error}", flush=True) + elif etype == "error": + msg = event.get("message") or json.dumps(event, default=str) + error_events.append(str(msg)[:500]) + print(f"[codex-error] {msg}", flush=True) + elif etype == "thread.started": + tid = event.get("thread_id") + print(f"[codex-thread] {tid}", flush=True) + + # Wait for the process to exit cleanly. + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[codex-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_name, bu_name) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + # Cost estimate from token counts. + prices = _model_price(model_name) + if prices: + # Non-cached input is total_input_tokens - cached_input_tokens. 
+ non_cached = max(0, total_input_tokens - total_cached_input_tokens) + cost = ( + non_cached * prices["input"] + + total_cached_input_tokens * prices.get("cached_input", prices["input"]) + + total_output_tokens * prices["output"] + ) + else: + cost = 0.0 + + # Determine final result text. Prefer the FINAL ANSWER line. + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + # Failure precedence: turn.failed > error events > no agent_message > clean. + if turn_failed_error and not answer: + final_result = f"[codex_turn_failed] {turn_failed_error}" + elif error_events and not answer: + final_result = f"[codex_error] {error_events[-1]}" + elif not final_text: + if proc.returncode not in (0, None): + raise RuntimeError( + f"codex exited with code {proc.returncode} and emitted no agent_message. " + f"steps_captured={len(steps)} duration={duration:.1f}s " + f"stderr_tail:\n{stderr_tail[-2000:]}" + ) + final_result = "Agent did not emit any output" + else: + final_result = answer or final_text.strip() + # If FINAL ANSWER missing but we had output, surface as fallback. + if not match: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + # Propagate task_timeout param to run_and_judge before it wraps execute(). 
+ early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/codex_harness/system_prompt.md b/frameworks/codex_harness/system_prompt.md new file mode 100644 index 0000000..7908e81 --- /dev/null +++ b/frameworks/codex_harness/system_prompt.md @@ -0,0 +1,15 @@ +You are evaluating a benchmark task by driving a real browser through the browser-harness in the current working directory. + +Hard rules: +- Use the harness. Read `SKILL.md` and `helpers.py` first. Drive the browser via `browser-harness <<'PY' ... PY` heredocs -- do not install other browser tools, do not use Playwright directly, do not open a different repo. +- The `browser-harness` CLI lives in the workdir venv at `./.venv/bin/browser-harness`. The shell's `PATH` already includes that directory (prepended by the runner). If you ever get `browser-harness: command not found`, you can also invoke it directly as `./.venv/bin/browser-harness <<'PY' ... PY` or run it via `uv run browser-harness <<'PY' ... PY` from the workdir. +- A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. +- Save every screenshot to `/tmp/shots/step_<N>.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. +- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed.
+- Do not edit files outside the current working directory. +- Work fully autonomously. Do not stop early to summarize partial progress -- keep driving the browser until the task is genuinely complete (or you have hit a dead end). When you reach an answer, deliver it in the format below and exit. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: <answer> + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. diff --git a/frameworks/pi_harness/__init__.py b/frameworks/pi_harness/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/pi_harness/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/pi_harness/run_task.py b/frameworks/pi_harness/run_task.py new file mode 100644 index 0000000..785a27f --- /dev/null +++ b/frameworks/pi_harness/run_task.py @@ -0,0 +1,447 @@ +"""Run a single benchmark task using pi (the @earendil-works/pi-coding-agent CLI) +driving browser-harness. + +This framework is a near-mirror of `claude_code_harness`, except the coding +agent is `pi` instead of Claude Code. The browser side is unchanged: we still +pre-provision a live browser-harness daemon (Python `admin.start_remote_daemon`) +and let the agent drive it via `browser-harness <<'PY' ... PY` heredocs. + +Joint system being benchmarked: (pi + browser-harness + Claude model). +Restricted to Claude models, mirroring CCH. Pin `pi_version` and +`framework_ref` for reproducible comparisons. + +Pi event-stream notes (`pi --mode json`): +- First line is a `session` header (`{"type":"session",...}`). +- `tool_execution_start` / `tool_execution_end` carry tool calls + results.
+- `message_end` carries finished assistant messages with `content` blocks + (text/thinking) -- same shape as Claude's content-block list. +- There is no terminal `result` event with `total_cost_usd`. We therefore + collect cost per-turn from `turn_end.message.usage` if present and 0 otherwise. +- `agent_end` is the terminal lifecycle event. +""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +# Harness is installed via `uv pip install /tmp/browser-harness` in the workflow, +# which exposes `admin`, `helpers`, `run`, `daemon` as top-level modules. +HARNESS_DIR = "/tmp/browser-harness" + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "pi_version": "pi (@earendil-works/pi-coding-agent) npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-harness install (e.g. fork/browser-harness). Consumed by the workflow install step.", + "thinking": "pi thinking level: off|minimal|low|medium|high|xhigh (default: high)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models, mirroring CCH.""" + if not model_name.startswith("claude-"): + raise ValueError( + f"pi-harness requires a Claude model. Got: {model_name!r}. 
" + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _start_browser(browser_name: str, bu_name: str) -> dict: + """Provision a browser for the harness to attach to. Returns the cloud browser dict.""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for pi-harness: {browser_name}") + sys.path.insert(0, HARNESS_DIR) + from admin import start_remote_daemon # type: ignore + + return start_remote_daemon(name=bu_name) + + +def _stop_browser(browser_name: str, bu_name: str) -> None: + try: + sys.path.insert(0, HARNESS_DIR) + from admin import stop_remote_daemon # type: ignore + + if browser_name == "browser-use-cloud": + stop_remote_daemon(name=bu_name) + except Exception as e: + print(f"Warning: failed to stop harness daemon: {e}") + + +def _build_pi_cmd( + task_description: str, + model_name: str, + thinking: str, + system_prompt: str, +) -> list[str]: + cmd = [ + "pi", + "--mode", + "json", + "--provider", + "anthropic", + "--model", + model_name, + "--thinking", + thinking, + "--no-session", + "--no-context-files", + "--no-extensions", + "--no-skills", + "--no-prompt-templates", + "--no-themes", + "--offline", + "--append-system-prompt", + system_prompt, + task_description, + ] + return cmd + + +def _stringify_content(content) -> str: + """Flatten a content-block list (or string) to a single string.""" + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + t = c.get("type") + if t == "text": + 
parts.append(c.get("text", "")) + elif t == "thinking": + parts.append(c.get("thinking", "")) + elif t == "image": + parts.append("") + else: + try: + parts.append(json.dumps(c, separators=(",", ":"))) + except Exception: + parts.append(str(c)) + else: + parts.append(str(c)) + return "\n".join(parts) + try: + return json.dumps(content, default=str) + except Exception: + return str(content) + + +def _format_assistant_message(message: dict) -> list[str]: + """Turn an assistant message_end's content blocks into step strings. + + Tool-use blocks are skipped here (they are emitted as `tool_execution_*` + events separately) so we don't double-count them. + """ + steps: list[str] = [] + content = message.get("content", []) or [] + if not isinstance(content, list): + return steps + for block in content: + if not isinstance(block, dict): + continue + btype = block.get("type") + if btype == "text": + text = (block.get("text") or "").strip() + if text: + steps.append(f"text: {text[:2000]}") + elif btype == "thinking": + text = (block.get("thinking") or "").strip() + if text: + steps.append(f"thinking: {text[:2000]}") + # tool_use blocks are handled by tool_execution_* events. 
+ return steps + + +def _format_tool_call(tool_name: str, args) -> str: + """Format a tool_execution_start event into a step string (matching CCH).""" + if not isinstance(args, dict): + try: + return f"{tool_name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return tool_name + if tool_name == "bash": + return f"Bash: {(args.get('command') or '').strip()[:2000]}" + if tool_name in ("edit", "write", "read"): + path = args.get("file_path") or args.get("path") or "" + return f"{tool_name.capitalize()}: {path}" + try: + return f"{tool_name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return tool_name + + +def _format_tool_result(tool_name: str, result, is_error: bool) -> str | None: + """Format a tool_execution_end event into a step string.""" + prefix = "tool_error" if is_error else "tool_result" + content = _stringify_content(result).strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + """Read stderr line-by-line, echo to our stdout, and buffer for later reporting.""" + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + # Surface to GitHub Actions log in real time. + print(f"[pi-stderr] {s}", flush=True) + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + task_index = os.environ.get("TASK_INDEX", "0") + thinking = params.get("thinking", "high") + # task_timeout is consumed in main() before run_and_judge wraps execute. 
+ + bu_name = f"eval-{task_index}" + _reset_shots_dir() + + # Pre-provision the browser so pi starts with a live CDP attach. + _start_browser(browser_name, bu_name) + + env = { + **os.environ, + "BU_NAME": bu_name, + "DISABLE_TELEMETRY": "1", + # Pi-specific: skip startup network ops so a flaky pi.dev doesn't + # block the run, and disable install/update telemetry. + "PI_OFFLINE": "1", + "PI_SKIP_VERSION_CHECK": "1", + "PI_TELEMETRY": "0", + } + + system_prompt = SYSTEM_PROMPT_FILE.read_text() + cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + + start = time.time() + steps: list[str] = [] + last_assistant_text = "" + total_cost = 0.0 + saw_agent_end = False + stderr_buf: list[str] = [] + + # pi stream-json lines can be huge (tool results with full page HTML/text). + # Same workaround as CCH: read raw chunks and split on newlines. + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + """Yield one stream-json line at a time, regardless of line length.""" + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 # 64 KiB + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[pi-stdout-raw] {line[:500]}", flush=True) + continue + + etype = event.get("type") + + if etype == "tool_execution_start": + 
s = _format_tool_call(event.get("toolName") or "?", event.get("args")) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + elif etype == "tool_execution_end": + s = _format_tool_result( + event.get("toolName") or "?", + event.get("result"), + bool(event.get("isError")), + ) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + elif etype == "message_end": + msg = event.get("message", {}) or {} + if msg.get("role") == "assistant": + new_steps = _format_assistant_message(msg) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + # Track latest assistant text for FINAL ANSWER extraction. + txt = _stringify_content(msg.get("content")) + if txt: + last_assistant_text = txt + + elif etype == "turn_end": + # Some pi providers carry usage on the final assistant message. + msg = event.get("message", {}) or {} + usage = msg.get("usage") or {} + cost = usage.get("cost") or usage.get("total_cost") or usage.get("total_cost_usd") + if isinstance(cost, (int, float)): + total_cost += float(cost) + + elif etype == "agent_end": + saw_agent_end = True + # Final fallback: scan the full message list for the last + # assistant message in case message_end was missed. 
+ msgs = event.get("messages", []) or [] + for m in reversed(msgs): + if isinstance(m, dict) and m.get("role") == "assistant": + txt = _stringify_content(m.get("content")) + if txt: + last_assistant_text = last_assistant_text or txt + break + + # Wait for the process (stdout closed implies near-exit) + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[pi-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_name, bu_name) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + # Hard error: pi exited non-zero AND we never saw agent_end. + if not saw_agent_end and proc.returncode not in (0, None): + raise RuntimeError( + f"pi exited with code {proc.returncode} before emitting agent_end. " + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + # Extract FINAL ANSWER from the last assistant text. + match = FINAL_ANSWER_RE.search(last_assistant_text or "") + answer = match.group(1).strip() if match else (last_assistant_text.strip() or "") + + if not saw_agent_end: + # Soft error: pi exited 0 but never emitted agent_end. Surface but keep data. + final_result = f"[pi_no_agent_end] {answer}" if answer else "[pi_no_agent_end] Agent did not complete task." 
+ else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + # Propagate task_timeout param to run_and_judge before it wraps execute(). + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/pi_harness/system_prompt.md b/frameworks/pi_harness/system_prompt.md new file mode 100644 index 0000000..d1566de --- /dev/null +++ b/frameworks/pi_harness/system_prompt.md @@ -0,0 +1,13 @@ +You are evaluating a benchmark task by driving a real browser through the browser-harness in the current working directory. + +Hard rules: +- Use the harness. Read `SKILL.md` and `helpers.py` first. Drive the browser via `browser-harness <<'PY' ... PY` heredocs -- do not install other browser tools, do not use Playwright directly, do not open a different repo. +- A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. 
+- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Do not edit files outside the current working directory. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. diff --git a/frameworks/pibt/__init__.py b/frameworks/pibt/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/pibt/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/pibt/run_task.py b/frameworks/pibt/run_task.py new file mode 100644 index 0000000..d8fd61b --- /dev/null +++ b/frameworks/pibt/run_task.py @@ -0,0 +1,466 @@ +"""Run a single benchmark task using PIBT (pi browser terminal). + +PIBT = pi (the @earendil-works/pi-coding-agent CLI) + the +`pi-agent-extensions` package, which provides built-in browser tools via a +vendored `browser-harness-js` (CDP-based). No external Python harness, no +heredocs -- pi drives the browser through its own `cdp_*` tool surface. + +Joint system being benchmarked: (pi + pi-agent-extensions + Claude model). +Restricted to Claude models, mirroring CCH/PIH. + +Browser wiring: we pre-allocate a `browser-use-cloud` session via the v3 API +(same path as bcode/cch-js), resolve the CDP WebSocket URL, and pass it as +`BU_CDP_WS` in the pi subprocess env. The system prompt instructs the agent +to call `cdp_connect({ wsUrl: process.env.BU_CDP_WS })` once at the start. + +Pi event-stream parsing follows PIH (`tool_execution_start/end`, `message_end`, +`turn_end.message.usage`, `agent_end`). 
+""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "pi_version": "pi (@earendil-works/pi-coding-agent) npm version; consumed by the workflow install step (default: latest).", + "framework_repo": "Override GitHub repo for pi-agent-extensions install (e.g. fork/pi-agent-extensions). Consumed by the workflow install step.", + "thinking": "pi thinking level: off|minimal|low|medium|high|xhigh (default: high).", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) +# pi-agent-extensions are installed by the workflow install step; the runner +# uses the pi `--extensions ` flag (or default loader) to pick them up. +EXTENSIONS_DIR = "/tmp/pi-agent-extensions" +# Tools we surface to pi. Includes the cdp_* tools from pi-agent-extensions +# plus a minimal builtin set (bash for shots dir mgmt, read/write for general +# scaffolding). Subagent tools are intentionally omitted. +PIBT_TOOLS = "bash,read,write,cdp_connect,cdp_eval,cdp_status,cdp_targets,cdp_use_target" + + +def _require_claude_model(model_name: str) -> str: + if not model_name.startswith("claude-"): + raise ValueError( + f"pibt requires a Claude model. Got: {model_name!r}. " + f"Supported model aliases start with 'claude-' (see models.py)." 
+ ) + return model_name + + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={ + "X-Browser-Use-API-Key": _bu_api_key(), + "Content-Type": "application/json", + }, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser() -> tuple[str, str]: + """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + if not SHOTS_DIR.exists(): + return [] + return [ + base64.b64encode(p.read_bytes()).decode() + for p in sorted(SHOTS_DIR.glob("*.png")) + if p.is_file() + ] + + +def _build_pi_cmd( + task_description: str, + model_name: str, + thinking: str, + system_prompt: str, +) -> list[str]: + # NOTE: NOT --no-extensions (we want pi-agent-extensions to load). + # Still pass --no-context-files / --no-skills / --no-prompt-templates / + # --no-themes / --no-session for hermeticity. --offline disables the + # update-check network call. 
--tools restricts the model to the + # cdp_* surface plus minimal scaffolding. + cmd = [ + "pi", + "--mode", + "json", + "--provider", + "anthropic", + "--model", + model_name, + "--thinking", + thinking, + "--tools", + PIBT_TOOLS, + "--no-session", + "--no-context-files", + "--no-skills", + "--no-prompt-templates", + "--no-themes", + "--offline", + "--append-system-prompt", + system_prompt, + task_description, + ] + return cmd + + +def _stringify_content(content) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + t = c.get("type") + if t == "text": + parts.append(c.get("text", "")) + elif t == "thinking": + parts.append(c.get("thinking", "")) + elif t == "image": + parts.append("") + else: + try: + parts.append(json.dumps(c, separators=(",", ":"))) + except Exception: + parts.append(str(c)) + else: + parts.append(str(c)) + return "\n".join(parts) + try: + return json.dumps(content, default=str) + except Exception: + return str(content) + + +def _format_assistant_message(message: dict) -> list[str]: + steps: list[str] = [] + content = message.get("content", []) or [] + if not isinstance(content, list): + return steps + for block in content: + if not isinstance(block, dict): + continue + btype = block.get("type") + if btype == "text": + text = (block.get("text") or "").strip() + if text: + steps.append(f"text: {text[:2000]}") + elif btype == "thinking": + text = (block.get("thinking") or "").strip() + if text: + steps.append(f"thinking: {text[:2000]}") + return steps + + +def _format_tool_call(tool_name: str, args) -> str: + if not isinstance(args, dict): + try: + return f"{tool_name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return tool_name + if tool_name == "bash": + return f"Bash: {(args.get('command') or '').strip()[:2000]}" + if tool_name == "cdp_eval": + return f"cdp_eval: {(args.get('code') or '').strip()[:2000]}" + if tool_name == 
"cdp_connect": + url = args.get("wsUrl") or args.get("profileDir") or "" + return f"cdp_connect: {url[:500]}" + if tool_name in ("cdp_status", "cdp_targets"): + return tool_name + if tool_name == "cdp_use_target": + return f"cdp_use_target: {args.get('targetId') or ''}" + if tool_name in ("read", "write", "edit"): + path = args.get("file_path") or args.get("path") or "" + return f"{tool_name.capitalize()}: {path}" + try: + return f"{tool_name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return tool_name + + +def _format_tool_result(tool_name: str, result, is_error: bool) -> str | None: + prefix = "tool_error" if is_error else "tool_result" + content = _stringify_content(result).strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + print(f"[pi-stderr] {s}", flush=True) + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for pibt: {browser_name}") + thinking = params.get("thinking", "high") + + _reset_shots_dir() + + # Provision a remote browser; pi attaches over CDP via cdp_connect with the + # WS URL we hand it through env. 
+ browser_id, cdp_ws = _start_browser() + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "DISABLE_TELEMETRY": "1", + "PI_OFFLINE": "1", + "PI_SKIP_VERSION_CHECK": "1", + "PI_TELEMETRY": "0", + } + + system_prompt = SYSTEM_PROMPT_FILE.read_text() + cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + + start = time.time() + steps: list[str] = [] + last_assistant_text = "" + total_cost = 0.0 + saw_agent_end = False + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=EXTENSIONS_DIR, # pi loads the package.json `pi.extensions` from CWD + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[pi-stdout-raw] {line[:500]}", flush=True) + continue + + etype = event.get("type") + + if etype == "tool_execution_start": + s = _format_tool_call(event.get("toolName") or "?", event.get("args")) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + elif etype == "tool_execution_end": + s = _format_tool_result( + event.get("toolName") or "?", + event.get("result"), + bool(event.get("isError")), + ) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + elif etype == "message_end": + msg = 
event.get("message", {}) or {} + if msg.get("role") == "assistant": + new_steps = _format_assistant_message(msg) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + txt = _stringify_content(msg.get("content")) + if txt: + last_assistant_text = txt + + elif etype == "turn_end": + msg = event.get("message", {}) or {} + usage = msg.get("usage") or {} + cost = ( + usage.get("cost") + or usage.get("total_cost") + or usage.get("total_cost_usd") + ) + if isinstance(cost, (int, float)): + total_cost += float(cost) + + elif etype == "agent_end": + saw_agent_end = True + msgs = event.get("messages", []) or [] + for m in reversed(msgs): + if isinstance(m, dict) and m.get("role") == "assistant": + txt = _stringify_content(m.get("content")) + if txt: + last_assistant_text = last_assistant_text or txt + break + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[pibt-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_id) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + if not saw_agent_end and proc.returncode not in (0, None): + raise RuntimeError( + f"pi exited with code {proc.returncode} before emitting agent_end. 
" + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + match = FINAL_ANSWER_RE.search(last_assistant_text or "") + answer = match.group(1).strip() if match else (last_assistant_text.strip() or "") + + if not saw_agent_end: + final_result = ( + f"[pi_no_agent_end] {answer}" + if answer + else "[pi_no_agent_end] Agent did not complete task." + ) + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/pibt/system_prompt.md b/frameworks/pibt/system_prompt.md new file mode 100644 index 0000000..e3ec34a --- /dev/null +++ b/frameworks/pibt/system_prompt.md @@ -0,0 +1,13 @@ +You are evaluating a benchmark task by driving a real browser via the pi browser-harness CDP tools (`cdp_connect`, `cdp_eval`, `cdp_status`, `cdp_targets`, `cdp_use_target`). The browser-harness-js extension is already installed. + +Hard rules: +- Connect once at the start by calling `cdp_connect` with `wsUrl` set to `process.env.BU_CDP_WS` (the env var holds the WebSocket URL of a live browser-use cloud browser). Example: `cdp_connect({ "wsUrl": "" })`. 
Read the env var with `cdp_eval` first if you need to: `return process.env.BU_CDP_WS`. +- Drive the browser exclusively through `cdp_eval`. Use idiomatic helpers: `gotoUrl(url)`, `waitForLoad()`, `js("...")` or `js(() => ...)`, `pageInfo()`, `clickAtXY(x, y)`, `typeText(text)`, `pressKey(key)`, `scroll({dy})`, `captureScreenshot({path})`. For raw CDP, use `cdp("Domain.method", params)`. NEVER use `session.send(...)` or `session..(...)` -- that is not the contract. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing each shot. Pass it as the path: `await captureScreenshot({ path: "/tmp/shots/step_001.png" })`. Never overwrite a previous screenshot path. The PNG is also attached inline to the tool result automatically. +- Do not install other browser tools, do not start a different browser, do not use Playwright. +- Do not ask clarifying questions. If ambiguous, pick the most reasonable interpretation and proceed. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. diff --git a/frameworks/stagehand/__init__.py b/frameworks/stagehand/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/stagehand/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/stagehand/executor.js b/frameworks/stagehand/executor.js new file mode 100644 index 0000000..61eca97 --- /dev/null +++ b/frameworks/stagehand/executor.js @@ -0,0 +1,65 @@ +/** + * Stagehand agent executor. + * + * Reads TASK_DESCRIPTION and BROWSER from env. + * Runs the Stagehand agent and prints a JSON result to stdout. 
+ * + * Expected stdout format: + * { + * "final_result": "...", + * "steps": ["step 1", "step 2", ...], + * "screenshots_b64": ["base64...", ...], + * "num_steps": 10, + * "duration_seconds": 45.2, + * "cost": 0.05 + * } + */ + +// TODO: Implement Stagehand execution +// const { Stagehand } = require("@browserbasehq/stagehand"); + +async function main() { + const taskDescription = process.env.TASK_DESCRIPTION; + const browser = process.env.BROWSER || "browserbase"; + + if (!taskDescription) { + console.error("TASK_DESCRIPTION env var is required"); + process.exit(1); + } + + const startTime = Date.now(); + + // TODO: Initialize Stagehand with appropriate env (BROWSERBASE or LOCAL) + // const stagehand = new Stagehand({ + // env: browser === "browserbase" ? "BROWSERBASE" : "LOCAL", + // modelName: "anthropic/claude-sonnet-4-20250514", + // modelClientOptions: { apiKey: process.env.ANTHROPIC_API_KEY }, + // }); + // await stagehand.init(); + // + // const page = stagehand.context.pages()[0]; + // const agent = stagehand.agent({ modelName: "anthropic/claude-sonnet-4-20250514" }); + // const result = await agent.execute({ instruction: taskDescription }); + // + // await stagehand.close(); + + const durationSeconds = (Date.now() - startTime) / 1000; + + // TODO: Map Stagehand result to standard format + const output = { + final_result: "NOT IMPLEMENTED", + steps: [], + screenshots_b64: [], + num_steps: 0, + duration_seconds: durationSeconds, + cost: 0, + }; + + // Print JSON to stdout for the Python wrapper to parse + console.log(JSON.stringify(output)); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/frameworks/stagehand/package.json b/frameworks/stagehand/package.json new file mode 100644 index 0000000..7a963ae --- /dev/null +++ b/frameworks/stagehand/package.json @@ -0,0 +1,8 @@ +{ + "name": "benchmark-stagehand-executor", + "private": true, + "type": "commonjs", + "dependencies": { + "@browserbasehq/stagehand": "^2.0.0" + } 
+} diff --git a/frameworks/stagehand/run_task.py b/frameworks/stagehand/run_task.py new file mode 100644 index 0000000..1bc2b36 --- /dev/null +++ b/frameworks/stagehand/run_task.py @@ -0,0 +1,86 @@ +"""Run a single benchmark task using the Stagehand agent framework. + +Stagehand is a TypeScript framework. This Python entry point: +1. Loads the task and wires up Laminar (shared infra) +2. Shells out to node executor.js which runs the Stagehand agent +3. Parses the JSON result from stdout into ExecutionResult +4. Feeds it into the shared judge flow +""" + +import json +import os +import subprocess +import sys +import asyncio +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = {} + +EXECUTOR_DIR = Path(__file__).resolve().parent +EXECUTOR_SCRIPT = EXECUTOR_DIR / "executor.js" + + +async def execute(task_description: str) -> ExecutionResult: + """Run the Stagehand agent via node subprocess.""" + browser_name = os.environ.get("BROWSER", "browserbase") + + env = {**os.environ, "TASK_DESCRIPTION": task_description, "BROWSER": browser_name} + proc = subprocess.run( + ["node", str(EXECUTOR_SCRIPT)], + capture_output=True, + text=True, + timeout=900, + env=env, + cwd=str(EXECUTOR_DIR), + ) + + if proc.returncode != 0: + raise RuntimeError(f"Stagehand executor failed: {proc.stderr}") + + data = json.loads(proc.stdout) + return ExecutionResult( + final_result=data.get("final_result", ""), + steps=data.get("steps", []), + screenshots_b64=data.get("screenshots_b64", []), + num_steps=data.get("num_steps", 0), + duration_seconds=data.get("duration_seconds", 0), + cost=data.get("cost", 0), + ) + + +async def main(): + 
validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/laminar.py b/laminar.py new file mode 100644 index 0000000..1cb7d9e --- /dev/null +++ b/laminar.py @@ -0,0 +1,49 @@ +"""No-op Laminar compatibility layer for public local verification.""" + +from typing import Any + + +class LaminarService: + @classmethod + def initialize(cls) -> bool: + return False + + @classmethod + def is_enabled(cls) -> bool: + return False + + @classmethod + def create_evaluation(cls, *args: Any, **kwargs: Any) -> None: + return None + + @classmethod + def attach_evaluation(cls, eval_id: str) -> None: + return None + + @classmethod + def get_eval_id(cls) -> None: + return None + + @classmethod + def get_eval_url(cls) -> None: + return None + + @classmethod + def create_datapoint(cls, task: dict[str, Any]) -> None: + return None + + @classmethod + def set_datapoint_score( + cls, + datapoint_id: str | None, + score: int, + final_result: str, + agent_steps: list[str], + metrics: dict[str, Any], + judgement: dict[str, Any], + ) -> None: + return None + + @classmethod + def set_datapoint_error(cls, datapoint_id: str | None, error_msg: str) -> None: + return None diff --git a/lmnr.py b/lmnr.py new file mode 100644 index 0000000..c454556 --- /dev/null +++ b/lmnr.py @@ -0,0 +1,38 @@ +"""Local no-op subset of lmnr used by the public benchmark runners. + +The remote benchmark runner can attach traces to Laminar. Public +verification writes local JSON artifacts instead, so these hooks are inert. 
+""" + +from collections.abc import Callable +from typing import Any, TypeVar + +F = TypeVar("F", bound=Callable[..., Any]) + + +def observe(*args: Any, **kwargs: Any): + if args and callable(args[0]) and len(args) == 1 and not kwargs: + return args[0] + + def decorator(fn: F) -> F: + return fn + + return decorator + + +class Laminar: + @staticmethod + def initialize(*args: Any, **kwargs: Any) -> None: + return None + + @staticmethod + def serialize_span_context() -> None: + return None + + @staticmethod + def get_trace_id() -> None: + return None + + +class LaminarClient: + pass diff --git a/models.py b/models.py new file mode 100644 index 0000000..9ad0e2c --- /dev/null +++ b/models.py @@ -0,0 +1,50 @@ +"""Public model registry for local benchmark verification.""" + +import os + +from browser_use import ChatGoogle +from browser_use.llm import ChatAnthropic, ChatBrowserUse, ChatOpenAI + + +def _openai(model: str): + return ChatOpenAI(model=model, api_key=os.getenv("OPENAI_API_KEY")) + + +def _anthropic(model: str): + return ChatAnthropic(model=model, api_key=os.getenv("ANTHROPIC_API_KEY")) + + +def _google(model: str): + return ChatGoogle(model=model, api_key=os.getenv("GOOGLE_API_KEY")) + + +def _openrouter(model: str): + return ChatOpenAI( + model=model, + base_url="https://openrouter.ai/api/v1", + api_key=os.getenv("OPENROUTER_API_KEY"), + ) + + +MODELS = { + "bu-1-0": lambda: ChatBrowserUse(model="bu-1-0"), + "bu-2-0": lambda: ChatBrowserUse(model="bu-2-0"), + "gpt-4.1": lambda: _openai("gpt-4.1"), + "gpt-5": lambda: _openai("gpt-5"), + "gpt-5-mini": lambda: _openai("gpt-5-mini"), + "gpt-5.1-codex-mini": lambda: _openai("gpt-5.1-codex-mini"), + "claude-3-5-haiku": lambda: _anthropic("claude-3-5-haiku"), + "claude-haiku-4-5": lambda: _anthropic("claude-haiku-4-5"), + "claude-sonnet-4-5": lambda: _anthropic("claude-sonnet-4-5"), + "claude-sonnet-4-6": lambda: _anthropic("claude-sonnet-4-6"), + "claude-opus-4-5": lambda: _anthropic("claude-opus-4-5"), + 
"claude-opus-4-6": lambda: _anthropic("claude-opus-4-6"), + "claude-opus-4-7": lambda: _anthropic("claude-opus-4-7"), + "gemini-2.5-flash-lite": lambda: _google("gemini-2.5-flash-lite"), + "gemini-2.5-flash": lambda: _google("gemini-2.5-flash"), + "gemini-3-flash-preview": lambda: _google("gemini-3-flash-preview"), + "gemini-3-pro-preview": lambda: _google("gemini-3-pro-preview"), + "gemini-3.1-pro-preview": lambda: _google("gemini-3.1-pro-preview"), + "gemini-3-1-pro-preview": lambda: _google("gemini-3.1-pro-preview"), + "kimi-k2.5": lambda: _openrouter("moonshotai/kimi-k2.5"), +} diff --git a/run_framework_eval.py b/run_framework_eval.py new file mode 100644 index 0000000..8114b4e --- /dev/null +++ b/run_framework_eval.py @@ -0,0 +1,295 @@ +"""Run BU_Bench_V1 through any registered framework adapter. + +The public verifier decrypts BU_Bench_V1.enc in memory, runs the selected +adapter, judges each trace, writes summaries under ignored results/, and writes +task-level traces under ignored run_data/. Do not publish run_data/ artifacts: +they include decrypted task text, ground truth, model outputs, and screenshots. + +Examples: + uv run python run_framework_eval.py --list-frameworks + uv run python run_framework_eval.py --framework browser-use --browser browser-use-cloud --model bu-2-0 + uv run python run_framework_eval.py --framework browser-use-cloud-api-v3 --model bu-ultra + uv run python run_framework_eval.py --framework bcode-v012 --framework-ref v0.1.2 --model gpt-5 --tasks 5 + +Useful options: + --framework + --framework-ref + --browser + --model + --tasks 10 + --parallel 3 + --params key=value,key=value + +Adapter prerequisites: + browser-use: + Install the desired browser-use package/ref into the uv environment. + browser-use-cloud-api-v2, browser-use-cloud-api-v3: + Set BROWSER_USE_API_KEY; no browser provider is needed. + bcode, bcode-v012: + Install bcode at $HOME/.bcode/bin/bcode; set BROWSER_USE_API_KEY and + model provider keys. 
+ browserbase-agent: + Run `npm install --prefix frameworks/browserbase_agent`; set + BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID. + claude-code-harness, codex-harness, pi-harness: + Install the agent CLI, clone browser-use/browser-harness at the desired + ref to /tmp/browser-harness, and install it into the uv environment. + claude-code-harness-js: + Install Claude Code, clone/install browser-use/browser-harness-js, and + put browser-harness-js on PATH. + claude-code-harness-ab: + Install Claude Code and agent-browser, then install its browser assets. + claude-code-harness-bu-cli: + Install Claude Code and browser-use CLI at the desired ref. + pibt: + Install pi, clone/install browser-use/pi-agent-extensions to + /tmp/pi-agent-extensions, and install its JS dependencies. + but: + Install browser-use/browser-use-terminal to /tmp/but with + `uv sync --project /tmp/but`. + but-rust: + Build /tmp/but-rust/target/release/browser-use-terminal and provide + browser-harness to the worker. +""" + +import argparse +import asyncio +import json +import os +import re +import sys +from datetime import datetime +from pathlib import Path + +from dotenv import load_dotenv + +from frameworks import FRAMEWORKS, framework_to_module, interleave, load_tasks + +ROOT_DIR = Path(__file__).resolve().parent + + +def _safe_part(value: str) -> str: + return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-") or "unknown" + + +def _selected_indices(total: int, args: argparse.Namespace) -> list[int]: + if args.task_indices: + indices = [int(x.strip()) for x in args.task_indices.split(",") if x.strip()] + else: + count = args.tasks if args.tasks is not None else total - args.task_start + indices = list(range(args.task_start, min(args.task_start + count, total))) + bad = [i for i in indices if i < 0 or i >= total] + if bad: + raise SystemExit(f"Task index out of range: {bad[:5]} for benchmark size {total}") + return indices + + +async def _run_one( + *, + task_index: int, + framework: str, + 
model: str, + browser: str, + benchmark: str, + params: str, + run_data_dir: Path, + task_results_dir: Path, + task_timeout: int | None, +) -> dict: + module_name = framework_to_module(framework) + runner = ROOT_DIR / "frameworks" / module_name / "run_task.py" + if not runner.exists(): + return { + "task_index": task_index, + "task_id": None, + "score": 0, + "steps": 0, + "duration": 0, + "cost": 0, + "error": f"Missing framework runner: {runner}", + } + + result_file = task_results_dir / f"task_{task_index}.json" + env = os.environ.copy() + env.update( + { + "MODEL": model, + "TASK_INDEX": str(task_index), + "EVAL_ID": "local", + "FRAMEWORK": framework, + "BROWSER": browser, + "BENCHMARK": benchmark, + "PARAMS": params, + "LOCAL_RESULT_FILE": str(result_file), + "RUN_DATA_DIR": str(run_data_dir), + "BROWSER_USE_SETUP_LOGGING": "false", + } + ) + if task_timeout is not None: + env["TASK_TIMEOUT"] = str(task_timeout) + if os.environ.get("NO_INTERLEAVE") == "1": + env["NO_INTERLEAVE"] = "1" + + proc = await asyncio.create_subprocess_exec( + sys.executable, + str(runner), + cwd=str(ROOT_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if stdout: + print(stdout.decode("utf-8", errors="replace"), end="") + if stderr: + print(stderr.decode("utf-8", errors="replace"), end="", file=sys.stderr) + + if result_file.exists(): + return json.loads(result_file.read_text(encoding="utf-8")) + + return { + "task_index": task_index, + "task_id": None, + "score": 0, + "steps": 0, + "duration": 0, + "cost": 0, + "error": f"Runner exited {proc.returncode} without writing {result_file}", + } + + +async def _run_all(args: argparse.Namespace) -> list[dict]: + tasks = load_tasks(args.benchmark) + if not args.no_interleave: + tasks = interleave(tasks) + indices = _selected_indices(len(tasks), args) + + framework_info = FRAMEWORKS[args.framework] + browser = args.browser or framework_info.browsers[0] + if 
browser not in framework_info.browsers: + valid = ", ".join(framework_info.browsers) + raise SystemExit( + f"Browser {browser!r} is not supported by {args.framework!r}. " + f"Valid browsers: {valid}" + ) + + run_start = datetime.now().strftime("%Y%m%d_%H%M%S") + run_key = ( + f"{args.benchmark}_framework_{_safe_part(args.framework)}" + f"_browser_{_safe_part(browser)}" + f"_model_{_safe_part(args.model)}" + ) + run_data_dir = ROOT_DIR / "run_data" / f"{run_key}_start_at_{run_start}" + task_results_dir = run_data_dir / "_task_results" + results_file = ROOT_DIR / "results" / f"{run_key}.json" + + print( + f"Running {len(indices)} task(s): benchmark={args.benchmark} " + f"framework={args.framework} browser={browser} model={args.model}" + ) + if framework_info.repo: + print(f"Framework repo: {framework_info.repo} ref={args.framework_ref}") + + semaphore = asyncio.Semaphore(args.parallel) + + async def guarded(i: int) -> dict: + async with semaphore: + return await _run_one( + task_index=i, + framework=args.framework, + model=args.model, + browser=browser, + benchmark=args.benchmark, + params=args.params, + run_data_dir=run_data_dir, + task_results_dir=task_results_dir, + task_timeout=args.task_timeout, + ) + + results = await asyncio.gather(*(guarded(i) for i in indices)) + + run_entry = { + "run_start": run_start, + "benchmark": args.benchmark, + "framework": args.framework, + "framework_ref": args.framework_ref, + "browser": browser, + "model": args.model, + "params": args.params, + "task_indices": indices, + "tasks_completed": len(results), + "tasks_successful": sum(1 for r in results if r.get("score") == 1), + "total_steps": sum(int(r.get("steps", 0) or 0) for r in results), + "total_duration": sum(float(r.get("duration", 0) or 0) for r in results), + "total_cost": sum(float(r.get("cost", 0) or 0) for r in results), + "task_results": [ + { + "task_id": r.get("task_id"), + "task_index": r.get("task_index"), + "score": r.get("score"), + "steps": r.get("steps", 0), + 
"duration": r.get("duration", 0), + "cost": r.get("cost", 0), + **({"error": r["error"]} if r.get("error") else {}), + } + for r in results + ], + } + + results_file.parent.mkdir(parents=True, exist_ok=True) + previous = json.loads(results_file.read_text()) if results_file.exists() else [] + previous.append(run_entry) + results_file.write_text(json.dumps(previous, indent=2), encoding="utf-8") + + print( + f"Run complete: {run_entry['tasks_successful']}/{run_entry['tasks_completed']} " + f"successful, {run_entry['total_steps']} steps, " + f"{run_entry['total_duration']:.1f}s, ${run_entry['total_cost']:.2f}" + ) + print(f"Summary: {results_file}") + print(f"Trace artifacts: {run_data_dir}") + return results + + +def _print_frameworks() -> None: + for name, info in sorted(FRAMEWORKS.items()): + browsers = ", ".join(info.browsers) + repo = f" repo={info.repo}" if info.repo else "" + notes = f" ({info.notes})" if info.notes else "" + print(f"{name}: browsers=[{browsers}]{repo}{notes}") + + +def main() -> None: + load_dotenv() + parser = argparse.ArgumentParser(description="Run public BU_Bench_V1 reverification") + parser.add_argument("--benchmark", default="BU_Bench_V1") + parser.add_argument("--framework", choices=sorted(FRAMEWORKS), default="browser-use") + parser.add_argument("--framework-ref", default="installed") + parser.add_argument("--browser", default=None) + parser.add_argument("--model", default="bu-2-0") + parser.add_argument("--params", default="") + parser.add_argument("--tasks", type=int, default=None) + parser.add_argument("--task-start", type=int, default=0) + parser.add_argument("--task-indices", default="") + parser.add_argument("--parallel", type=int, default=1) + parser.add_argument("--task-timeout", type=int, default=None) + parser.add_argument( + "--no-interleave", + action="store_true", + help="Use raw encrypted task order instead of the distributed runner order.", + ) + parser.add_argument("--list-frameworks", action="store_true") + args = 
parser.parse_args() + + if args.list_frameworks: + _print_frameworks() + return + + if args.no_interleave: + os.environ["NO_INTERLEAVE"] = "1" + + asyncio.run(_run_all(args)) + + +if __name__ == "__main__": + main() From e3a3ebf647666024a0de5728ee17ece6b0c77ffe Mon Sep 17 00:00:00 2001 From: Alezander9 Date: Tue, 12 May 2026 16:28:06 -0700 Subject: [PATCH 2/2] Address framework verifier review comments --- frameworks/__init__.py | 2 +- frameworks/bcode/run_task.py | 36 +- frameworks/browser_use/run_task.py | 37 +- frameworks/browserbase_agent/package.json | 2 +- frameworks/but/run_task.py | 47 ++- frameworks/but_rust/run_task.py | 329 +++++++++--------- .../claude_code_harness/system_prompt.md | 2 +- frameworks/claude_code_harness_ab/run_task.py | 58 +-- .../claude_code_harness_ab/system_prompt.md | 14 +- .../claude_code_harness_bu_cli/run_task.py | 59 ++-- .../system_prompt.md | 14 +- frameworks/claude_code_harness_js/run_task.py | 37 +- .../claude_code_harness_js/system_prompt.md | 2 +- frameworks/claude_cua/run_task.py | 15 +- frameworks/codex_harness/run_task.py | 93 ++--- frameworks/codex_harness/system_prompt.md | 2 +- frameworks/pi_harness/system_prompt.md | 2 +- frameworks/pibt/run_task.py | 45 ++- frameworks/stagehand/executor.js | 20 +- run_framework_eval.py | 2 + 20 files changed, 442 insertions(+), 376 deletions(-) diff --git a/frameworks/__init__.py b/frameworks/__init__.py index 18c6390..ca44e7f 100644 --- a/frameworks/__init__.py +++ b/frameworks/__init__.py @@ -306,7 +306,7 @@ async def run_and_judge( _maybe_write_local_result(data) return data - except BaseException as e: + except Exception as e: error_msg = f"{type(e).__name__}: {e}" tb = traceback.format_exc() print(f"Task {task_id} failed: {error_msg}") diff --git a/frameworks/bcode/run_task.py b/frameworks/bcode/run_task.py index 434fb55..f0ce2b9 100644 --- a/frameworks/bcode/run_task.py +++ b/frameworks/bcode/run_task.py @@ -148,11 +148,17 @@ def _bu(path: str, method: str, body: dict | None = 
None) -> dict: def _start_browser() -> tuple[str, str]: """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -274,15 +280,19 @@ async def execute(task_description: str) -> ExecutionResult: errors: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd="/tmp", - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd="/tmp", + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + _stop_browser(browser_id) + raise try: async for raw in _iter_lines(proc.stdout): diff --git a/frameworks/browser_use/run_task.py b/frameworks/browser_use/run_task.py index ac521ab..9ade8a5 100644 --- a/frameworks/browser_use/run_task.py +++ b/frameworks/browser_use/run_task.py @@ -48,28 +48,33 @@ async def execute( ) -> ExecutionResult: """Run a browser-use agent on the task and return a standardized result.""" provider = BROWSERS[browser_name] - cdp_url = await provider.connect() - if cdp_url: - browser = Browser(cdp_url=cdp_url) - else: - headless = getattr(provider, "HEADLESS", True) - browser = Browser(headless=headless) - - agent = Agent( - task=task_description, - llm=llm, - 
browser=browser, - use_judge=False, - use_vision=use_vision, - ) + browser = None try: + cdp_url = await provider.connect() + if cdp_url: + browser = Browser(cdp_url=cdp_url) + else: + headless = getattr(provider, "HEADLESS", True) + browser = Browser(headless=headless) + + agent = Agent( + task=task_description, + llm=llm, + browser=browser, + use_judge=False, + use_vision=use_vision, + ) history = await agent.run() finally: + if browser is not None: + try: + await browser.kill() + except Exception: + pass try: - await browser.kill() + await provider.disconnect() except Exception: pass - await provider.disconnect() return ExecutionResult( final_result=history.final_result() or "Agent did not return a result", diff --git a/frameworks/browserbase_agent/package.json b/frameworks/browserbase_agent/package.json index 58d1e87..55b6c5d 100644 --- a/frameworks/browserbase_agent/package.json +++ b/frameworks/browserbase_agent/package.json @@ -4,6 +4,6 @@ "type": "module", "description": "Node executor for the browserbase-agent eval framework: drives Stagehand SDK against Browserbase cloud. Pinned to a Stagehand version that has the opus-4-7 temperature fix (PRs #2006/#2018, shipped in client 3.4.0).", "dependencies": { - "@browserbasehq/stagehand": "^3.4.0" + "@browserbasehq/stagehand": "3.4.0" } } diff --git a/frameworks/but/run_task.py b/frameworks/but/run_task.py index 30066cb..0864789 100644 --- a/frameworks/but/run_task.py +++ b/frameworks/but/run_task.py @@ -126,11 +126,17 @@ def _bu(path: str, method: str, body: dict | None = None) -> dict: def _start_browser() -> tuple[str, str]: """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -260,8 +266,12 @@ async def execute(task_description: str) -> ExecutionResult: # passing the env var costs nothing on the current version. parent_span_context = Laminar.serialize_span_context() - system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") - full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + try: + system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + except Exception: + _stop_browser(browser_id) + raise env = { **os.environ, @@ -300,15 +310,19 @@ async def execute(task_description: str) -> ExecutionResult: stdout_chunks: list[str] = [] session_id: str | None = None - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=BUT_PROJECT_DIR, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=BUT_PROJECT_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + _stop_browser(browser_id) + raise try: async for raw in _iter_lines(proc.stdout): @@ -385,7 +399,8 @@ async def execute(task_description: str) -> ExecutionResult: if not final_text: for event in reversed(events): if 
(event.get("type") or "") in ("assistant.message", "message.assistant"): - text = ((event.get("payload") or {}).get("text") or "").strip() + payload = event.get("payload") or {} + text = (payload.get("text") or payload.get("content") or "").strip() if text: final_text = text break diff --git a/frameworks/but_rust/run_task.py b/frameworks/but_rust/run_task.py index f54b9b1..8627770 100644 --- a/frameworks/but_rust/run_task.py +++ b/frameworks/but_rust/run_task.py @@ -118,11 +118,17 @@ def _bu(path: str, method: str, body: dict | None = None) -> dict: def _start_browser() -> tuple[str, str]: + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -190,171 +196,172 @@ async def execute(task_description: str) -> ExecutionResult: task_idx = os.environ.get("TASK_INDEX", "0") browser_id, cdp_ws = _start_browser() - - state_dir = STATE_ROOT / f"task-{task_idx}-{int(time.time() * 1000)}" - if state_dir.exists(): - shutil.rmtree(state_dir) - state_dir.mkdir(parents=True) - - parent_span_context = Laminar.serialize_span_context() - - system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") - full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" - - env = { - **os.environ, - # The Python worker (spawned by the Rust agent loop) honors BU_CDP_WS - # directly via `_ensure_managed_chrome`/`_ensure_cloud_browser` short - # circuits. Pass both URL forms for robustness. 
- "BU_CDP_WS": cdp_ws, - # Force flush on one-shot CLI runs so OTLP spans actually leave the - # process before exit (see docs/README on this branch). - "LLM_BROWSER_LAMINAR_FLUSH_ON_FINISH": "1", - } - if parent_span_context: - # Forward-compat: but-rust telemetry doesn't honor this yet, but it - # doesn't error on unknown env either. - env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context - - # `--state-dir` is a TOP-LEVEL arg on the Rust CLI -- must come BEFORE - # the subcommand. - cmd_run = [ - BUT_RUST_BIN, - "--state-dir", str(state_dir), - subcommand, - full_task, - "--model", model, - ] - start = time.time() - stdout_buf: list[str] = [] - stderr_buf: list[str] = [] - - proc = await asyncio.create_subprocess_exec( - *cmd_run, - cwd=BUT_RUST_REPO_DIR, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - stdout_task = asyncio.create_task(_read_stream(proc.stdout, "but-rust-stdout", stdout_buf)) - stderr_task = asyncio.create_task(_read_stream(proc.stderr, "but-rust-stderr", stderr_buf)) - try: - await proc.wait() - await asyncio.wait_for(stdout_task, timeout=10) - await asyncio.wait_for(stderr_task, timeout=10) - except asyncio.TimeoutError: - for t in (stdout_task, stderr_task): - if not t.done(): - t.cancel() - finally: - if proc.returncode is None: - proc.kill() - try: - await asyncio.wait_for(proc.wait(), timeout=10) - except asyncio.TimeoutError: - pass - - # `run-openai`/etc print the session_id as the final non-empty stdout line. - session_id = "" - for line in reversed(stdout_buf): - line = line.strip() - if line and not line.startswith("{"): - session_id = line - break - - if not session_id: - _stop_browser(browser_id) - raise RuntimeError( - f"but-rust: no session_id captured from stdout (exit={proc.returncode}). 
" - f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + state_dir = STATE_ROOT / f"task-{task_idx}-{int(time.time() * 1000)}" + if state_dir.exists(): + shutil.rmtree(state_dir) + state_dir.mkdir(parents=True) + + parent_span_context = Laminar.serialize_span_context() + + system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + + env = { + **os.environ, + # The Python worker (spawned by the Rust agent loop) honors BU_CDP_WS + # directly via `_ensure_managed_chrome`/`_ensure_cloud_browser` short + # circuits. Pass both URL forms for robustness. + "BU_CDP_WS": cdp_ws, + # Force flush on one-shot CLI runs so OTLP spans actually leave the + # process before exit (see docs/README on this branch). + "LLM_BROWSER_LAMINAR_FLUSH_ON_FINISH": "1", + } + if parent_span_context: + # Forward-compat: but-rust telemetry doesn't honor this yet, but it + # doesn't error on unknown env either. + env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context + + # `--state-dir` is a TOP-LEVEL arg on the Rust CLI -- must come BEFORE + # the subcommand. + cmd_run = [ + BUT_RUST_BIN, + "--state-dir", str(state_dir), + subcommand, + full_task, + "--model", model, + ] + + stdout_buf: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd_run, + cwd=BUT_RUST_REPO_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, ) + stdout_task = asyncio.create_task(_read_stream(proc.stdout, "but-rust-stdout", stdout_buf)) + stderr_task = asyncio.create_task(_read_stream(proc.stderr, "but-rust-stderr", stderr_buf)) - # Dump events for this session. 
- cmd_events = [BUT_RUST_BIN, "--state-dir", str(state_dir), "events", session_id] - events_proc = await asyncio.create_subprocess_exec( - *cmd_events, - cwd=BUT_RUST_REPO_DIR, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - events_stdout, events_stderr = await events_proc.communicate() - _stop_browser(browser_id) - duration = time.time() - start - - events: list[dict] = [] - for line in events_stdout.decode("utf-8", errors="replace").splitlines(): - line = line.strip() - if not line: - continue try: - events.append(json.loads(line)) - except json.JSONDecodeError: - continue - - steps: list[str] = [] - final_text = "" - total_cost = 0.0 - errors: list[str] = [] - - for event in events: - if (s := _format_step_from_event(event)): - steps.append(s) - etype = event.get("type") or "" - payload = event.get("payload") or {} - if etype == "session.done": - done_result = (payload.get("result") or "").strip() - if done_result: - final_text = done_result - elif etype in ("model.usage", "llm.usage"): - cost_usd = payload.get("cost_usd") or payload.get("cost") - if cost_usd is not None: + await proc.wait() + await asyncio.wait_for(stdout_task, timeout=10) + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + for t in (stdout_task, stderr_task): + if not t.done(): + t.cancel() + finally: + if proc.returncode is None: + proc.kill() try: - total_cost += float(cost_usd) - except (TypeError, ValueError): + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: pass - elif etype in ("tool.failed", "session.failed", "error"): - err = payload.get("error") or payload.get("message") or "" - if err: - errors.append(str(err)) - print(f"[but-rust-error] {str(err)[:500]}", flush=True) - - if not final_text: - for event in reversed(events): - if (event.get("type") or "") in ("assistant.message", "message.assistant"): - text = ((event.get("payload") or {}).get("text") or 
"").strip() - if text: - final_text = text - break - - if proc.returncode not in (0, None) and not final_text and not steps: - raise RuntimeError( - f"but-rust exited with code {proc.returncode} before producing output. " - f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" - ) - answer = (final_text or "").strip() - if errors and not answer: - final_result = f"[but_rust_error] {errors[0][:500]}" - elif errors: - final_result = f"[but_rust_error_recovered] {answer}" - else: - final_result = answer or "[but_rust_no_output]" - - screenshots = _collect_screenshots(state_dir, session_id) - - return ExecutionResult( - final_result=final_result, - steps=steps, - screenshots_b64=screenshots, - num_steps=len(steps), - duration_seconds=duration, - cost=total_cost, - ) + # `run-openai`/etc print the session_id as the final non-empty stdout line. + session_id = "" + for line in reversed(stdout_buf): + line = line.strip() + if line and not line.startswith("{"): + session_id = line + break + + if not session_id: + raise RuntimeError( + f"but-rust: no session_id captured from stdout (exit={proc.returncode}). " + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + # Dump events for this session. 
+ cmd_events = [BUT_RUST_BIN, "--state-dir", str(state_dir), "events", session_id] + events_proc = await asyncio.create_subprocess_exec( + *cmd_events, + cwd=BUT_RUST_REPO_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + events_stdout, _events_stderr = await events_proc.communicate() + duration = time.time() - start + + events: list[dict] = [] + for line in events_stdout.decode("utf-8", errors="replace").splitlines(): + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + errors: list[str] = [] + + for event in events: + if (s := _format_step_from_event(event)): + steps.append(s) + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "session.done": + done_result = (payload.get("result") or "").strip() + if done_result: + final_text = done_result + elif etype in ("model.usage", "llm.usage"): + cost_usd = payload.get("cost_usd") or payload.get("cost") + if cost_usd is not None: + try: + total_cost += float(cost_usd) + except (TypeError, ValueError): + pass + elif etype in ("tool.failed", "session.failed", "error"): + err = payload.get("error") or payload.get("message") or "" + if err: + errors.append(str(err)) + print(f"[but-rust-error] {str(err)[:500]}", flush=True) + + if not final_text: + for event in reversed(events): + if (event.get("type") or "") in ("assistant.message", "message.assistant"): + payload = event.get("payload") or {} + text = (payload.get("text") or payload.get("content") or "").strip() + if text: + final_text = text + break + + if proc.returncode not in (0, None) and not final_text and not steps: + raise RuntimeError( + f"but-rust exited with code {proc.returncode} before producing output. 
" + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + answer = (final_text or "").strip() + if errors and not answer: + final_result = f"[but_rust_error] {errors[0][:500]}" + elif errors: + final_result = f"[but_rust_error_recovered] {answer}" + else: + final_result = answer or "[but_rust_no_output]" + + screenshots = _collect_screenshots(state_dir, session_id) + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=screenshots, + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + finally: + _stop_browser(browser_id) async def main(): diff --git a/frameworks/claude_code_harness/system_prompt.md b/frameworks/claude_code_harness/system_prompt.md index d1566de..5520ae4 100644 --- a/frameworks/claude_code_harness/system_prompt.md +++ b/frameworks/claude_code_harness/system_prompt.md @@ -5,7 +5,7 @@ Hard rules: - A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. - Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. 
- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: FINAL ANSWER: diff --git a/frameworks/claude_code_harness_ab/run_task.py b/frameworks/claude_code_harness_ab/run_task.py index baddc77..307c50b 100644 --- a/frameworks/claude_code_harness_ab/run_task.py +++ b/frameworks/claude_code_harness_ab/run_task.py @@ -117,11 +117,17 @@ def _start_browser(browser_name: str) -> tuple[str, str]: """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" if browser_name != "browser-use-cloud": raise ValueError(f"Unsupported browser for claude-code-harness-ab: {browser_name}") + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -261,16 +267,11 @@ async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> Non print(f"[claude-stderr] {s}", flush=True) -async def _close_agent_browser_sessions() -> None: - """Best-effort: tell agent-browser to shut down all daemons. - - agent-browser spawns a per-session background daemon (one per - `--session` name). `close --all` quits every active session so a - leaked daemon does not survive across tasks on the same runner. 
- """ +async def _close_agent_browser_session(session_name: str) -> None: + """Best-effort: tell agent-browser to shut down this task's daemon.""" try: stop_proc = await asyncio.create_subprocess_exec( - "agent-browser", "close", "--all", + "agent-browser", "--session", session_name, "close", stdout=asyncio.subprocess.DEVNULL, stderr=asyncio.subprocess.DEVNULL, ) @@ -291,12 +292,15 @@ async def execute(task_description: str) -> ExecutionResult: _reset_dir(WORK_DIR) # Pre-provision a remote browser; pass its WS URL to the agent via env. - # The agent runs: agent-browser --cdp "$BU_CDP_WS" open + # The agent runs: + # agent-browser --session "$AB_SESSION" --cdp "$BU_CDP_WS" open browser_id, cdp_ws = _start_browser(browser_name) + session_name = f"eval-{os.environ.get('TASK_INDEX', '0')}-{os.getpid()}" env = { **os.environ, "BU_CDP_WS": cdp_ws, + "AB_SESSION": session_name, "DISABLE_TELEMETRY": "1", "DISABLE_AUTOUPDATER": "1", "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", @@ -319,16 +323,20 @@ async def execute(task_description: str) -> ExecutionResult: result_errors: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=str(WORK_DIR), - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + await _close_agent_browser_session(session_name) + _stop_browser(browser_id) + raise async def _iter_stdout_lines(): assert proc.stdout is not None @@ -399,9 +407,7 @@ async def _iter_stdout_lines(): pass if not stderr_task.done(): stderr_task.cancel() - # Best-effort: close any agent-browser daemon(s) the agent left running - # 
so they don't leak across tasks on the same runner. - await _close_agent_browser_sessions() + await _close_agent_browser_session(session_name) _stop_browser(browser_id) duration = time.time() - start diff --git a/frameworks/claude_code_harness_ab/system_prompt.md b/frameworks/claude_code_harness_ab/system_prompt.md index eec2715..fcdcfea 100644 --- a/frameworks/claude_code_harness_ab/system_prompt.md +++ b/frameworks/claude_code_harness_ab/system_prompt.md @@ -2,25 +2,25 @@ Hard rules: - Use the `agent-browser` CLI for every browser interaction. It is on your PATH. Do NOT install other browser tools, do NOT use Playwright/Puppeteer directly, do NOT call any built-in WebFetch -- drive the live browser via `agent-browser` only. -- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from your environment and running: +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` and `AB_SESSION` from your environment and running: ``` - agent-browser --cdp "$BU_CDP_WS" open + agent-browser --session "$AB_SESSION" --cdp "$BU_CDP_WS" open ``` - All subsequent `agent-browser ` calls automatically reuse this daemon -- you do NOT need to pass `--cdp` again, and you should NOT call `agent-browser open` a second time without a URL. Just issue the next verb (`snapshot`, `click @e2`, `screenshot`, etc.). + All subsequent `agent-browser ` calls must include `--session "$AB_SESSION"` and automatically reuse this daemon -- you do NOT need to pass `--cdp` again, and you should NOT call `agent-browser open` a second time without a URL. Just issue the next verb (`--session "$AB_SESSION" snapshot`, `--session "$AB_SESSION" click @e2`, `--session "$AB_SESSION" screenshot`, etc.). 
- Before issuing your first command, read the bundled skill so you know the full command surface and current best-practice workflow: ``` agent-browser skills get core ``` Use `agent-browser skills get core --full` for the complete command reference. The CLI also accepts `--help` on any subcommand. -- Prefer the accessibility-tree workflow: `agent-browser snapshot -i` to list interactive elements with stable `@eN` refs, then `agent-browser click @eN` / `agent-browser fill @eN ""` to interact. Fall back to CSS selectors or `find role --name "..."` semantic locators when refs are insufficient. +- Prefer the accessibility-tree workflow: `agent-browser --session "$AB_SESSION" snapshot -i` to list interactive elements with stable `@eN` refs, then `agent-browser --session "$AB_SESSION" click @eN` / `agent-browser --session "$AB_SESSION" fill @eN ""` to interact. Fall back to CSS selectors or `find role --name "..."` semantic locators when refs are insufficient. - Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot. Use the `--screenshot-dir` / explicit-path form so files land on disk and the judge can see them: ``` - agent-browser screenshot /tmp/shots/step_001.png - agent-browser screenshot /tmp/shots/step_002.png + agent-browser --session "$AB_SESSION" screenshot /tmp/shots/step_001.png + agent-browser --session "$AB_SESSION" screenshot /tmp/shots/step_002.png ``` Never overwrite a previous screenshot path. Annotated screenshots (`--annotate`) are fine for visual reasoning, but still write to a new numbered filename. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. 
- Do not spawn or kill any browser processes; the remote Chrome is managed by the eval harness. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: diff --git a/frameworks/claude_code_harness_bu_cli/run_task.py b/frameworks/claude_code_harness_bu_cli/run_task.py index 1f8f164..dc0adad 100644 --- a/frameworks/claude_code_harness_bu_cli/run_task.py +++ b/frameworks/claude_code_harness_bu_cli/run_task.py @@ -116,11 +116,17 @@ def _start_browser(browser_name: str) -> tuple[str, str]: """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" if browser_name != "browser-use-cloud": raise ValueError(f"Unsupported browser for claude-code-harness-bu-cli: {browser_name}") + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -260,17 +266,11 @@ async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> Non print(f"[claude-stderr] {s}", flush=True) -async def _close_browser_use_sessions() -> None: - """Best-effort: tell browser-use to shut down all daemons. - - The browser-use CLI spawns a per-session background daemon (one per - `--session` name; default is "default"). `close --all` quits every - active session so a leaked daemon does not survive across tasks on the - same runner. 
- """ +async def _close_browser_use_session(session_name: str) -> None: + """Best-effort: tell browser-use to shut down this task's daemon.""" try: stop_proc = await asyncio.create_subprocess_exec( - "browser-use", "close", "--all", + "browser-use", "--session", session_name, "close", stdout=asyncio.subprocess.DEVNULL, stderr=asyncio.subprocess.DEVNULL, ) @@ -291,12 +291,15 @@ async def execute(task_description: str) -> ExecutionResult: _reset_dir(WORK_DIR) # Pre-provision a remote browser; pass its WS URL to the agent via env. - # The agent runs: browser-use --cdp-url "$BU_CDP_WS" open + # The agent runs: + # browser-use --session "$BU_SESSION" --cdp-url "$BU_CDP_WS" open browser_id, cdp_ws = _start_browser(browser_name) + session_name = f"eval-{os.environ.get('TASK_INDEX', '0')}-{os.getpid()}" env = { **os.environ, "BU_CDP_WS": cdp_ws, + "BU_SESSION": session_name, "DISABLE_TELEMETRY": "1", "DISABLE_AUTOUPDATER": "1", "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", @@ -320,16 +323,20 @@ async def execute(task_description: str) -> ExecutionResult: result_errors: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=str(WORK_DIR), - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + await _close_browser_use_session(session_name) + _stop_browser(browser_id) + raise async def _iter_stdout_lines(): assert proc.stdout is not None @@ -400,9 +407,7 @@ async def _iter_stdout_lines(): pass if not stderr_task.done(): stderr_task.cancel() - # Best-effort: close any browser-use daemon(s) the agent left running - # so they 
don't leak across tasks on the same runner. - await _close_browser_use_sessions() + await _close_browser_use_session(session_name) _stop_browser(browser_id) duration = time.time() - start diff --git a/frameworks/claude_code_harness_bu_cli/system_prompt.md b/frameworks/claude_code_harness_bu_cli/system_prompt.md index fa9789d..48c3887 100644 --- a/frameworks/claude_code_harness_bu_cli/system_prompt.md +++ b/frameworks/claude_code_harness_bu_cli/system_prompt.md @@ -2,21 +2,21 @@ Hard rules: - Use the `browser-use` CLI for every browser interaction. It is on your PATH (aliases: `bu`, `browser`, `browseruse` all work). Do NOT install other browser tools, do NOT use Playwright/Puppeteer directly, do NOT call any built-in WebFetch -- drive the live browser via `browser-use` only. -- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from your environment and running: +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` and `BU_SESSION` from your environment and running: ``` - browser-use --cdp-url "$BU_CDP_WS" open + browser-use --session "$BU_SESSION" --cdp-url "$BU_CDP_WS" open ``` - All subsequent `browser-use <verb>` calls automatically reuse the running daemon over the same CDP attachment -- you do NOT need to pass `--cdp-url` again, and you should NOT call `browser-use open` a second time without a URL. Just issue the next verb (`state`, `click 5`, `input 3 "text"`, `screenshot`, etc.). + All subsequent `browser-use <verb>` calls must include `--session "$BU_SESSION"` and automatically reuse the running daemon over the same CDP attachment -- you do NOT need to pass `--cdp-url` again, and you should NOT call `browser-use open` a second time without a URL. Just issue the next verb (`--session "$BU_SESSION" state`, `--session "$BU_SESSION" click 5`, `--session "$BU_SESSION" input 3 "text"`, `--session "$BU_SESSION" screenshot`, etc.).
- Before issuing your first interaction command, read the bundled SKILL.md so you know the full command surface, common workflows, and troubleshooting tips. It is at `~/.claude/skills/browser-use/SKILL.md`. If you have a Read tool, read that file. Otherwise: `cat ~/.claude/skills/browser-use/SKILL.md`. -- Standard workflow per the SKILL: (1) `browser-use --cdp-url "$BU_CDP_WS" open <url>` to attach + navigate, (2) `browser-use state` to see clickable elements with indices, (3) `browser-use click <index>` / `browser-use input <index> "text"` to interact, (4) `browser-use state` or `browser-use screenshot` to verify, (5) repeat. +- Standard workflow per the SKILL: (1) `browser-use --session "$BU_SESSION" --cdp-url "$BU_CDP_WS" open <url>` to attach + navigate, (2) `browser-use --session "$BU_SESSION" state` to see clickable elements with indices, (3) `browser-use --session "$BU_SESSION" click <index>` / `browser-use --session "$BU_SESSION" input <index> "text"` to interact, (4) `browser-use --session "$BU_SESSION" state` or `browser-use --session "$BU_SESSION" screenshot` to verify, (5) repeat. - Save every screenshot to `/tmp/shots/step_<N>.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot. Pass an explicit path to `browser-use screenshot`: ``` - browser-use screenshot /tmp/shots/step_001.png - browser-use screenshot /tmp/shots/step_002.png + browser-use --session "$BU_SESSION" screenshot /tmp/shots/step_001.png + browser-use --session "$BU_SESSION" screenshot /tmp/shots/step_002.png ``` Never overwrite a previous screenshot path. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. - Do not spawn or kill any browser processes; the remote Chrome is managed by the eval harness.
Do not call `browser-use cloud connect` or `browser-use connect` -- the browser is already provisioned and attached via `--cdp-url`. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: diff --git a/frameworks/claude_code_harness_js/run_task.py b/frameworks/claude_code_harness_js/run_task.py index 8141989..7d08911 100644 --- a/frameworks/claude_code_harness_js/run_task.py +++ b/frameworks/claude_code_harness_js/run_task.py @@ -114,11 +114,17 @@ def _start_browser(browser_name: str) -> tuple[str, str]: """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" if browser_name != "browser-use-cloud": raise ValueError(f"Unsupported browser for claude-code-harness-js: {browser_name}") + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -297,16 +303,19 @@ async def execute(task_description: str) -> ExecutionResult: result_errors: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=str(WORK_DIR), - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + _stop_browser(browser_id) + raise 
async def _iter_stdout_lines(): assert proc.stdout is not None diff --git a/frameworks/claude_code_harness_js/system_prompt.md b/frameworks/claude_code_harness_js/system_prompt.md index b5d375d..4eaca11 100644 --- a/frameworks/claude_code_harness_js/system_prompt.md +++ b/frameworks/claude_code_harness_js/system_prompt.md @@ -13,7 +13,7 @@ Hard rules: JS ``` - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: FINAL ANSWER: diff --git a/frameworks/claude_cua/run_task.py b/frameworks/claude_cua/run_task.py index 737d231..43461ab 100644 --- a/frameworks/claude_cua/run_task.py +++ b/frameworks/claude_cua/run_task.py @@ -13,7 +13,6 @@ import asyncio import os import sys -import time from pathlib import Path # Add project root to path for sibling imports @@ -45,8 +44,6 @@ async def execute(task_description: str) -> ExecutionResult: 4. Loop: parse tool_use blocks, execute actions, screenshot, send tool_result 5. Collect all steps and final text response """ - start = time.time() - # import anthropic # client = anthropic.Anthropic() # @@ -63,15 +60,9 @@ async def execute(task_description: str) -> ExecutionResult: # ) # ... execute actions, collect screenshots, break on end_turn ... - duration = time.time() - start - - return ExecutionResult( - final_result="NOT IMPLEMENTED", - steps=[], - screenshots_b64=[], - num_steps=0, - duration_seconds=duration, - cost=0, + raise NotImplementedError( + "claude-cua is not implemented in this public verifier. " + "Do not enable this adapter until it returns real task traces." 
) diff --git a/frameworks/codex_harness/run_task.py b/frameworks/codex_harness/run_task.py index 5838e3c..4932681 100644 --- a/frameworks/codex_harness/run_task.py +++ b/frameworks/codex_harness/run_task.py @@ -263,30 +263,34 @@ async def execute(task_description: str) -> ExecutionResult: # Pre-provision the browser so Codex starts with a live CDP attach. _start_browser(browser_name, bu_name) - # Codex CLI auth: `codex exec` reuses saved auth (~/.codex/auth.json) by - # default but accepts `CODEX_API_KEY` env explicitly (the only auth env - # supported by `codex exec` per docs). `OPENAI_API_KEY` alone is NOT read - # by codex (it's for the OpenAI Python SDK). We mirror the workflow's - # OPENAI_API_KEY into CODEX_API_KEY here so the same repo secret unlocks - # both bcode (which uses OPENAI_API_KEY directly) and codex-harness. - # - # PATH: `uv pip install /tmp/browser-harness` puts the `browser-harness` - # console_script at /tmp/browser-harness/.venv/bin/browser-harness, but - # codex subprocess doesn't inherit the `uv run` PATH boost. Smoke #4 - # showed the agent self-recovered by prepending the venv dir, but that - # cost ~4 steps. Prepend explicitly so the bare `browser-harness` heredoc - # in our system prompt + SKILL.md works on the first try. - harness_venv_bin = f"{HARNESS_DIR}/.venv/bin" - existing_path = os.environ.get("PATH", "") - env = { - **os.environ, - "BU_NAME": bu_name, - "CODEX_API_KEY": os.environ.get("CODEX_API_KEY") or os.environ.get("OPENAI_API_KEY", ""), - "PATH": f"{harness_venv_bin}:{existing_path}" if existing_path else harness_venv_bin, - } - - cmd = _build_codex_cmd(model_name, sandbox) - prompt = _compose_prompt(task_description) + try: + # Codex CLI auth: `codex exec` reuses saved auth (~/.codex/auth.json) by + # default but accepts `CODEX_API_KEY` env explicitly (the only auth env + # supported by `codex exec` per docs). `OPENAI_API_KEY` alone is NOT read + # by codex (it's for the OpenAI Python SDK). 
We mirror the workflow's + # OPENAI_API_KEY into CODEX_API_KEY here so the same repo secret unlocks + # both bcode (which uses OPENAI_API_KEY directly) and codex-harness. + # + # PATH: `uv pip install /tmp/browser-harness` puts the `browser-harness` + # console_script at /tmp/browser-harness/.venv/bin/browser-harness, but + # codex subprocess doesn't inherit the `uv run` PATH boost. Smoke #4 + # showed the agent self-recovered by prepending the venv dir, but that + # cost ~4 steps. Prepend explicitly so the bare `browser-harness` heredoc + # in our system prompt + SKILL.md works on the first try. + harness_venv_bin = f"{HARNESS_DIR}/.venv/bin" + existing_path = os.environ.get("PATH", "") + env = { + **os.environ, + "BU_NAME": bu_name, + "CODEX_API_KEY": os.environ.get("CODEX_API_KEY") or os.environ.get("OPENAI_API_KEY", ""), + "PATH": f"{harness_venv_bin}:{existing_path}" if existing_path else harness_venv_bin, + } + + cmd = _build_codex_cmd(model_name, sandbox) + prompt = _compose_prompt(task_description) + except Exception: + _stop_browser(browser_name, bu_name) + raise start = time.time() steps: list[str] = [] @@ -299,23 +303,34 @@ async def execute(task_description: str) -> ExecutionResult: error_events: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=HARNESS_DIR, - env=env, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, # 256 MiB safety cap on line buffer - ) + proc: asyncio.subprocess.Process | None = None + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap on line buffer + ) - # Pipe the prompt in on stdin and close. 
- assert proc.stdin is not None - proc.stdin.write(prompt.encode("utf-8")) - await proc.stdin.drain() - proc.stdin.close() + # Pipe the prompt in on stdin and close. + assert proc.stdin is not None + proc.stdin.write(prompt.encode("utf-8")) + await proc.stdin.drain() + proc.stdin.close() - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + if proc is not None and proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + _stop_browser(browser_name, bu_name) + raise async def _iter_stdout_lines(): """Yield one JSONL line at a time. Codex item.completed payloads for diff --git a/frameworks/codex_harness/system_prompt.md b/frameworks/codex_harness/system_prompt.md index 7908e81..9807d51 100644 --- a/frameworks/codex_harness/system_prompt.md +++ b/frameworks/codex_harness/system_prompt.md @@ -6,7 +6,7 @@ Hard rules: - A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. - Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. - Work fully autonomously. Do not stop early to summarize partial progress -- keep driving the browser until the task is genuinely complete (or you have hit a dead end). 
When you reach an answer, deliver it in the format below and exit. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: diff --git a/frameworks/pi_harness/system_prompt.md b/frameworks/pi_harness/system_prompt.md index d1566de..5520ae4 100644 --- a/frameworks/pi_harness/system_prompt.md +++ b/frameworks/pi_harness/system_prompt.md @@ -5,7 +5,7 @@ Hard rules: - A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. - Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: FINAL ANSWER: diff --git a/frameworks/pibt/run_task.py b/frameworks/pibt/run_task.py index d8fd61b..71ede68 100644 --- a/frameworks/pibt/run_task.py +++ b/frameworks/pibt/run_task.py @@ -96,11 +96,17 @@ def _bu(path: str, method: str, body: dict | None = None) -> dict: def _start_browser() -> tuple[str, str]: """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -283,8 +289,12 @@ async def execute(task_description: str) -> ExecutionResult: "PI_TELEMETRY": "0", } - system_prompt = SYSTEM_PROMPT_FILE.read_text() - cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + try: + system_prompt = SYSTEM_PROMPT_FILE.read_text() + cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + except Exception: + _stop_browser(browser_id) + raise start = time.time() steps: list[str] = [] @@ -293,16 +303,19 @@ async def execute(task_description: str) -> ExecutionResult: saw_agent_end = False stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=EXTENSIONS_DIR, # pi loads the package.json `pi.extensions` from CWD - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=EXTENSIONS_DIR, # pi loads the package.json `pi.extensions` from CWD + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + _stop_browser(browser_id) + raise async def _iter_stdout_lines(): assert proc.stdout is not None diff --git a/frameworks/stagehand/executor.js b/frameworks/stagehand/executor.js index 61eca97..c4fdef8 100644 --- 
a/frameworks/stagehand/executor.js +++ b/frameworks/stagehand/executor.js @@ -27,8 +27,6 @@ async function main() { process.exit(1); } - const startTime = Date.now(); - // TODO: Initialize Stagehand with appropriate env (BROWSERBASE or LOCAL) // const stagehand = new Stagehand({ // env: browser === "browserbase" ? "BROWSERBASE" : "LOCAL", @@ -43,20 +41,10 @@ async function main() { // // await stagehand.close(); - const durationSeconds = (Date.now() - startTime) / 1000; - - // TODO: Map Stagehand result to standard format - const output = { - final_result: "NOT IMPLEMENTED", - steps: [], - screenshots_b64: [], - num_steps: 0, - duration_seconds: durationSeconds, - cost: 0, - }; - - // Print JSON to stdout for the Python wrapper to parse - console.log(JSON.stringify(output)); + throw new Error( + `Stagehand executor is not implemented for browser=${browser}. ` + + "Use browserbase-agent for Stagehand SDK reverification or implement frameworks/stagehand/executor.js before enabling this adapter." + ); } main().catch((err) => { diff --git a/run_framework_eval.py b/run_framework_eval.py index 8114b4e..f80f85a 100644 --- a/run_framework_eval.py +++ b/run_framework_eval.py @@ -190,6 +190,8 @@ async def _run_all(args: argparse.Namespace) -> list[dict]: if framework_info.repo: print(f"Framework repo: {framework_info.repo} ref={args.framework_ref}") + if args.parallel < 1: + raise SystemExit("--parallel must be >= 1") semaphore = asyncio.Semaphore(args.parallel) async def guarded(i: int) -> dict: