From 3824bebd11d73927abfc777dde158cd254b17dba Mon Sep 17 00:00:00 2001 From: Alezander9 Date: Tue, 12 May 2026 15:25:23 -0700 Subject: [PATCH 1/2] Add public framework reverification runner --- .env.example | 10 + .gitignore | 5 +- README.md | 18 + browsers/__init__.py | 57 +- browsers/browser_use_cloud.py | 30 +- browsers/util.py | 16 + frameworks/__init__.py | 325 ++++++++++++ frameworks/bcode/run_task.py | 384 ++++++++++++++ frameworks/browser_use/__init__.py | 1 + frameworks/browser_use/run_task.py | 113 ++++ .../browser_use_cloud_api_v2/run_task.py | 201 +++++++ .../browser_use_cloud_api_v3/run_task.py | 286 ++++++++++ frameworks/browserbase_agent/executor.mjs | 181 +++++++ frameworks/browserbase_agent/package.json | 9 + frameworks/browserbase_agent/run_task.py | 216 ++++++++ frameworks/but/__init__.py | 1 + frameworks/but/run_task.py | 441 ++++++++++++++++ frameworks/but/system_prompt.md | 10 + frameworks/but_rust/__init__.py | 1 + frameworks/but_rust/run_task.py | 382 ++++++++++++++ frameworks/but_rust/system_prompt.md | 10 + frameworks/claude_code_harness/__init__.py | 1 + frameworks/claude_code_harness/run_task.py | 436 ++++++++++++++++ .../claude_code_harness/system_prompt.md | 13 + frameworks/claude_code_harness_ab/__init__.py | 1 + frameworks/claude_code_harness_ab/run_task.py | 460 ++++++++++++++++ .../claude_code_harness_ab/system_prompt.md | 29 ++ .../claude_code_harness_bu_cli/__init__.py | 1 + .../claude_code_harness_bu_cli/run_task.py | 461 +++++++++++++++++ .../system_prompt.md | 25 + frameworks/claude_code_harness_js/__init__.py | 1 + frameworks/claude_code_harness_js/run_task.py | 445 ++++++++++++++++ .../claude_code_harness_js/system_prompt.md | 21 + frameworks/claude_cua/__init__.py | 1 + frameworks/claude_cua/run_task.py | 97 ++++ frameworks/codex_harness/__init__.py | 1 + frameworks/codex_harness/run_task.py | 489 ++++++++++++++++++ frameworks/codex_harness/system_prompt.md | 15 + frameworks/pi_harness/__init__.py | 1 + 
frameworks/pi_harness/run_task.py | 447 ++++++++++++++++ frameworks/pi_harness/system_prompt.md | 13 + frameworks/pibt/__init__.py | 1 + frameworks/pibt/run_task.py | 466 +++++++++++++++++ frameworks/pibt/system_prompt.md | 13 + frameworks/stagehand/__init__.py | 1 + frameworks/stagehand/executor.js | 65 +++ frameworks/stagehand/package.json | 8 + frameworks/stagehand/run_task.py | 86 +++ laminar.py | 49 ++ lmnr.py | 38 ++ models.py | 50 ++ run_framework_eval.py | 295 +++++++++++ 52 files changed, 6688 insertions(+), 39 deletions(-) create mode 100644 browsers/util.py create mode 100644 frameworks/__init__.py create mode 100644 frameworks/bcode/run_task.py create mode 100644 frameworks/browser_use/__init__.py create mode 100644 frameworks/browser_use/run_task.py create mode 100644 frameworks/browser_use_cloud_api_v2/run_task.py create mode 100644 frameworks/browser_use_cloud_api_v3/run_task.py create mode 100644 frameworks/browserbase_agent/executor.mjs create mode 100644 frameworks/browserbase_agent/package.json create mode 100644 frameworks/browserbase_agent/run_task.py create mode 100644 frameworks/but/__init__.py create mode 100644 frameworks/but/run_task.py create mode 100644 frameworks/but/system_prompt.md create mode 100644 frameworks/but_rust/__init__.py create mode 100644 frameworks/but_rust/run_task.py create mode 100644 frameworks/but_rust/system_prompt.md create mode 100644 frameworks/claude_code_harness/__init__.py create mode 100644 frameworks/claude_code_harness/run_task.py create mode 100644 frameworks/claude_code_harness/system_prompt.md create mode 100644 frameworks/claude_code_harness_ab/__init__.py create mode 100644 frameworks/claude_code_harness_ab/run_task.py create mode 100644 frameworks/claude_code_harness_ab/system_prompt.md create mode 100644 frameworks/claude_code_harness_bu_cli/__init__.py create mode 100644 frameworks/claude_code_harness_bu_cli/run_task.py create mode 100644 frameworks/claude_code_harness_bu_cli/system_prompt.md create 
mode 100644 frameworks/claude_code_harness_js/__init__.py create mode 100644 frameworks/claude_code_harness_js/run_task.py create mode 100644 frameworks/claude_code_harness_js/system_prompt.md create mode 100644 frameworks/claude_cua/__init__.py create mode 100644 frameworks/claude_cua/run_task.py create mode 100644 frameworks/codex_harness/__init__.py create mode 100644 frameworks/codex_harness/run_task.py create mode 100644 frameworks/codex_harness/system_prompt.md create mode 100644 frameworks/pi_harness/__init__.py create mode 100644 frameworks/pi_harness/run_task.py create mode 100644 frameworks/pi_harness/system_prompt.md create mode 100644 frameworks/pibt/__init__.py create mode 100644 frameworks/pibt/run_task.py create mode 100644 frameworks/pibt/system_prompt.md create mode 100644 frameworks/stagehand/__init__.py create mode 100644 frameworks/stagehand/executor.js create mode 100644 frameworks/stagehand/package.json create mode 100644 frameworks/stagehand/run_task.py create mode 100644 laminar.py create mode 100644 lmnr.py create mode 100644 models.py create mode 100644 run_framework_eval.py diff --git a/.env.example b/.env.example index 8c8b2d3..7149172 100644 --- a/.env.example +++ b/.env.example @@ -1,5 +1,15 @@ BROWSER_USE_API_KEY= GOOGLE_API_KEY= +OPENAI_API_KEY= +ANTHROPIC_API_KEY= + +# Optional model providers used by some framework/model combinations +GOOGLE_GENERATIVE_AI_API_KEY= +FIREWORKS_API_KEY= +OPENROUTER_API_KEY= +DEEPSEEK_API_KEY= +MOONSHOT_API_KEY= +DASHSCOPE_API_KEY= # Optional: only needed if using --browser with the corresponding provider ANCHORBROWSER_API_KEY= diff --git a/.gitignore b/.gitignore index a2735bf..29f3c05 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,7 @@ __pycache__/ .env run_data/ DESIGN.MD -results/ \ No newline at end of file +results/ +BU_Bench_V1.json +Stealth_Bench_V1.json +benchmarks/*.json diff --git a/README.md b/README.md index 7104e28..efe54a2 100644 --- a/README.md +++ b/README.md @@ -130,6 +130,24 @@ 
uv run python run_eval.py Results are saved to `results/` and detailed traces to `run_data/`. +### Re-verifying Framework Results + +Use `run_framework_eval.py` to rerun BU_Bench_V1 through a framework adapter. +It decrypts `BU_Bench_V1.enc` in memory and writes local outputs to ignored +`results/` and `run_data/`. + +```bash +uv run python run_framework_eval.py --list-frameworks +uv run python run_framework_eval.py --framework browser-use --browser browser-use-cloud --model bu-2-0 +``` + +See the comment at the top of `run_framework_eval.py` for framework-specific +setup, options, and examples. + +Important: `run_data/` traces include decrypted task text, ground truth, model +outputs, and screenshots. They are gitignored for local verification only. Do +not publish or commit them. + ### Swapping Models Edit `run_eval.py` to change the model: diff --git a/browsers/__init__.py b/browsers/__init__.py index 0862591..3784c1d 100644 --- a/browsers/__init__.py +++ b/browsers/__init__.py @@ -1,21 +1,21 @@ -"""Browser provider registry. +"""Browser provider registry.""" -Each provider module exports: - async def connect() -> str -- returns a CDP WebSocket URL - async def disconnect() -> None -- cleans up the session - -Usage: - from browsers import get_provider - provider = get_provider("anchor") - cdp_url = await provider.connect() - ... 
- await provider.disconnect() -""" - -import asyncio import importlib -import httpx +from browsers.util import retry_on_429 +from browsers import ( + anchor, + browser_use_cloud, + browserbase, + browserless, + driver, + hyperbrowser, + local_headful, + local_headless, + onkernel, + rebrowser, + steel, +) PROVIDERS = [ "anchor", @@ -30,22 +30,23 @@ async def disconnect() -> None -- cleans up the session "steel", ] +BROWSERS = { + "anchor": anchor, + "browser-use-cloud": browser_use_cloud, + "browserbase": browserbase, + "browserless": browserless, + "driver": driver, + "hyperbrowser": hyperbrowser, + "local_headful": local_headful, + "local_headless": local_headless, + "onkernel": onkernel, + "rebrowser": rebrowser, + "steel": steel, +} + def get_provider(name: str): """Import and return a browser provider module by name.""" if name not in PROVIDERS: raise ValueError(f"Unknown browser provider: {name}. Available: {PROVIDERS}") return importlib.import_module(f"browsers.{name}") - - -async def retry_on_429(fn, max_retries=10, max_wait=30): - """Call fn(), retrying with capped exponential backoff on 429 responses.""" - for attempt in range(max_retries + 1): - try: - return await fn() - except httpx.HTTPStatusError as e: - if e.response.status_code != 429 or attempt == max_retries: - raise - wait = min(2**attempt, max_wait) - print(f"[429] Rate limited, retry {attempt + 1}/{max_retries} in {wait}s") - await asyncio.sleep(wait) diff --git a/browsers/browser_use_cloud.py b/browsers/browser_use_cloud.py index c212fcf..336289e 100644 --- a/browsers/browser_use_cloud.py +++ b/browsers/browser_use_cloud.py @@ -1,25 +1,34 @@ -"""browser-use Cloud -- https://browser-use.com - -Requires: BROWSER_USE_API_KEY env var. 
-""" +"""browser-use cloud browser provider.""" import os import httpx -from browsers import retry_on_429 +from browsers.util import retry_on_429 + +MAX_CONCURRENT = 200 _session_id: str | None = None +def _api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v2") + return f"{base}/api/{version}" + + +def _api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + async def connect() -> str: global _session_id async def _create(): async with httpx.AsyncClient() as client: resp = await client.post( - "https://api.browser-use.com/api/v2/browsers", - headers={"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]}, + f"{_api_base()}/browsers", + headers={"X-Browser-Use-API-Key": _api_key()}, json={}, timeout=90, ) @@ -36,10 +45,11 @@ async def disconnect() -> None: if not _session_id: return async with httpx.AsyncClient() as client: - await client.patch( - f"https://api.browser-use.com/api/v2/browsers/{_session_id}", - headers={"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]}, + resp = await client.patch( + f"{_api_base()}/browsers/{_session_id}", + headers={"X-Browser-Use-API-Key": _api_key()}, json={"action": "stop"}, timeout=30, ) + resp.raise_for_status() _session_id = None diff --git a/browsers/util.py b/browsers/util.py new file mode 100644 index 0000000..e4ab610 --- /dev/null +++ b/browsers/util.py @@ -0,0 +1,16 @@ +import asyncio + +import httpx + + +async def retry_on_429(fn, max_retries=10, max_wait=30): + """Call fn(), retrying with capped exponential backoff on 429 responses.""" + for attempt in range(max_retries + 1): + try: + return await fn() + except httpx.HTTPStatusError as e: + if e.response.status_code != 429 or attempt == max_retries: + raise + wait = min(2**attempt, max_wait) + print(f"[429] Rate limited, retry {attempt + 1}/{max_retries} in {wait}s") + await asyncio.sleep(wait) diff 
--git a/frameworks/__init__.py b/frameworks/__init__.py new file mode 100644 index 0000000..18c6390 --- /dev/null +++ b/frameworks/__init__.py @@ -0,0 +1,325 @@ +"""Framework registry and shared local evaluation flow. + +This public runner intentionally avoids the remote dispatch/tracing stack. +It loads the encrypted benchmark file, executes one task with a framework +adapter, judges the trace, and writes local JSON artifacts under ignored paths. +""" + +import asyncio +import base64 +import hashlib +import json +import os +import traceback +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Awaitable, Callable + +from browser_use import ChatGoogle +from cryptography.fernet import Fernet + +from judge import JudgementResult, construct_judge_messages + +ROOT_DIR = Path(__file__).resolve().parent.parent +DEFAULT_TASK_TIMEOUT = 1800 + + +def _task_timeout() -> int: + raw = os.environ.get("TASK_TIMEOUT") + if not raw: + return DEFAULT_TASK_TIMEOUT + try: + return int(raw) + except ValueError: + return DEFAULT_TASK_TIMEOUT + + +def parse_params() -> dict[str, str]: + """Parse PARAMS env var. Format: key=value,key=value.""" + raw = os.environ.get("PARAMS", "") + if not raw: + return {} + params: dict[str, str] = {} + for part in raw.split(","): + if "=" not in part: + continue + key, value = part.split("=", 1) + key = key.strip() + if key: + params[key] = value.strip() + return params + + +def validate_params(params: dict[str, str], accepted: dict[str, str]) -> dict[str, str]: + unknown = set(params) - set(accepted) + if unknown: + accepted_keys = ", ".join(sorted(accepted)) or "none" + raise ValueError( + f"Unknown params: {', '.join(sorted(unknown))}. 
Accepted: {accepted_keys}" + ) + return params + + +@dataclass +class ExecutionResult: + final_result: str + steps: list[str] + screenshots_b64: list[str] + num_steps: int + duration_seconds: float + cost: float = 0.0 + + +@dataclass +class FrameworkInfo: + browsers: list[str] + repo: str | None = None + max_concurrent_override: int | None = None + notes: str = "" + + +FRAMEWORKS: dict[str, FrameworkInfo] = { + "browser-use": FrameworkInfo( + browsers=[ + "browser-use-cloud", + "anchor", + "browserbase", + "browserless", + "driver", + "hyperbrowser", + "local_headful", + "local_headless", + "onkernel", + "rebrowser", + "steel", + ], + repo="browser-use/browser-use", + ), + "browser-use-cloud-api-v2": FrameworkInfo(browsers=["integrated"]), + "browser-use-cloud-api-v3": FrameworkInfo(browsers=["integrated"]), + "bcode": FrameworkInfo(browsers=["browser-use-cloud"], repo="browser-use/browsercode"), + "bcode-v012": FrameworkInfo( + browsers=["browser-use-cloud"], + repo="browser-use/browsercode", + notes="Alias for the bcode adapter used with framework_ref v0.1.2.", + ), + "browserbase-agent": FrameworkInfo(browsers=["integrated"]), + "stagehand": FrameworkInfo( + browsers=["browserbase", "local_headless"], + repo="browserbase/stagehand", + notes="Adapter scaffold; executor must be completed before use.", + ), + "claude-code-harness": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-harness" + ), + "claude-code-harness-js": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-harness-js" + ), + "claude-code-harness-ab": FrameworkInfo( + browsers=["browser-use-cloud"], repo="vercel-labs/agent-browser" + ), + "claude-code-harness-bu-cli": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-use" + ), + "codex-harness": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-harness" + ), + "pi-harness": FrameworkInfo( + browsers=["browser-use-cloud"], 
repo="browser-use/browser-harness" + ), + "pibt": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/pi-agent-extensions" + ), + "but": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-use-terminal" + ), + "but-rust": FrameworkInfo( + browsers=["browser-use-cloud"], repo="browser-use/browser-use-terminal" + ), + "claude-cua": FrameworkInfo( + browsers=["integrated"], + notes="Adapter scaffold; not used for the published BU_Bench_V1 runs.", + ), +} + + +def framework_to_module(framework: str) -> str: + if framework == "bcode-v012": + return "bcode" + return framework.replace("-", "_") + + +def interleave(tasks: list[dict]) -> list[dict]: + """Reorder 100 tasks, 20 per section, matching the distributed runner.""" + if os.environ.get("NO_INTERLEAVE") == "1": + return tasks + if len(tasks) != 100: + return tasks + reordered = [] + for i in range(20): + for d in range(5): + reordered.append(tasks[d * 20 + i]) + return reordered + + +def _encrypted_task_file(benchmark: str) -> Path: + candidates = [ + ROOT_DIR / f"{benchmark}.enc", + ROOT_DIR / f"{benchmark.upper()}.enc", + ROOT_DIR / "benchmarks" / f"{benchmark}.enc", + ] + for path in candidates: + if path.exists(): + return path + raise FileNotFoundError( + f"Could not find encrypted task file for {benchmark}. 
Expected " + f"{ROOT_DIR / (benchmark + '.enc')}" + ) + + +def load_tasks(benchmark: str) -> list[dict]: + """Load tasks from the encrypted public artifact without writing plaintext.""" + task_file = _encrypted_task_file(benchmark) + key = base64.urlsafe_b64encode(hashlib.sha256(benchmark.encode()).digest()) + encrypted = base64.b64decode(task_file.read_text()) + return json.loads(Fernet(key).decrypt(encrypted)) + + +JUDGE_LLM = None + + +def _get_judge_llm(): + global JUDGE_LLM + if JUDGE_LLM is None: + JUDGE_LLM = ChatGoogle( + model=os.environ.get("JUDGE_MODEL", "gemini-2.5-flash"), + api_key=os.getenv("GOOGLE_API_KEY"), + ) + return JUDGE_LLM + + +async def _evaluate_task(judge_messages) -> JudgementResult: + response = await _get_judge_llm().ainvoke( + judge_messages, output_format=JudgementResult + ) + return response.completion + + +def _write_json(path: Path, data: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(data, indent=2), encoding="utf-8") + + +def _maybe_write_task_artifact( + task: dict, + result: ExecutionResult | None, + judgement: dict[str, Any] | None, + score: int, + error: str | None = None, + tb: str | None = None, +) -> None: + run_data_dir_raw = os.environ.get("RUN_DATA_DIR") + if not run_data_dir_raw: + return + task_id = task.get("task_id", f"task_{task.get('_index', 'unknown')}") + payload: dict[str, Any] = { + "task_id": task_id, + "score": score, + "judgement": judgement, + } + if result is not None: + payload["agent_trace"] = { + "agent_task": task.get("confirmed_task"), + "final_result": result.final_result, + "agent_steps": result.steps, + "ground_truth": task.get("answer"), + "screenshots_b64": result.screenshots_b64, + } + payload["metrics"] = { + "steps": result.num_steps, + "duration": result.duration_seconds, + "cost": result.cost, + } + if error: + payload["error"] = error + if tb: + payload["traceback"] = tb + _write_json(Path(run_data_dir_raw) / f"{task_id}.json", payload) + + 
+def _maybe_write_local_result(data: dict[str, Any]) -> None: + output = os.environ.get("LOCAL_RESULT_FILE") + if output: + _write_json(Path(output), data) + + +async def run_and_judge( + task: dict, + execute_fn: Callable[[str], Awaitable[ExecutionResult]], +) -> dict[str, Any]: + """Execute one task, judge it, and return a task-level result dict.""" + task_id = task.get("task_id", "unknown") + print(f"Running task: {task_id}") + + try: + result = await asyncio.wait_for( + execute_fn(task["confirmed_task"]), timeout=_task_timeout() + ) + judge_messages = construct_judge_messages( + task=task["confirmed_task"], + final_result=result.final_result, + agent_steps=result.steps, + ground_truth=task.get("answer"), + screenshots_b64=result.screenshots_b64, + ) + judgement = await _evaluate_task(judge_messages) + judgement_data = judgement.model_dump() + score = 1 if judgement.verdict else 0 + print(f"Task {task_id} completed: score={score}") + + data = { + "task_id": task_id, + "task_index": task.get("_index"), + "score": score, + "steps": result.num_steps, + "duration": result.duration_seconds, + "cost": result.cost, + "judgement": judgement_data, + } + _maybe_write_task_artifact(task, result, judgement_data, score) + _maybe_write_local_result(data) + return data + + except asyncio.TimeoutError: + error_msg = f"Timed out after {_task_timeout()}s" + print(f"Task {task_id} timed out after {_task_timeout()}s") + data = { + "task_id": task_id, + "task_index": task.get("_index"), + "score": 0, + "steps": 0, + "duration": _task_timeout(), + "cost": 0, + "error": error_msg, + } + _maybe_write_task_artifact(task, None, None, 0, error=error_msg) + _maybe_write_local_result(data) + return data + + except BaseException as e: + error_msg = f"{type(e).__name__}: {e}" + tb = traceback.format_exc() + print(f"Task {task_id} failed: {error_msg}") + data = { + "task_id": task_id, + "task_index": task.get("_index"), + "score": 0, + "steps": 0, + "duration": 0, + "cost": 0, + "error": 
error_msg, + "traceback": tb, + } + _maybe_write_task_artifact(task, None, None, 0, error=error_msg, tb=tb) + _maybe_write_local_result(data) + return data diff --git a/frameworks/bcode/run_task.py b/frameworks/bcode/run_task.py new file mode 100644 index 0000000..434fb55 --- /dev/null +++ b/frameworks/bcode/run_task.py @@ -0,0 +1,384 @@ +"""Run a single benchmark task using bcode (browsercode). + +bcode is a coding agent (opencode fork) with a built-in browser harness. +We pre-provision a browser-use-cloud session and pass its CDP URL through +`BU_CDP_WS`, which the in-process CDP `Session.connect()` reads as a +default endpoint when the agent calls `session.connect()` with no args +(v0.1.1+). bcode then runs headlessly: + + bcode run --model --format json -- "" + +Stdout is one JSON event per line (tool_use, step_start, step_finish, text, +reasoning, error). We extract steps, final answer, and cost from these +events. + +Screenshots (v0.1.2+): the bcode browser-execute hook taps every +`Page.captureScreenshot` CDP call and (a) auto-attaches the image to the +agent's next assistant turn so the model sees it inline, and (b) when +`BCODE_SCREENSHOT_DIR=` is set, writes the same PNG to disk for the +eval-judge. Files are named `--.` so +sort-by-name is sort-by-time. We point the dump dir at a per-task subdir, +read the PNGs back as base64, and hand them to the judge -- matching the +v0.0.x `/tmp/shots/` flow. Pin `framework_ref >= v0.1.2` to use this hook. + +v0.1.0 vs v0.1.1 vs v0.1.2: v0.1.0 ported the harness from Python (uv + +helpers.py + daemon) to in-process TypeScript (the agent writes JS that +drives a CDP `Session` directly). v0.1.0 dropped honoring `BU_CDP_WS`; +v0.1.1 restored it as a default in `Session.connect()`. v0.1.2 added the +`Page.captureScreenshot` tap (auto-attach + `BCODE_SCREENSHOT_DIR` disk +dump). Pin `framework_ref >= v0.1.2` for screenshot-judging. 
+""" + +import asyncio +import base64 +import json +import os +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from lmnr import Laminar +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", + "fetch_use": "Enable/disable the Browser-Use fetch-use proxy for bcode's webfetch tool (true/false, default: true on v0.1.3+ when BROWSER_USE_API_KEY is set). Setting false injects {\"experimental\":{\"fetch_use\":false}} via OPENCODE_CONFIG_CONTENT so webfetch uses the native HttpClient instead of the proxy. Use for A/B isolating v0.1.3's fetch-use rewrite from other v0.1.3 changes.", +} + +PRE_PROMPT = ( + "You are a coding agent with browser access working fully autonomously. " + "A browser is preconfigured for you: calling `await session.connect()` " + "(no args) inside a `browser_execute` snippet attaches to it. " + "Calling `session.Page.captureScreenshot()` returns the image and the " + "harness auto-attaches it to your next turn so you can see it inline. " + "Take screenshots whenever you need to verify page state. Your final " + "assistant message is what the judge will read as your answer to the " + "task.\n\n" + "Work to complete the following task: {task}" +) + +# bcode is installed via the official curl installer (eval.yaml) which drops +# the binary at $HOME/.bcode/bin/bcode. Resolve once at import time. +BCODE_BIN = str(Path(os.environ["HOME"]) / ".bcode" / "bin" / "bcode") +# Where bcode v0.1.2+ writes Page.captureScreenshot dumps when +# BCODE_SCREENSHOT_DIR is set. Per-task: reset before run, drained after. 
+SHOTS_DIR = Path("/tmp/bcode_shots") + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG/JPEG bcode wrote during this task as base64. + + File naming (v0.1.2): `--.`. Sort + by name to recover capture order across parallel browser_execute calls + (in practice opencode serializes tool calls within one assistant + message, so this is just a stable order). + """ + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.iterdir() if p.is_file() and p.suffix in (".png", ".jpeg", ".jpg")) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _bu_api_base() -> str: + """Resolve the Browser-Use Cloud API base. Default prod, override via env.""" + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + +# Map a benchmark model alias to opencode's `provider/model` slug by checking +# substrings. Avoids a per-model lookup table; new models pass through as long +# as the provider keyphrase is present in the alias. +PROVIDER_KEYPHRASES = ( + ("claude", "anthropic"), + ("gemini", "google"), + ("gemma", "google"), + ("gpt", "openai"), + ("codex", "openai"), +) + + +def _resolve_model_slug(model_name: str) -> str: + if "/" in model_name: + return model_name + lower = model_name.lower() + for key, provider in PROVIDER_KEYPHRASES: + if key in lower: + return f"{provider}/{model_name}" + raise ValueError( + f"bcode: cannot infer provider for model {model_name!r}. " + f"Pass an explicit `provider/model` slug as --model, or add a keyphrase " + f"to PROVIDER_KEYPHRASES in frameworks/bcode/run_task.py." 
+ ) + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser() -> tuple[str, str]: + """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +def _format_step(event: dict) -> str | None: + """Turn one --format=json event into a step string (or None to skip).""" + etype = event.get("type") + part = event.get("part") or {} + if etype == "tool_use": + tool = part.get("tool", "?") + inp = (part.get("state") or {}).get("input") or {} + if tool == "bash": + return f"bash: {(inp.get('command') or '').strip()[:2000]}" + if tool in ("read", "write", "edit"): + return f"{tool}: {inp.get('filePath') or inp.get('path') or ''}" + if tool in ("browser-execute", "browser_execute"): + # v0.1.0+ renamed the snippet field python -> code; v0.0.x used + # python. Read both so this runner works against either harness. 
+ snippet = (inp.get("code") or inp.get("python") or "").strip() + return f"browser_execute: {snippet[:2000]}" + if tool == "webfetch": + return f"webfetch: {inp.get('url') or ''}" + if tool in ("glob", "grep", "codesearch", "websearch"): + return f"{tool}: {inp.get('pattern') or inp.get('query') or ''}" + try: + return f"{tool}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return tool + if etype == "text": + text = (part.get("text") or "").strip() + return f"text: {text[:2000]}" if text else None + if etype == "reasoning": + text = (part.get("text") or "").strip() + return f"thinking: {text[:2000]}" if text else None + return None + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while line := await proc.stderr.readline(): + s = line.decode("utf-8", errors="replace").rstrip("\n") + buf.append(s) + print(f"[bcode-stderr] {s}", flush=True) + + +async def _iter_lines(stream: asyncio.StreamReader): + """Yield one line at a time, tolerant of arbitrarily long lines.""" + buf = bytearray() + while chunk := await stream.read(1 << 16): + buf.extend(chunk) + while (nl := buf.find(b"\n")) >= 0: + yield bytes(buf[:nl]) + del buf[: nl + 1] + if buf: + yield bytes(buf) + + +async def execute(task_description: str) -> ExecutionResult: + params = parse_params() + validate_params(params, ACCEPTED_PARAMS) + model_slug = _resolve_model_slug(os.environ["MODEL"]) + + # browser-use-cloud session via direct API. BU_CDP_WS is read by bcode's + # in-process CDP `Session.connect()` (v0.1.1+) as a default endpoint when + # the agent calls `session.connect()` with no args. v0.0.x's Python + # harness daemon read the same env var. Single env-var keeps the runner + # compatible with both harness eras. + browser_id, cdp_ws = _start_browser() + + parent_span_context = Laminar.serialize_span_context() + # Reset and route the screenshot dump dir BEFORE bcode starts. 
v0.1.2+ + # writes every Page.captureScreenshot result here (in addition to the + # auto-attach to the agent's next turn -- same tap, two consumers). + _reset_shots_dir() + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "BCODE_SCREENSHOT_DIR": str(SHOTS_DIR), + } + if parent_span_context: + env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context + # fetch_use=false -> inject opencode.json-equivalent config disabling the + # fetch-use proxy. OPENCODE_CONFIG_CONTENT is merged with local-scope + # precedence at startup (see packages/opencode/src/config/config.ts:593), + # so this overrides any default bcode would have applied. Schema: + # experimental.fetch_use: bool (v0.1.3+). When BROWSER_USE_API_KEY is set + # AND this flag is true (default), webfetch routes via fetch.browser-use.com; + # setting false falls back to native HttpClient. No-op on 3}] {s[:500]}", flush=True) + + if event.get("type") == "text": + if t := ((event.get("part") or {}).get("text") or "").strip(): + final_text = t + elif event.get("type") == "step_finish": + total_cost += float((event.get("part") or {}).get("cost") or 0.0) + elif event.get("type") == "error": + err = event.get("error") + errors.append(err if isinstance(err, str) else json.dumps(err)) + print(f"[bcode-error] {errors[-1][:500]}", flush=True) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_id) + + duration = time.time() - start + + if proc.returncode not in (0, None) and not final_text and not steps: + raise RuntimeError( + f"bcode exited with code {proc.returncode} before producing output. 
" + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + answer = (final_text or "").strip() + if errors and not answer: + final_result = f"[bcode_error] {errors[0][:500]}" + elif errors: + final_result = f"[bcode_error_recovered] {answer}" + else: + final_result = answer or "[bcode_no_output]" + + return ExecutionResult( + final_result=final_result, + steps=steps, + # v0.1.2+ taps Page.captureScreenshot and writes PNGs to + # BCODE_SCREENSHOT_DIR (set above to SHOTS_DIR). Drain them now so + # the judge sees the same visual signal as on v0.0.x. + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + # Propagate task_timeout param to run_and_judge before it wraps execute(). + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/browser_use/__init__.py b/frameworks/browser_use/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/browser_use/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/browser_use/run_task.py b/frameworks/browser_use/run_task.py new file mode 100644 index 0000000..ac521ab --- /dev/null +++ b/frameworks/browser_use/run_task.py @@ -0,0 +1,113 @@ +"""Run a single benchmark task using the browser-use agent framework.""" + +import os +import sys +import asyncio +import base64 +from pathlib import Path +from functools import partial + +# Add project 
root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from browser_use import Agent, Browser +from lmnr import observe +from browsers import BROWSERS +from models import MODELS +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "use_vision": "Enable/disable vision (screenshots) for the agent (true/false, default: true)", + "framework_repo": "Override GitHub repo for browser-use install (e.g. Alezander9/alex-browser-use). Consumed by the workflow install step, not the runner.", +} + + +def encode_screenshots(paths: list[str]) -> list[str]: + result = [] + for p in paths: + path = Path(p) + if path.exists(): + result.append(base64.b64encode(path.read_bytes()).decode()) + return result + + +@observe(span_type="EXECUTOR") +async def execute( + task_description: str, llm, browser_name: str, use_vision: bool = True +) -> ExecutionResult: + """Run a browser-use agent on the task and return a standardized result.""" + provider = BROWSERS[browser_name] + cdp_url = await provider.connect() + if cdp_url: + browser = Browser(cdp_url=cdp_url) + else: + headless = getattr(provider, "HEADLESS", True) + browser = Browser(headless=headless) + + agent = Agent( + task=task_description, + llm=llm, + browser=browser, + use_judge=False, + use_vision=use_vision, + ) + try: + history = await agent.run() + finally: + try: + await browser.kill() + except Exception: + pass + await provider.disconnect() + + return ExecutionResult( + final_result=history.final_result() or "Agent did not return a result", + steps=history.agent_steps(), + screenshots_b64=encode_screenshots( + [p for p in history.screenshot_paths() if p is not None] + ), + num_steps=history.number_of_steps(), + duration_seconds=history.total_duration_seconds(), + 
cost=history.usage.total_cost if history.usage else 0, + ) + + +async def main(): + params = validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + model_name = os.environ["MODEL"] + eval_id = os.environ["EVAL_ID"] + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + use_vision = params.get("use_vision", "true").lower() != "false" + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + llm = MODELS[model_name]() + execute_fn = partial( + execute, llm=llm, browser_name=browser_name, use_vision=use_vision + ) + await run_and_judge(task, execute_fn) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/browser_use_cloud_api_v2/run_task.py b/frameworks/browser_use_cloud_api_v2/run_task.py new file mode 100644 index 0000000..31c294b --- /dev/null +++ b/frameworks/browser_use_cloud_api_v2/run_task.py @@ -0,0 +1,201 @@ +"""Run a single benchmark task using the Browser Use Cloud API v2. + +Dispatches a task via POST /api/v2/tasks, polls GET /api/v2/tasks/{id} +until completion, then maps the response into ExecutionResult for the judge. +""" + +import asyncio +import base64 +import os +import sys +from functools import partial +from pathlib import Path + +import httpx + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = {} + +API_BASE = "https://api.browser-use.com/api/v2" +POLL_INTERVAL = 5 +TERMINAL_STATUSES = {"finished", "stopped"} + +# V2 SupportedLLMs mapped from our model registry names. 
+# Only models the v2 API actually supports are listed here. +V2_MODEL_MAP = { + "bu-2-0": "browser-use-2.0", + "bu-1-0": "browser-use-llm", + "gpt-4.1": "gpt-4.1", + "gpt-4.1-mini": "gpt-4.1-mini", + "o4-mini": "o4-mini", + "o3": "o3", + "gemini-2.5-flash": "gemini-2.5-flash", + "gemini-2.5-pro": "gemini-2.5-pro", + "gemini-3-pro-preview": "gemini-3-pro-preview", + "gemini-3-flash-preview": "gemini-3-flash-preview", + "gpt-4o": "gpt-4o", + "gpt-4o-mini": "gpt-4o-mini", + "claude-sonnet-4-5": "claude-sonnet-4-5-20250929", + "claude-sonnet-4-6": "claude-sonnet-4-6", + "claude-opus-4-5": "claude-opus-4-5-20251101", + "claude-3-7-sonnet": "claude-3-7-sonnet-20250219", +} + + +def _headers() -> dict: + return {"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]} + + +def _create_task(task_description: str, model: str) -> dict: + """Create a v2 task and return the response (id, sessionId).""" + api_model = V2_MODEL_MAP.get(model, model) + resp = httpx.post( + f"{API_BASE}/tasks", + headers=_headers(), + json={"task": task_description, "llm": api_model}, + timeout=30, + ) + resp.raise_for_status() + return resp.json() + + +def _get_task(task_id: str) -> dict: + """Poll task status.""" + resp = httpx.get( + f"{API_BASE}/tasks/{task_id}", + headers=_headers(), + timeout=30, + ) + resp.raise_for_status() + return resp.json() + + +def _fetch_screenshot_b64(url: str) -> str | None: + """Download a screenshot URL and return base64-encoded bytes.""" + try: + resp = httpx.get(url, timeout=30) + resp.raise_for_status() + return base64.b64encode(resp.content).decode() + except Exception: + return None + + +def _format_step(step: dict) -> str: + """Format a v2 TaskStepView to match browser-use agent_steps() format. 
+ + Ground truth format: + Step N: + Actions: [json array with indent=1] + Result M: (not available from v2 API) + """ + import json as _json + + step_text = f"Step {step.get('number', '?')}:\n" + + actions_raw = step.get("actions", []) + if actions_raw: + parsed = [] + for a in actions_raw: + try: + parsed.append(_json.loads(a)) + except (_json.JSONDecodeError, TypeError): + parsed.append(a) + step_text += f"Actions: {_json.dumps(parsed, indent=1)}\n" + + return step_text + + +def _duration_seconds(task_data: dict) -> float: + """Compute duration from startedAt/finishedAt timestamps.""" + from datetime import datetime + + started = task_data.get("startedAt") + finished = task_data.get("finishedAt") + if not started or not finished: + return 0.0 + try: + t0 = datetime.fromisoformat(started.replace("Z", "+00:00")) + t1 = datetime.fromisoformat(finished.replace("Z", "+00:00")) + return max((t1 - t0).total_seconds(), 0.0) + except Exception: + return 0.0 + + +async def execute(task_description: str, model_name: str) -> ExecutionResult: + """Create a v2 task, poll until done, return ExecutionResult.""" + created = _create_task(task_description, model_name) + task_id = created["id"] + print(f"V2 task created: {task_id}") + + # Poll until terminal + while True: + await asyncio.sleep(POLL_INTERVAL) + task_data = _get_task(task_id) + status = task_data.get("status", "") + if status in TERMINAL_STATUSES: + break + print(f" V2 task {task_id} status: {status}") + + steps = task_data.get("steps", []) + agent_steps = [_format_step(s) for s in steps] + + # Collect screenshots from step URLs + screenshots_b64 = [] + for step in steps: + url = step.get("screenshotUrl") + if url: + img = _fetch_screenshot_b64(url) + if img: + screenshots_b64.append(img) + + output = task_data.get("output") or "No output returned" + cost_str = task_data.get("cost") or "0" + cost = float(cost_str) + duration = _duration_seconds(task_data) + + return ExecutionResult( + final_result=output, + 
steps=agent_steps, + screenshots_b64=screenshots_b64, + num_steps=len(steps), + duration_seconds=duration, + cost=cost, + ) + + +async def main(): + validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + model_name = os.environ["MODEL"] + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + execute_fn = partial(execute, model_name=model_name) + await run_and_judge(task, execute_fn) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/browser_use_cloud_api_v3/run_task.py b/frameworks/browser_use_cloud_api_v3/run_task.py new file mode 100644 index 0000000..53f2c6f --- /dev/null +++ b/frameworks/browser_use_cloud_api_v3/run_task.py @@ -0,0 +1,286 @@ +"""Run a single benchmark task using the Browser Use Cloud API v3 (BU Agent). + +Dispatches a task via POST /api/v3/sessions, polls GET /api/v3/sessions/{id} +until completion, fetches session messages to reconstruct step data, then maps +into ExecutionResult for the judge. + +Step data is reconstructed from the messages endpoint to match the browser-use +agent_steps() ground truth format as closely as possible. Screenshots are not +available from this API. 
+""" + +import asyncio +import json +import os +import sys +from functools import partial +from pathlib import Path + +import httpx + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "skills": "Enable/disable skill memory (true/false, default: true)", +} + +API_BASE = "https://api.browser-use.com/api/v3" +POLL_INTERVAL = 5 +TERMINAL_STATUSES = {"stopped", "error", "timed_out"} + +V3_MODEL_MAP = { + "bu-mini": "bu-mini", + "bu-max": "bu-max", + "bu-ultra": "bu-ultra", +} + +# Map V3 tool names to browser-use action names for ground-truth-like formatting. +TOOL_NAME_MAP = { + "browser_navigate": "navigate", + "browser_type_text": "input", + "browser_wait": "wait", + "browser_click": "click", + "browser_scroll": "scroll", + "browser_go_back": "go_back", + "browser_search_google": "search_google", + "browser_analyze_state": "analyze_state", + "done_autonomous": "done", +} + + +def _headers() -> dict: + return {"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]} + + +def _create_session(task_description: str, model: str, skills: bool = True) -> dict: + api_model = V3_MODEL_MAP.get(model, model) + resp = httpx.post( + f"{API_BASE}/sessions", + headers=_headers(), + json={"task": task_description, "model": api_model, "skills": skills}, + timeout=30, + ) + resp.raise_for_status() + return resp.json() + + +def _get_session(session_id: str) -> dict: + resp = httpx.get( + f"{API_BASE}/sessions/{session_id}", + headers=_headers(), + timeout=30, + ) + resp.raise_for_status() + return resp.json() + + +def _get_all_messages(session_id: str) -> list[dict]: + """Paginate through all messages for a session.""" + all_msgs = [] + after = None + while True: + params = {"limit": 100} + if after: + 
params["after"] = after + resp = httpx.get( + f"{API_BASE}/sessions/{session_id}/messages", + headers=_headers(), + params=params, + timeout=30, + ) + resp.raise_for_status() + data = resp.json() + msgs = data.get("messages", []) + all_msgs.extend(msgs) + if not data.get("hasMore") or not msgs: + break + after = msgs[-1]["id"] + return all_msgs + + +def _parse_messages_to_steps(messages: list[dict]) -> list[str]: + """Convert V3 session messages into ground-truth-style step strings. + + Groups assistant tool_calls with their corresponding tool results. + Formats each group as: + Step N: + Actions: [json array, indent=1] + Result M: + Error M: + """ + # Parse data fields + parsed = [] + for msg in messages: + data_str = msg.get("data", "{}") + try: + data = json.loads(data_str) + except (json.JSONDecodeError, TypeError): + continue + data["_role"] = msg.get("role", data.get("role", "")) + parsed.append(data) + + # Index tool results by tool_call_id + tool_results: dict[str, dict] = {} + for m in parsed: + if m["_role"] == "tool": + tcid = m.get("tool_call_id") + if tcid: + tool_results[tcid] = m + + # Collect all tool_call_ids claimed by assistant messages + claimed_ids: set[str] = set() + + # Build steps from assistant messages that have tool_calls + steps = [] + step_num = 0 + for m in parsed: + if m["_role"] != "assistant": + continue + tool_calls = m.get("tool_calls") + if not tool_calls: + continue + + step_num += 1 + step_text = f"Step {step_num}:\n" + + # Build actions list matching ground truth format + actions = [] + for tc in tool_calls: + claimed_ids.add(tc.get("id", "")) + func = tc.get("function", {}) + raw_name = func.get("name", "unknown") + action_name = TOOL_NAME_MAP.get(raw_name, raw_name) + try: + args = json.loads(func.get("arguments", "{}")) + except (json.JSONDecodeError, TypeError): + args = {} + actions.append({action_name: args}) + + step_text += f"Actions: {json.dumps(actions, indent=1)}\n" + + # Append results/errors from tool messages + 
for j, tc in enumerate(tool_calls): + tcid = tc.get("id") + tr = tool_results.get(tcid) + if not tr: + continue + content = tr.get("content", "") + is_error = tr.get("is_error", False) + if is_error and content: + step_text += f"Error {j + 1}: {content}\n" + elif content: + step_text += f"Result {j + 1}: {content}\n" + + steps.append(step_text) + + # Handle orphaned tool results (e.g. done_autonomous whose assistant + # message was not returned by the API) + for tcid, tr in tool_results.items(): + if tcid in claimed_ids: + continue + tool_name = tr.get("tool_name", "") + action_name = TOOL_NAME_MAP.get(tool_name, tool_name) + content = tr.get("content", "") + if not content: + continue + step_num += 1 + step_text = f"Step {step_num}:\n" + action_obj = [{action_name: {}}] + step_text += f"Actions: {json.dumps(action_obj, indent=1)}\n" + step_text += f"Result 1: {content}\n" + steps.append(step_text) + + return steps + + +def _duration_seconds(session_data: dict) -> float: + from datetime import datetime + + created = session_data.get("createdAt") + updated = session_data.get("updatedAt") + if not created or not updated: + return 0.0 + try: + t0 = datetime.fromisoformat(created.replace("Z", "+00:00")) + t1 = datetime.fromisoformat(updated.replace("Z", "+00:00")) + return max((t1 - t0).total_seconds(), 0.0) + except Exception: + return 0.0 + + +async def execute( + task_description: str, model_name: str, skills: bool = True +) -> ExecutionResult: + """Create a v3 session, poll until done, fetch messages, return ExecutionResult.""" + session_data = _create_session(task_description, model_name, skills=skills) + session_id = session_data["id"] + print(f"V3 session created: {session_id}") + + while True: + await asyncio.sleep(POLL_INTERVAL) + session_data = _get_session(session_id) + status = session_data.get("status", "") + if status in TERMINAL_STATUSES: + break + print(f" V3 session {session_id} status: {status}") + + output = session_data.get("output") + if 
isinstance(output, dict): + output = json.dumps(output) + output = output or "No output returned" + + cost_str = session_data.get("totalCostUsd") or "0" + cost = float(cost_str) + duration = _duration_seconds(session_data) + + # Fetch messages and reconstruct steps + messages = _get_all_messages(session_id) + agent_steps = _parse_messages_to_steps(messages) + + return ExecutionResult( + final_result=output, + steps=agent_steps, + screenshots_b64=[], # Not available from V3 API + num_steps=len(agent_steps), + duration_seconds=duration, + cost=cost, + ) + + +async def main(): + params = validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + model_name = os.environ["MODEL"] + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + skills = params.get("skills", "true").lower() != "false" + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + execute_fn = partial(execute, model_name=model_name, skills=skills) + await run_and_judge(task, execute_fn) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/browserbase_agent/executor.mjs b/frameworks/browserbase_agent/executor.mjs new file mode 100644 index 0000000..dc500f3 --- /dev/null +++ b/frameworks/browserbase_agent/executor.mjs @@ -0,0 +1,181 @@ +/** + * Browserbase Stagehand agent executor (client-side SDK path). + * + * Why client-side and not the hosted REST API: + * `api.stagehand.browserbase.com` is an alpha hosted endpoint running a + * Stagehand server build that predates the opus-4-7 temperature fix + * (Stagehand PRs #2006/#2018, shipped in stagehand-server-v3 v3.6.5 on + * May 6 2026). That endpoint silently rejects opus-4-7 with the + * "`temperature` is deprecated for this model" error from inside the + * Stagehand `fillForm` tool. 
Running the client SDK locally gives us + * whichever Stagehand version we pin in package.json, fix included. + * + * This is also the path Browserbase tells customers to use for + * production (https://docs.stagehand.dev/v3/best-practices/deployments): + * embed the SDK in your backend, point it at Browserbase. The REST API + * is marketed for their Python SDK transport, not for scale-out. + * + * Joint system benchmarked: (Stagehand agent SDK + Browserbase cloud + * browser + model). Same surface as the original .mjs example we built + * for the .bcode workspace, just dispatched programmatically. + * + * Model routing: defaults to Browserbase Model Gateway (Stagehand + * auto-routes through the gateway when only `apiKey` is set on the + * constructor and no provider env key is present). The runner unsets + * provider env keys before spawning this script when STAGEHAND_USE_GATEWAY + * is "1" (default) so the SDK doesn't grab them out of process env. + * + * Env input (read at startup, all required unless noted): + * TASK_DESCRIPTION the task string to run + * STAGEHAND_MODEL gateway slug e.g. anthropic/claude-opus-4-7 + * MAX_STEPS int, default 25 + * BROWSERBASE_API_KEY required (for browser + gateway) + * BROWSERBASE_PROJECT_ID required (Stagehand SDK still wants it) + * STAGEHAND_VERBOSE int 0/1/2, default 1 + * + * Stdout: exactly one JSON object -- the ExecutionResult-shaped dict the + * Python wrapper reads. All progress / logs go to stderr. + */ + +import { Stagehand } from "@browserbasehq/stagehand"; + +const MODEL = process.env.STAGEHAND_MODEL || "anthropic/claude-sonnet-4-6"; +const MAX_STEPS = parseInt(process.env.MAX_STEPS || "25", 10); +const VERBOSE = parseInt(process.env.STAGEHAND_VERBOSE || "1", 10); + +const SYSTEM_PROMPT = + "You are a browser agent running inside an evaluation harness. 
" + + "Solve the user's task by navigating and interacting with the live web.\n\n" + + "When you finish, your final message MUST contain the concrete answer " + + "to the task -- the actual names, numbers, list items, or values you " + + "found. Do not paraphrase the answer as 'I extracted X' or 'I found the " + + "data' -- write the data itself. For lists, write items one per line."; + +function fail(msg, extra = {}) { + // Emit an ExecutionResult-shaped object so the Python side records the + // failure on the datapoint instead of raising -- matches the + // "[browserbase_incomplete] ..." convention from the REST runner. + const out = { + final_result: `[browserbase_incomplete] ${msg}`, + steps: [], + screenshots_b64: [], + num_steps: 0, + duration_seconds: 0, + cost: 0, + error: msg, + ...extra, + }; + process.stdout.write(JSON.stringify(out)); + process.exit(0); +} + +function formatStep(act, i) { + // Stagehand 3.x agent action shape: { type, action?, reasoning?, + // instruction?, pageUrl?, taskCompleted? }. We render one judge-readable + // step per action, same as the REST runner's _format_steps. 
+ const parts = [`Step ${i}:`]; + if (act?.type) parts.push(`Type: ${act.type}`); + if (act?.instruction) parts.push(`Instruction: ${act.instruction}`); + if (act?.action) parts.push(`Action: ${act.action}`); + if (act?.reasoning) parts.push(`Reasoning: ${act.reasoning}`); + if (act?.pageUrl) parts.push(`URL: ${act.pageUrl}`); + if (act?.taskCompleted) parts.push("TaskCompleted: true"); + return parts.join("\n"); +} + +async function main() { + const task = process.env.TASK_DESCRIPTION; + if (!task) fail("TASK_DESCRIPTION env var is required"); + for (const k of ["BROWSERBASE_API_KEY", "BROWSERBASE_PROJECT_ID"]) { + if (!process.env[k]) fail(`missing env var: ${k}`); + } + + process.stderr.write( + `[browserbase-agent] model=${MODEL} maxSteps=${MAX_STEPS}\n` + ); + + const stagehand = new Stagehand({ + env: "BROWSERBASE", + apiKey: process.env.BROWSERBASE_API_KEY, + projectId: process.env.BROWSERBASE_PROJECT_ID, + // With `model` on the constructor and no provider key on env, Stagehand + // routes inference through the Browserbase Model Gateway. The Python + // wrapper scrubs provider keys before spawning us when gateway mode is + // requested (the default). + model: MODEL, + verbose: VERBOSE, + disablePino: true, + logger: (line) => { + if ((line?.level ?? 1) > VERBOSE) return; + const tag = line.level === 0 ? "ERR" : line.level === 2 ? "DBG" : "INF"; + const cat = line.category ? 
`[${line.category}] ` : ""; + process.stderr.write(`[stagehand:${tag}] ${cat}${line.message}\n`); + }, + }); + + const t0 = Date.now(); + try { + await stagehand.init(); + } catch (err) { + fail(`stagehand.init failed: ${err?.message || err}`); + } + + const sessionId = stagehand.browserbaseSessionID; + const recordingUrl = `https://browserbase.com/sessions/${sessionId}`; + process.stderr.write(`[browserbase-agent] session=${sessionId}\n`); + process.stderr.write(`[browserbase-agent] watch=${recordingUrl}\n`); + + const agent = stagehand.agent({ systemPrompt: SYSTEM_PROMPT }); + + let result; + let agentError = null; + try { + result = await agent.execute({ + instruction: task, + maxSteps: MAX_STEPS, + }); + } catch (err) { + agentError = err?.stack || String(err); + process.stderr.write(`[browserbase-agent] agent error: ${agentError}\n`); + } finally { + await stagehand.close().catch(() => {}); + } + + const durationSeconds = (Date.now() - t0) / 1000; + + if (agentError && !result) { + fail(`agent.execute threw: ${agentError}`, { + duration_seconds: durationSeconds, + session_id: sessionId, + recording_url: recordingUrl, + }); + } + + const actions = Array.isArray(result?.actions) ? result.actions : []; + const message = result?.message || "[browserbase_no_output]"; + const completed = !!result?.completed; + const finalResult = + completed || message.startsWith("[browserbase_") + ? message + : `[browserbase_incomplete] ${message}`; + + const out = { + final_result: finalResult, + steps: actions.map((a, i) => formatStep(a, i + 1)), + screenshots_b64: [], // Stagehand agent.execute doesn't surface shots directly. + num_steps: actions.length, + duration_seconds: durationSeconds, + // Token counts are in result.usage but Browserbase gateway pricing + // isn't exposed per-token. Leave at 0 (matches the REST runner) until + // we wire static prices through. 
+ cost: 0, + session_id: sessionId, + recording_url: recordingUrl, + }; + process.stdout.write(JSON.stringify(out)); +} + +main().catch((err) => { + process.stderr.write(`[browserbase-agent] fatal: ${err?.stack || err}\n`); + fail(`fatal: ${err?.message || err}`); +}); diff --git a/frameworks/browserbase_agent/package.json b/frameworks/browserbase_agent/package.json new file mode 100644 index 0000000..58d1e87 --- /dev/null +++ b/frameworks/browserbase_agent/package.json @@ -0,0 +1,9 @@ +{ + "name": "benchmark-browserbase-agent-executor", + "private": true, + "type": "module", + "description": "Node executor for the browserbase-agent eval framework: drives Stagehand SDK against Browserbase cloud. Pinned to a Stagehand version that has the opus-4-7 temperature fix (PRs #2006/#2018, shipped in client 3.4.0).", + "dependencies": { + "@browserbasehq/stagehand": "^3.4.0" + } +} diff --git a/frameworks/browserbase_agent/run_task.py b/frameworks/browserbase_agent/run_task.py new file mode 100644 index 0000000..2e5f1d3 --- /dev/null +++ b/frameworks/browserbase_agent/run_task.py @@ -0,0 +1,216 @@ +"""Run a single benchmark task using the Stagehand agent SDK (client-side) on +Browserbase cloud. + +We used to dispatch against the hosted Stagehand REST API at +`api.stagehand.browserbase.com/v1` (no Node deps, pure Python HTTP). That +endpoint is alpha and pinned to an old Stagehand server build that predates +the opus-4-7 temperature fix (Stagehand PRs #2006/#2018, shipped in +stagehand-server-v3 v3.6.5 on May 6 2026), so opus-4-7 + the hosted API +silently dies inside the Stagehand `fillForm` tool. Out of our control. + +Instead this runner shells out to a Node executor (`executor.mjs`) that +imports `@browserbasehq/stagehand` directly, pinned in package.json to a +client release that has the fix. Same approach Browserbase tells customers +to use for production (deploy the SDK in your own runtime). 
Joint system +benchmarked is unchanged: (Stagehand agent + Browserbase cloud browser + +model). + +Model routing: by default Stagehand auto-routes through the Browserbase +Model Gateway when only the Browserbase API key is set on the constructor +and no provider env key is present. We scrub provider keys +(ANTHROPIC/OPENAI/GOOGLE/GOOGLE_GENERATIVE_AI/GEMINI) from the spawn env +when `use_gateway` (default true) so the SDK picks the gateway path even +though our workflow secrets normally inject them globally. Set +`use_gateway=false` via params to fall back to direct-provider billing +(useful for models the gateway hasn't onboarded yet). + +Concurrency: limited by the Browserbase plan, NOT by our infra. The +framework registry sets `max_concurrent_override` to match +`browsers/browserbase.py` (currently 20). +""" + +import asyncio +import json +import os +import sys +import time +from functools import partial +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_steps": "Max Stagehand agent steps per task (default: 25).", + "use_gateway": ( + "Route inference via Browserbase Model Gateway (true/false, " + "default: true). When true, the runner scrubs provider env keys " + "from the Node subprocess so Stagehand auto-routes via gateway. " + "When false, provider env keys pass through and the SDK bills the " + "provider directly. Use false for models the gateway hasn't " + "onboarded yet." + ), +} + +# Map benchmark model aliases to Stagehand gateway slugs. Slugs +# already containing '/' pass through verbatim. 
+MODEL_MAP = { + "claude-sonnet-4-6": "anthropic/claude-sonnet-4-6", + "claude-sonnet-4-5": "anthropic/claude-sonnet-4-5", + "claude-opus-4-5": "anthropic/claude-opus-4-5", + "claude-opus-4-6": "anthropic/claude-opus-4-6", + "claude-opus-4-7": "anthropic/claude-opus-4-7", + "gpt-5": "openai/gpt-5", + "gpt-5-mini": "openai/gpt-5-mini", + "gemini-2.5-flash": "google/gemini-2.5-flash", + "gemini-2.5-pro": "google/gemini-2.5-pro", +} + +EXECUTOR_DIR = Path(__file__).resolve().parent +EXECUTOR_SCRIPT = EXECUTOR_DIR / "executor.mjs" + +# Provider env keys to scrub when running in gateway mode. Anything Stagehand +# might autoload (per https://docs.stagehand.dev/v3/configuration/models -- +# "Error: API key not found" section). +_PROVIDER_ENV_KEYS = ( + "ANTHROPIC_API_KEY", + "OPENAI_API_KEY", + "GOOGLE_API_KEY", + "GOOGLE_GENERATIVE_AI_API_KEY", + "GEMINI_API_KEY", +) + + +def _resolve_model(model_name: str) -> str: + if "/" in model_name: + return model_name + if model_name in MODEL_MAP: + return MODEL_MAP[model_name] + raise ValueError( + f"Model '{model_name}' is not in MODEL_MAP and is not an explicit " + f"`provider/model` slug. Extend MODEL_MAP or pass an explicit slug." + ) + + +def _build_env(model_slug: str, max_steps: int, use_gateway: bool) -> dict: + """Construct the env dict for the Node subprocess. + + Forwards Browserbase creds + task config. If `use_gateway`, strips + provider keys so Stagehand auto-routes via the Model Gateway. + """ + env = dict(os.environ) + env["STAGEHAND_MODEL"] = model_slug + env["MAX_STEPS"] = str(max_steps) + # Pass through BROWSERBASE_* unchanged (required by SDK). 
+ if use_gateway: + for k in _PROVIDER_ENV_KEYS: + env.pop(k, None) + return env + + +async def execute( + task_description: str, model_name: str, max_steps: int, use_gateway: bool +) -> ExecutionResult: + """Spawn the Node executor, parse its single-JSON stdout into ExecutionResult.""" + model_slug = _resolve_model(model_name) + print( + f"Browserbase Stagehand SDK model_slug={model_slug} " + f"max_steps={max_steps} use_gateway={use_gateway}" + ) + + env = _build_env(model_slug, max_steps, use_gateway) + env["TASK_DESCRIPTION"] = task_description + + t0 = time.time() + # Use asyncio subprocess so run_and_judge's outer asyncio.wait_for can + # cancel us cleanly on timeout. + proc = await asyncio.create_subprocess_exec( + "node", + str(EXECUTOR_SCRIPT), + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=env, + cwd=str(EXECUTOR_DIR), + ) + stdout, stderr = await proc.communicate() + duration = time.time() - t0 + + stderr_text = stderr.decode("utf-8", errors="replace") + if stderr_text: + # Stream Stagehand's logger output to our stdout for runner-log + # debugging. Each line is already prefixed by the executor. + print(stderr_text, end="") + + if proc.returncode != 0: + # The executor's `fail()` path always exits 0 with a valid JSON + # payload, so a non-zero return is a true crash (e.g. Node missing, + # uncaught throw outside main, OOM). Surface as a failed datapoint + # via the run_and_judge exception path. + raise RuntimeError( + f"executor.mjs crashed: returncode={proc.returncode}, " + f"stderr_tail={stderr_text[-500:]!r}" + ) + + try: + data = json.loads(stdout.decode("utf-8", errors="replace")) + except json.JSONDecodeError as e: + raise RuntimeError( + f"executor.mjs produced invalid JSON: {e}; " + f"stdout_head={stdout[:500]!r}" + ) + + # Prefer the executor's measured duration if it set one. 
+ duration_seconds = float(data.get("duration_seconds") or duration) + + return ExecutionResult( + final_result=data.get("final_result", ""), + steps=data.get("steps") or [], + screenshots_b64=data.get("screenshots_b64") or [], + num_steps=int(data.get("num_steps") or 0), + duration_seconds=duration_seconds, + cost=float(data.get("cost") or 0.0), + ) + + +async def main(): + params = validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + model_name = os.environ["MODEL"] + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + max_steps = int(params.get("max_steps", "25")) + use_gateway = params.get("use_gateway", "true").lower() != "false" + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + execute_fn = partial( + execute, + model_name=model_name, + max_steps=max_steps, + use_gateway=use_gateway, + ) + await run_and_judge(task, execute_fn) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/but/__init__.py b/frameworks/but/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/but/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/but/run_task.py b/frameworks/but/run_task.py new file mode 100644 index 0000000..30066cb --- /dev/null +++ b/frameworks/but/run_task.py @@ -0,0 +1,441 @@ +"""Run a single benchmark task using browser-use-terminal (`but`). + +`but` is a browser-specific LLM agent harness: it owns its own agent loop, +provides an editable Python REPL tool with raw CDP helpers +(`goto_url`, `js`, `capture_screenshot`, `click_at_xy`, `fill_input`, ...), +streams screenshots inline to the model, and persists a JSONL event log +per session. See https://github.com/browser-use/browser-use-terminal. 
+ +Browser wiring: we pre-provision a `browser-use-cloud` session via the +v3 API (same pattern as bcode/cch) and hand `but` the WebSocket CDP URL +via `--browser cdp --cdp-ws ` (also exported as `BU_CDP_WS` env; +`but`'s `_first_env("BU_CDP_WS", ...)` honors it as a fallback). `but` +attaches to our pre-allocated browser instead of provisioning one. + +Invocation: + uv run browser-use-terminal run \\ + --state-dir \\ + --provider

--model \\ + --browser cdp --cdp-ws \\ + --max-turns 80 \\ + "\\n\\n" + +`but run` is synchronous: it blocks until the agent calls `done` or hits +`--max-turns`, prints a session metadata JSON to stdout, then exits. The +agent's per-turn signals (tool calls, model usage, screenshots, final +result) live in `/sessions//events.jsonl`. We +parse that file to extract steps, cost, and the final result, and walk +`/browser/screenshots/` to feed PNGs to the judge. + +Provider resolution: benchmark aliases get an explicit `--provider` +chosen by substring (claude->anthropic, gpt->openai, glm->zai, qwen->qwen). +The `openai` provider in `but` reads `OPENAI_API_KEY` (already a +workflow secret); the `codex` provider needs Codex subscription auth +that we do not have on CI, so it is NOT auto-selected. +""" + +import asyncio +import base64 +import json +import os +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from lmnr import Laminar +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", + "max_turns": "Maximum model/tool turns before failing the run (default: 80, passed to `but run --max-turns`).", + "framework_repo": "Override the GitHub repo for browser-use-terminal install (default: browser-use/browser-use-terminal). Consumed by the workflow install step.", + "agent_mode": "Override the agent instruction mode for `but` (auto|browser|codex, default: leave unset -> `but` picks).", +} + +# `but` is installed as a uv-managed Python package at /tmp/but in the +# workflow install step. 
We invoke it via `uv run --project /tmp/but +# browser-use-terminal run ...` from that workdir so the project's +# console_scripts entry point resolves. +BUT_PROJECT_DIR = "/tmp/but" + +# system_prompt.md sits next to this file. +SYSTEM_PROMPT_PATH = Path(__file__).resolve().parent / "system_prompt.md" + +# State dir + screenshot scan path. One per task to avoid cross-talk. +STATE_ROOT = Path("/tmp/but_state") + +# Map benchmark model alias to (provider, model). Order matters: claude +# before gpt because "gpt" is a common prefix and we want claude to win +# on `claude-*` slugs. +_PROVIDER_KEYPHRASES: tuple[tuple[str, str], ...] = ( + ("claude", "anthropic"), + ("gpt", "openai"), + ("o1", "openai"), + ("o3", "openai"), + ("o4", "openai"), + ("glm", "zai"), + ("qwen", "qwen"), +) + + +def _resolve_provider(model_name: str) -> str: + lower = model_name.lower() + for key, provider in _PROVIDER_KEYPHRASES: + if key in lower: + return provider + raise ValueError( + f"but: cannot infer provider for model {model_name!r}. " + f"Add a keyphrase to _PROVIDER_KEYPHRASES in frameworks/but/run_task.py." + ) + + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser() -> tuple[str, str]: + """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +def _format_step_from_event(event: dict) -> str | None: + """Turn one events.jsonl entry into a short step string (or None).""" + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "tool.started": + name = payload.get("name") or "?" + args = payload.get("arguments") or {} + # Python REPL: dump the code field (sometimes named 'code' or 'source'). + if name in ("python", "python_browser"): + code = (args.get("code") or args.get("source") or "").strip() + return f"python: {code[:2000]}" + if name in ("bash", "shell", "shell_start"): + cmd = (args.get("command") or args.get("script") or "").strip() + return f"{name}: {cmd[:2000]}" + if name in ("read", "write", "edit"): + path = args.get("path") or args.get("filePath") or "" + return f"{name}: {path}" + if name == "done": + result = (args.get("result") or "").strip() + return f"done: {result[:2000]}" + try: + return f"{name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return name + if etype == "assistant.message" or etype == "message.assistant": + text = (payload.get("text") or payload.get("content") or "").strip() + return f"text: {text[:2000]}" if text else None + if etype == "reasoning" or etype == "assistant.reasoning": + text = (payload.get("text") or payload.get("content") or "").strip() + return f"thinking: {text[:2000]}" if text else None + return None + + +def _read_events(events_path: Path) -> list[dict]: + if not events_path.exists(): + return [] + events: list[dict] = [] + with 
events_path.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + return events + + +def _collect_screenshots(artifact_dir: Path) -> list[str]: + """Read every PNG/JPEG `but` wrote to /browser/screenshots/.""" + shots_dir = artifact_dir / "browser" / "screenshots" + if not shots_dir.exists(): + return [] + paths = sorted(p for p in shots_dir.iterdir() if p.is_file() and p.suffix.lower() in (".png", ".jpeg", ".jpg")) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _find_session_dir(state_dir: Path, session_id: str | None) -> Path | None: + """Resolve the session dir from state_dir. If session_id is unknown, + pick the most recently modified session subdir.""" + sessions_root = state_dir / "sessions" + if not sessions_root.exists(): + return None + if session_id: + candidate = sessions_root / session_id + if candidate.exists(): + return candidate + subdirs = [p for p in sessions_root.iterdir() if p.is_dir()] + if not subdirs: + return None + return max(subdirs, key=lambda p: p.stat().st_mtime) + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while line := await proc.stderr.readline(): + s = line.decode("utf-8", errors="replace").rstrip("\n") + buf.append(s) + print(f"[but-stderr] {s}", flush=True) + + +async def _iter_lines(stream: asyncio.StreamReader): + buf = bytearray() + while chunk := await stream.read(1 << 16): + buf.extend(chunk) + while (nl := buf.find(b"\n")) >= 0: + yield bytes(buf[:nl]) + del buf[: nl + 1] + if buf: + yield bytes(buf) + + +async def execute(task_description: str) -> ExecutionResult: + params = parse_params() + validate_params(params, ACCEPTED_PARAMS) + model = os.environ["MODEL"] + provider = _resolve_provider(model) + max_turns = int(params.get("max_turns") or 80) + task_idx = os.environ.get("TASK_INDEX", 
"0") + + # Pre-provision the browser. `but` honors `BU_CDP_WS` natively AND we + # pass `--cdp-ws` explicitly with `--browser cdp` to make the attach + # deterministic and visible in the spawn cmdline. + browser_id, cdp_ws = _start_browser() + + # Isolate state dir per task so concurrent runs in the same workflow + # don't collide on the JSONL event log or screenshot dir. + state_dir = STATE_ROOT / f"task-{task_idx}-{int(time.time() * 1000)}" + if state_dir.exists(): + shutil.rmtree(state_dir) + state_dir.mkdir(parents=True) + + # Laminar parent-span: same pattern as bcode. `but` does not (yet) + # honor LMNR_PARENT_SPAN_CONTEXT, so this is a forward-compat hook -- + # passing the env var costs nothing on the current version. + parent_span_context = Laminar.serialize_span_context() + + system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + # Default state-dir for `but`. Explicitly passed below too. + "LLM_BROWSER_STATE_DIR": str(state_dir), + } + if parent_span_context: + env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context + + # NOTE: `--state-dir` is a TOP-LEVEL arg on browser-use-terminal -- it + # must come BEFORE the `run` subcommand, otherwise argparse rejects it + # as an unrecognized argument on `run`. Same for `--config`. 
+ cmd = [ + "uv", "run", "--project", BUT_PROJECT_DIR, "--no-sync", + "browser-use-terminal", + "--state-dir", str(state_dir), + "run", + "--provider", provider, + "--model", model, + "--browser", "cdp", + "--cdp-ws", cdp_ws, + "--max-turns", str(max_turns), + ] + agent_mode = (params.get("agent_mode") or "").strip().lower() + if agent_mode: + cmd.extend(["--agent-mode", agent_mode]) + cmd.append(full_task) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + errors: list[str] = [] + stderr_buf: list[str] = [] + stdout_chunks: list[str] = [] + session_id: str | None = None + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=BUT_PROJECT_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + try: + async for raw in _iter_lines(proc.stdout): + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if line: + stdout_chunks.append(line) + print(f"[but-stdout] {line[:500]}", flush=True) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + proc.kill() + await proc.wait() + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_id) + + duration = time.time() - start + + # Parse the trailing JSON metadata `but run` prints (session.to_dict()). + # Even if we fail to parse, we can still recover from the events.jsonl. + try: + joined = "\n".join(stdout_chunks).strip() + # Take the last balanced {...} block -- `but run` prints exactly one. 
+ last_brace_open = joined.rfind("{") + if last_brace_open != -1: + meta = json.loads(joined[last_brace_open:]) + session_id = str(meta.get("id") or "") or None + except Exception: + session_id = None + + session_dir = _find_session_dir(state_dir, session_id) + events: list[dict] = [] + artifact_dir: Path | None = None + if session_dir is not None: + events_path = session_dir / "events.jsonl" + events = _read_events(events_path) + artifact_dir = session_dir / "artifacts" + + for event in events: + if (s := _format_step_from_event(event)): + steps.append(s) + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "session.done": + done_result = (payload.get("result") or "").strip() + if done_result: + final_text = done_result + elif etype == "model.usage": + cost_usd = payload.get("cost_usd") + if cost_usd is not None: + try: + total_cost += float(cost_usd) + except (TypeError, ValueError): + pass + elif etype in ("tool.failed", "error", "session.failed"): + err = payload.get("error") or payload.get("message") or "" + if err: + errors.append(str(err)) + print(f"[but-error] {str(err)[:500]}", flush=True) + + # Fallback: scrape the last assistant message if `done` was never called. + if not final_text: + for event in reversed(events): + if (event.get("type") or "") in ("assistant.message", "message.assistant"): + text = ((event.get("payload") or {}).get("text") or "").strip() + if text: + final_text = text + break + + if proc.returncode not in (0, None) and not final_text and not steps: + raise RuntimeError( + f"but exited with code {proc.returncode} before producing output. 
" + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + answer = (final_text or "").strip() + if errors and not answer: + final_result = f"[but_error] {errors[0][:500]}" + elif errors: + final_result = f"[but_error_recovered] {answer}" + else: + final_result = answer or "[but_no_output]" + + screenshots = _collect_screenshots(artifact_dir) if artifact_dir is not None else [] + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=screenshots, + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/but/system_prompt.md b/frameworks/but/system_prompt.md new file mode 100644 index 0000000..2a38214 --- /dev/null +++ b/frameworks/but/system_prompt.md @@ -0,0 +1,10 @@ +You are evaluating a benchmark task by driving a real browser via browser-use-terminal (`but`). + +Hard rules: +- A live remote browser is pre-attached to your session via the explicit CDP backend (`--browser cdp`). Do NOT call `cdp_connect` with a different URL, do NOT spawn a new browser, do NOT launch a local Chromium. Just use the browser that is already attached. +- Drive the browser through the Python REPL tool. 
Useful built-ins: `goto_url(url)`, `js(expr)`, `wait_for_load()`, `wait_for_network_idle()`, `capture_screenshot(path=None, attach=True)`, `click_at_xy(x, y)`, `fill_input(selector, text)`, `type_text(text)`, `press_key(key)`, `scroll()`, `recent_console()`, `recent_network_failures()`, and raw `cdp("Method", {...})`. +- Take screenshots whenever you need to verify page state. Calling `capture_screenshot(attach=True)` attaches the image to your next turn so you can see it inline. Screenshots are also saved to disk for the judge. +- Do not ask clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Work fully autonomously. Do not stop early to summarize partial progress -- keep driving the browser until the task is genuinely complete (or you have hit a dead end). +- When the task is complete, call the `done` tool with your final answer as the `result` argument. The judge reads the `result` you pass to `done` as your final answer to the task. +- If the task has no textual answer (e.g. "book a flight"), pass `result="done"` to the `done` tool and describe what you did in your preceding text. diff --git a/frameworks/but_rust/__init__.py b/frameworks/but_rust/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/but_rust/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/but_rust/run_task.py b/frameworks/but_rust/run_task.py new file mode 100644 index 0000000..f54b9b1 --- /dev/null +++ b/frameworks/but_rust/run_task.py @@ -0,0 +1,382 @@ +"""Run a single benchmark task using the Rust browser-use-terminal (`but-rust`). + +This is the rust-rewrite branch of `browser-use/browser-use-terminal`. The +old Python `but` framework wraps `main`; this one wraps `rust-rewrite`. +Completely independent install path + invocation, gated on +`inputs.framework == 'but-rust'` in the workflow so it cannot affect any +other framework. 
+ +Architecture differences vs Python `but`: +- Cargo workspace; the CLI is a Rust binary at + `/target/release/browser-use-terminal`. +- Subcommand-per-provider: `run-openai --model `, plus + `run-codex`, `run-anthropic`, `run-openrouter`. No `--provider` flag. +- No `--browser` flag at all. Browser ops live in a Python worker + process (`python/llm_browser_worker/worker.py`) spawned by Rust; that + worker honors `BU_CDP_URL`/`BU_CDP_WS` and connects through the + browser-harness Python package. We pre-provision a browser-use-cloud + CDP WS the same way `but`/`bcode` do and pass it via `BU_CDP_WS`. +- State lives in SQLite at `/state.db`; events are read out + via `events ` (JSON lines). +Browser harness needs to be importable in the worker venv as +`browser_harness`. The workflow's install step `uv pip install` the +browser-harness repo at `BUT_RUST_HARNESS_REF` (default: main) into the +project venv at `/tmp/but-rust/.venv` so `import browser_harness.admin` +in the worker resolves. + +The runner shells out twice per task: +1. `run-openai/run-codex/...` -- agent loop, prints session_id on stdout. +2. `events ` -- JSON-lines event dump, parsed into steps + + final result + cost + screenshot paths. 
+""" + +import asyncio +import base64 +import json +import os +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from lmnr import Laminar +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", + "max_turns": "Maximum model/tool turns before failing (currently informational -- but-rust does not expose this on the run-* subcommand; the dataset-run subcommand has it but we are not using that path).", + "framework_repo": "Override the GitHub repo for browser-use-terminal install (default: browser-use/browser-use-terminal). Consumed by the workflow install step.", + "harness_repo": "Override the browser-harness GitHub repo installed into the worker venv (default: browser-use/browser-harness). Consumed by the workflow install step.", + "harness_ref": "Override the browser-harness ref/branch/commit (default: main). Consumed by the workflow install step.", +} + +# Workflow install step builds the binary here. +BUT_RUST_REPO_DIR = "/tmp/but-rust" +BUT_RUST_BIN = f"{BUT_RUST_REPO_DIR}/target/release/browser-use-terminal" + +SYSTEM_PROMPT_PATH = Path(__file__).resolve().parent / "system_prompt.md" + +STATE_ROOT = Path("/tmp/but_rust_state") + +# Map benchmark model alias to (rust subcommand, model arg). Order matters. +_PROVIDER_SUBCMDS: tuple[tuple[str, str], ...] = ( + ("claude", "run-anthropic"), + ("gpt", "run-openai"), + ("o1", "run-openai"), + ("o3", "run-openai"), + ("o4", "run-openai"), + # No native zai/qwen in but-rust; route via OpenRouter when the model + # name carries an OpenRouter-compatible provider/model slug. 
+ ("glm", "run-openrouter"), + ("qwen", "run-openrouter"), +) + + +def _resolve_subcommand(model_name: str) -> str: + lower = model_name.lower() + for key, subcmd in _PROVIDER_SUBCMDS: + if key in lower: + return subcmd + raise ValueError( + f"but-rust: cannot infer subcommand for model {model_name!r}. " + f"Add a keyphrase to _PROVIDER_SUBCMDS in frameworks/but_rust/run_task.py." + ) + + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser() -> tuple[str, str]: + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +def _format_step_from_event(event: dict) -> str | None: + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "tool.started": + name = payload.get("name") or "?" 
+ args = payload.get("arguments") or {} + if name == "python": + code = (args.get("code") or args.get("source") or "").strip() + return f"python: {code[:2000]}" + if name in ("bash", "shell"): + cmd = (args.get("command") or args.get("script") or "").strip() + return f"{name}: {cmd[:2000]}" + if name in ("read", "write", "edit"): + path = args.get("path") or args.get("filePath") or "" + return f"{name}: {path}" + if name == "done": + result = (args.get("result") or "").strip() + return f"done: {result[:2000]}" + try: + return f"{name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return name + if etype in ("assistant.message", "message.assistant"): + text = (payload.get("text") or payload.get("content") or "").strip() + return f"text: {text[:2000]}" if text else None + if etype in ("reasoning", "assistant.reasoning"): + text = (payload.get("text") or payload.get("content") or "").strip() + return f"thinking: {text[:2000]}" if text else None + return None + + +async def _read_stream(stream: asyncio.StreamReader, label: str, buf: list[str], echo: bool = True) -> None: + while line := await stream.readline(): + s = line.decode("utf-8", errors="replace").rstrip("\n") + buf.append(s) + if echo: + print(f"[{label}] {s[:500]}", flush=True) + + +def _collect_screenshots(state_dir: Path, session_id: str) -> list[str]: + """Read PNGs/JPEGs from `/artifacts//images/`.""" + images_dir = state_dir / "artifacts" / session_id / "images" + if not images_dir.exists(): + return [] + paths = sorted(p for p in images_dir.iterdir() if p.is_file() and p.suffix.lower() in (".png", ".jpeg", ".jpg")) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +async def execute(task_description: str) -> ExecutionResult: + params = parse_params() + validate_params(params, ACCEPTED_PARAMS) + model = os.environ["MODEL"] + subcommand = _resolve_subcommand(model) + task_idx = os.environ.get("TASK_INDEX", "0") + + browser_id, cdp_ws = _start_browser() + + 
state_dir = STATE_ROOT / f"task-{task_idx}-{int(time.time() * 1000)}" + if state_dir.exists(): + shutil.rmtree(state_dir) + state_dir.mkdir(parents=True) + + parent_span_context = Laminar.serialize_span_context() + + system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + + env = { + **os.environ, + # The Python worker (spawned by the Rust agent loop) honors BU_CDP_WS + # directly via `_ensure_managed_chrome`/`_ensure_cloud_browser` short + # circuits. Pass both URL forms for robustness. + "BU_CDP_WS": cdp_ws, + # Force flush on one-shot CLI runs so OTLP spans actually leave the + # process before exit (see docs/README on this branch). + "LLM_BROWSER_LAMINAR_FLUSH_ON_FINISH": "1", + } + if parent_span_context: + # Forward-compat: but-rust telemetry doesn't honor this yet, but it + # doesn't error on unknown env either. + env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context + + # `--state-dir` is a TOP-LEVEL arg on the Rust CLI -- must come BEFORE + # the subcommand. 
+ cmd_run = [ + BUT_RUST_BIN, + "--state-dir", str(state_dir), + subcommand, + full_task, + "--model", model, + ] + + start = time.time() + stdout_buf: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd_run, + cwd=BUT_RUST_REPO_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stdout_task = asyncio.create_task(_read_stream(proc.stdout, "but-rust-stdout", stdout_buf)) + stderr_task = asyncio.create_task(_read_stream(proc.stderr, "but-rust-stderr", stderr_buf)) + + try: + await proc.wait() + await asyncio.wait_for(stdout_task, timeout=10) + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + for t in (stdout_task, stderr_task): + if not t.done(): + t.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + + # `run-openai`/etc print the session_id as the final non-empty stdout line. + session_id = "" + for line in reversed(stdout_buf): + line = line.strip() + if line and not line.startswith("{"): + session_id = line + break + + if not session_id: + _stop_browser(browser_id) + raise RuntimeError( + f"but-rust: no session_id captured from stdout (exit={proc.returncode}). " + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + # Dump events for this session. 
+ cmd_events = [BUT_RUST_BIN, "--state-dir", str(state_dir), "events", session_id] + events_proc = await asyncio.create_subprocess_exec( + *cmd_events, + cwd=BUT_RUST_REPO_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + events_stdout, events_stderr = await events_proc.communicate() + _stop_browser(browser_id) + duration = time.time() - start + + events: list[dict] = [] + for line in events_stdout.decode("utf-8", errors="replace").splitlines(): + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + errors: list[str] = [] + + for event in events: + if (s := _format_step_from_event(event)): + steps.append(s) + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "session.done": + done_result = (payload.get("result") or "").strip() + if done_result: + final_text = done_result + elif etype in ("model.usage", "llm.usage"): + cost_usd = payload.get("cost_usd") or payload.get("cost") + if cost_usd is not None: + try: + total_cost += float(cost_usd) + except (TypeError, ValueError): + pass + elif etype in ("tool.failed", "session.failed", "error"): + err = payload.get("error") or payload.get("message") or "" + if err: + errors.append(str(err)) + print(f"[but-rust-error] {str(err)[:500]}", flush=True) + + if not final_text: + for event in reversed(events): + if (event.get("type") or "") in ("assistant.message", "message.assistant"): + text = ((event.get("payload") or {}).get("text") or "").strip() + if text: + final_text = text + break + + if proc.returncode not in (0, None) and not final_text and not steps: + raise RuntimeError( + f"but-rust exited with code {proc.returncode} before producing output. 
" + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + answer = (final_text or "").strip() + if errors and not answer: + final_result = f"[but_rust_error] {errors[0][:500]}" + elif errors: + final_result = f"[but_rust_error_recovered] {answer}" + else: + final_result = answer or "[but_rust_no_output]" + + screenshots = _collect_screenshots(state_dir, session_id) + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=screenshots, + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/but_rust/system_prompt.md b/frameworks/but_rust/system_prompt.md new file mode 100644 index 0000000..dca9a86 --- /dev/null +++ b/frameworks/but_rust/system_prompt.md @@ -0,0 +1,10 @@ +You are evaluating a benchmark task by driving a real browser via the Rust browser-use-terminal (`but-rust`). + +Hard rules: +- A live remote browser is pre-attached for you. The Python worker that owns browser ops reads `BU_CDP_WS` from its env and connects through browser-harness, so do NOT spawn a new browser, do NOT change the CDP endpoint. +- Drive the browser through the Python tool. 
Useful browser-harness helpers exposed in the Python namespace include `goto_url(url)`, `js(expr)`, `wait_for_load()`, `wait_for_network_idle()`, `capture_screenshot(path=None, attach=True)`, `click_at_xy(x, y)`, `fill_input(selector, text)`, `type_text(text)`, `press_key(key)`, `scroll()`, `recent_console()`, `recent_network_failures()`, and raw `cdp("Method", {...})`. +- Take screenshots whenever you need to verify page state. Calling `capture_screenshot(attach=True)` attaches the image to your next turn so you can see it inline. Screenshots are also saved to the session artifact dir for the judge. +- Do not ask clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Work fully autonomously. Do not stop early to summarize partial progress -- keep driving the browser until the task is genuinely complete (or you have hit a dead end). +- When the task is complete, call the `done` tool with your final answer as the `result` argument. The judge reads the `result` you pass to `done` as your final answer to the task. +- If the task has no textual answer (e.g. "book a flight"), pass `result="done"` to the `done` tool and describe what you did in your preceding text. diff --git a/frameworks/claude_code_harness/__init__.py b/frameworks/claude_code_harness/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/claude_code_harness/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/claude_code_harness/run_task.py b/frameworks/claude_code_harness/run_task.py new file mode 100644 index 0000000..dc0e18e --- /dev/null +++ b/frameworks/claude_code_harness/run_task.py @@ -0,0 +1,436 @@ +"""Run a single benchmark task using Claude Code driving browser-harness. 
+ +This framework wraps Claude Code (the CLI coding agent) around the browser-harness +repo: Claude Code owns the agent loop, we just hand it a task and a workdir +pre-loaded with the harness + a live browser daemon, then stream-parse its output. + +The joint system being benchmarked is (Claude Code + browser-harness + Claude model). +Pin `claude_code_version` and `framework_ref` for reproducible comparisons. +""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +# Harness is installed via `uv pip install /tmp/browser-harness` in the workflow, +# which exposes `admin`, `helpers`, `run`, `daemon` as top-level modules. +HARNESS_DIR = "/tmp/browser-harness" + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_turns": "Max Claude Code agentic turns (default: 100)", + "max_budget_usd": "Per-task API budget cap in USD (default: 10)", + "claude_code_version": "Claude Code npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-harness install (e.g. fork/browser-harness). Consumed by the workflow install step.", + "use_bare": "Pass --bare to claude to skip hook/MCP/plugin auto-discovery (true/false, default: true)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# Subtypes Claude Code emits in the terminal `result` event. 
Anything other than +# 'success' means the agent did not complete the task (usually a limit was hit). +# See: https://docs.claude.com/en/docs/claude-code/headless (stream-json spec) +RESULT_SUCCESS = "success" +LIMIT_SUBTYPES = { + "error_max_turns", + "error_max_tokens", + "error_max_budget_usd", + "error_during_execution", + "error_api_error", +} + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models (Claude Code requires them).""" + if not model_name.startswith("claude-"): + raise ValueError( + f"claude-code-harness requires a Claude model. Got: {model_name!r}. " + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _start_browser(browser_name: str, bu_name: str) -> dict: + """Provision a browser for the harness to attach to. 
Returns the cloud browser dict.""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for claude-code-harness: {browser_name}") + sys.path.insert(0, HARNESS_DIR) + from admin import start_remote_daemon # type: ignore + + return start_remote_daemon(name=bu_name) + + +def _stop_browser(browser_name: str, bu_name: str) -> None: + try: + sys.path.insert(0, HARNESS_DIR) + from admin import stop_remote_daemon # type: ignore + + if browser_name == "browser-use-cloud": + stop_remote_daemon(name=bu_name) + except Exception as e: + print(f"Warning: failed to stop harness daemon: {e}") + + +def _build_claude_cmd( + task_description: str, + model_name: str, + max_turns: int, + max_budget_usd: float, + use_bare: bool, +) -> list[str]: + cmd = [ + "claude", + "-p", + task_description, + "--model", + model_name, + "--dangerously-skip-permissions", + "--output-format", + "stream-json", + "--verbose", + "--max-turns", + str(max_turns), + "--max-budget-usd", + str(max_budget_usd), + "--append-system-prompt-file", + str(SYSTEM_PROMPT_FILE), + "--no-session-persistence", + ] + if use_bare: + cmd.append("--bare") + return cmd + + +def _format_assistant_block(block: dict) -> str | None: + """Turn a single assistant message content block into a step string.""" + btype = block.get("type") + if btype == "tool_use": + name = block.get("name", "?") + inp = block.get("input", {}) or {} + if name == "Bash": + return f"Bash: {(inp.get('command') or '').strip()[:2000]}" + if name in ("Edit", "Write", "Read"): + path = inp.get("file_path") or inp.get("path") or "" + return f"{name}: {path}" + try: + return f"{name}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return name + if btype == "text": + text = (block.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if btype == "thinking": + text = (block.get("thinking") or "").strip() + if not text: + return None + return f"thinking: {text[:2000]}" + return 
None + + +def _format_tool_result_block(block: dict) -> str | None: + """Turn a user message tool_result block into a step string.""" + if block.get("type") != "tool_result": + return None + content = block.get("content") + is_error = bool(block.get("is_error")) + prefix = "tool_error" if is_error else "tool_result" + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + if c.get("type") == "text": + parts.append(c.get("text", "")) + elif c.get("type") == "image": + parts.append("") + content = "\n".join(parts) + if not isinstance(content, str): + try: + content = json.dumps(content, default=str) + except Exception: + content = str(content) + content = content.strip() + if not content: + return None + # Cap per-step size so Laminar payloads stay reasonable. + return f"{prefix}: {content[:2000]}" + + +def _format_event_steps(event: dict) -> list[str]: + """Extract step strings from any stream-json event. Empty list = not a step.""" + etype = event.get("type") + if etype == "assistant": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_assistant_block(block) + if s: + steps.append(s) + return steps + if etype == "user": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_tool_result_block(block) + if s: + steps.append(s) + return steps + return [] + + +def _summarize_result_event(event: dict) -> tuple[str, bool, list[str]]: + """Parse the terminal `result` event. 
Returns (subtype, is_error, errors).""" + subtype = event.get("subtype") or RESULT_SUCCESS + is_error = bool(event.get("is_error")) + errors_raw = event.get("errors") or [] + errors = [str(e) for e in errors_raw] if isinstance(errors_raw, list) else [str(errors_raw)] + return subtype, is_error, errors + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + """Read stderr line-by-line, echo to our stdout, and buffer for later reporting.""" + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + # Surface to GitHub Actions log in real time. + print(f"[claude-stderr] {s}", flush=True) + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + task_index = os.environ.get("TASK_INDEX", "0") + max_turns = int(params.get("max_turns", "100")) + max_budget_usd = float(params.get("max_budget_usd", "10")) + use_bare = params.get("use_bare", "true").lower() != "false" + # task_timeout is consumed in main() before run_and_judge wraps execute. + + bu_name = f"eval-{task_index}" + _reset_shots_dir() + + # Pre-provision the browser so Claude starts with a live CDP attach. 
+ _start_browser(browser_name, bu_name) + + env = { + **os.environ, + "BU_NAME": bu_name, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + result_subtype: str | None = None + result_is_error = False + result_errors: list[str] = [] + stderr_buf: list[str] = [] + + # claude stream-json lines can be huge (tool_result blocks with full page + # HTML/text, assistant messages with signed thinking blocks). Default + # asyncio StreamReader line buffer is 64 KiB which raises ValueError on + # long lines, and even a larger limit has a ceiling. Read raw chunks and + # split on newlines ourselves. + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + """Yield one stream-json line at a time, regardless of line length.""" + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 # 64 KiB + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + # Emit every complete line in the buffer. 
+ while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + # Non-JSON line from claude (shouldn't happen in stream-json, but be safe) + print(f"[claude-stdout-raw] {line}", flush=True) + continue + + new_steps = _format_event_steps(event) + for s in new_steps: + steps.append(s) + # Echo each step so GitHub Actions log shows live progress. + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + # Terminal event + if event.get("type") == "result": + final_text = event.get("result") or "" + total_cost = float(event.get("total_cost_usd") or 0.0) + result_subtype, result_is_error, result_errors = _summarize_result_event(event) + print( + f"[claude-result] subtype={result_subtype} is_error={result_is_error} " + f"cost=${total_cost:.4f} errors={result_errors}", + flush=True, + ) + + # Wait for the process (stdout closed implies near-exit) + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[claude-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + # Drain remaining stderr + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_name, bu_name) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + # If we never saw a `result` event AND claude exited non-zero, that is a true + # hard error (e.g. CLI startup failure, killed by OS). 
Surface it. + if result_subtype is None and proc.returncode not in (0, None): + raise RuntimeError( + f"claude exited with code {proc.returncode} before emitting a result event. " + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + # Determine final_result text. + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + if result_subtype and result_subtype != RESULT_SUCCESS: + # Agent hit a limit or errored but Claude Code reported it cleanly. + # Preserve the datapoint: tag the final_result with the subtype and let the + # judge score whatever was accomplished. + err_suffix = f" errors={result_errors}" if result_errors else "" + if answer: + final_result = f"[{result_subtype}] {answer}{err_suffix}" + else: + final_result = f"[{result_subtype}] Agent did not complete task.{err_suffix}" + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + # Propagate task_timeout param to run_and_judge before it wraps execute(). 
+ early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/claude_code_harness/system_prompt.md b/frameworks/claude_code_harness/system_prompt.md new file mode 100644 index 0000000..d1566de --- /dev/null +++ b/frameworks/claude_code_harness/system_prompt.md @@ -0,0 +1,13 @@ +You are evaluating a benchmark task by driving a real browser through the browser-harness in the current working directory. + +Hard rules: +- Use the harness. Read `SKILL.md` and `helpers.py` first. Drive the browser via `browser-harness <<'PY' ... PY` heredocs -- do not install other browser tools, do not use Playwright directly, do not open a different repo. +- A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. +- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Do not edit files outside the current working directory. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. 
The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. diff --git a/frameworks/claude_code_harness_ab/__init__.py b/frameworks/claude_code_harness_ab/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/claude_code_harness_ab/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/claude_code_harness_ab/run_task.py b/frameworks/claude_code_harness_ab/run_task.py new file mode 100644 index 0000000..baddc77 --- /dev/null +++ b/frameworks/claude_code_harness_ab/run_task.py @@ -0,0 +1,460 @@ +"""Run a single benchmark task using Claude Code driving the vercel-labs/agent-browser CLI. + +This is the `agent-browser`-CDP variant of `claude-code-harness`. Claude Code owns +the agent loop; the agent drives a remote Chrome via the `agent-browser` CLI +(native Rust, single-process daemon). We pre-provision a browser-use-cloud session +and pass its WebSocket CDP URL via `BU_CDP_WS`; the agent connects with +`agent-browser --cdp "$BU_CDP_WS" open ` and the daemon auto-reattaches on +subsequent calls. + +The joint system being benchmarked is (Claude Code + agent-browser + Claude +model). Pin `claude_code_version`, `agent_browser_version`, and `framework_ref` +for reproducible comparisons against the Python `claude-code-harness` and +`claude-code-harness-js` frameworks. 
+""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_turns": "Max Claude Code agentic turns (default: 100)", + "max_budget_usd": "Per-task API budget cap in USD (default: 10)", + "claude_code_version": "Claude Code npm version; consumed by the workflow install step (default: latest)", + "agent_browser_version": "agent-browser npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for agent-browser install (e.g. fork/agent-browser). Consumed by the workflow install step.", + "use_bare": "Pass --bare to claude to skip hook/MCP/plugin auto-discovery (true/false, default: true)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +WORK_DIR = Path("/tmp/cch-ab-workdir") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# Subtypes Claude Code emits in the terminal `result` event. Anything other than +# 'success' means the agent did not complete the task (usually a limit was hit). 
+RESULT_SUCCESS = "success" +LIMIT_SUBTYPES = { + "error_max_turns", + "error_max_tokens", + "error_max_budget_usd", + "error_during_execution", + "error_api_error", +} + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models (Claude Code requires them).""" + if not model_name.startswith("claude-"): + raise ValueError( + f"claude-code-harness-ab requires a Claude model. Got: {model_name!r}. " + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_dir(p: Path) -> None: + if p.exists(): + shutil.rmtree(p) + p.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +# ---- Browser-Use Cloud session provisioning (mirrors cch-js / bcode runners) ---- + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser(browser_name: str) -> tuple[str, str]: + """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for claude-code-harness-ab: {browser_name}") + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +# ---- Claude Code invocation (identical to CCH/CCH-JS except for cwd) ---- + +def _build_claude_cmd( + task_description: str, + model_name: str, + max_turns: int, + max_budget_usd: float, + use_bare: bool, +) -> list[str]: + cmd = [ + "claude", + "-p", + task_description, + "--model", + model_name, + "--dangerously-skip-permissions", + "--output-format", + "stream-json", + "--verbose", + "--max-turns", + str(max_turns), + "--max-budget-usd", + str(max_budget_usd), + "--append-system-prompt-file", + str(SYSTEM_PROMPT_FILE), + "--no-session-persistence", + ] + if use_bare: + cmd.append("--bare") + return cmd + + +def _format_assistant_block(block: dict) -> str | None: + btype = block.get("type") + if btype == "tool_use": + name = block.get("name", "?") + inp = block.get("input", {}) or {} + if name == "Bash": + return f"Bash: {(inp.get('command') or '').strip()[:2000]}" + if name in ("Edit", "Write", "Read"): + path = inp.get("file_path") or inp.get("path") or "" + return f"{name}: {path}" + try: + return f"{name}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return name + if btype == "text": + text = (block.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if btype == "thinking": + text = (block.get("thinking") or "").strip() + if not text: + return None + return f"thinking: {text[:2000]}" + return None + + 
+def _format_tool_result_block(block: dict) -> str | None: + if block.get("type") != "tool_result": + return None + content = block.get("content") + is_error = bool(block.get("is_error")) + prefix = "tool_error" if is_error else "tool_result" + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + if c.get("type") == "text": + parts.append(c.get("text", "")) + elif c.get("type") == "image": + parts.append("") + content = "\n".join(parts) + if not isinstance(content, str): + try: + content = json.dumps(content, default=str) + except Exception: + content = str(content) + content = content.strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +def _format_event_steps(event: dict) -> list[str]: + etype = event.get("type") + if etype == "assistant": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_assistant_block(block) + if s: + steps.append(s) + return steps + if etype == "user": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_tool_result_block(block) + if s: + steps.append(s) + return steps + return [] + + +def _summarize_result_event(event: dict) -> tuple[str, bool, list[str]]: + subtype = event.get("subtype") or RESULT_SUCCESS + is_error = bool(event.get("is_error")) + errors_raw = event.get("errors") or [] + errors = [str(e) for e in errors_raw] if isinstance(errors_raw, list) else [str(errors_raw)] + return subtype, is_error, errors + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + print(f"[claude-stderr] {s}", flush=True) + + +async def _close_agent_browser_sessions() -> None: + """Best-effort: tell agent-browser 
to shut down all daemons. + + agent-browser spawns a per-session background daemon (one per + `--session` name). `close --all` quits every active session so a + leaked daemon does not survive across tasks on the same runner. + """ + try: + stop_proc = await asyncio.create_subprocess_exec( + "agent-browser", "close", "--all", + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await asyncio.wait_for(stop_proc.wait(), timeout=10) + except Exception: + pass + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + max_turns = int(params.get("max_turns", "100")) + max_budget_usd = float(params.get("max_budget_usd", "10")) + use_bare = params.get("use_bare", "true").lower() != "false" + + _reset_dir(SHOTS_DIR) + _reset_dir(WORK_DIR) + + # Pre-provision a remote browser; pass its WS URL to the agent via env. + # The agent runs: agent-browser --cdp "$BU_CDP_WS" open + browser_id, cdp_ws = _start_browser(browser_name) + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + # Don't let agent-browser try to auto-download Chrome at task time -- the + # workflow already ran `agent-browser install` and a remote browser is + # attached via --cdp anyway. 
+ "AGENT_BROWSER_SKIP_INSTALL": "1", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + result_subtype: str | None = None + result_is_error = False + result_errors: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[claude-stdout-raw] {line}", flush=True) + continue + + new_steps = _format_event_steps(event) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + if event.get("type") == "result": + final_text = event.get("result") or "" + total_cost = float(event.get("total_cost_usd") or 0.0) + result_subtype, result_is_error, result_errors = _summarize_result_event(event) + print( + f"[claude-result] subtype={result_subtype} is_error={result_is_error} " + f"cost=${total_cost:.4f} errors={result_errors}", + flush=True, + ) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[claude-runner] proc did not exit within 60s of stdout close; killing", 
flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + # Best-effort: close any agent-browser daemon(s) the agent left running + # so they don't leak across tasks on the same runner. + await _close_agent_browser_sessions() + _stop_browser(browser_id) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + if result_subtype is None and proc.returncode not in (0, None): + raise RuntimeError( + f"claude exited with code {proc.returncode} before emitting a result event. " + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + if result_subtype and result_subtype != RESULT_SUCCESS: + err_suffix = f" errors={result_errors}" if result_errors else "" + if answer: + final_result = f"[{result_subtype}] {answer}{err_suffix}" + else: + final_result = f"[{result_subtype}] Agent did not complete task.{err_suffix}" + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + 
task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/claude_code_harness_ab/system_prompt.md b/frameworks/claude_code_harness_ab/system_prompt.md new file mode 100644 index 0000000..eec2715 --- /dev/null +++ b/frameworks/claude_code_harness_ab/system_prompt.md @@ -0,0 +1,29 @@ +You are evaluating a benchmark task by driving a real browser through the `agent-browser` CLI from `vercel-labs/agent-browser`. + +Hard rules: +- Use the `agent-browser` CLI for every browser interaction. It is on your PATH. Do NOT install other browser tools, do NOT use Playwright/Puppeteer directly, do NOT call any built-in WebFetch -- drive the live browser via `agent-browser` only. +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from your environment and running: + ``` + agent-browser --cdp "$BU_CDP_WS" open + ``` + All subsequent `agent-browser ` calls automatically reuse this daemon -- you do NOT need to pass `--cdp` again, and you should NOT call `agent-browser open` a second time without a URL. Just issue the next verb (`snapshot`, `click @e2`, `screenshot`, etc.). +- Before issuing your first command, read the bundled skill so you know the full command surface and current best-practice workflow: + ``` + agent-browser skills get core + ``` + Use `agent-browser skills get core --full` for the complete command reference. The CLI also accepts `--help` on any subcommand. +- Prefer the accessibility-tree workflow: `agent-browser snapshot -i` to list interactive elements with stable `@eN` refs, then `agent-browser click @eN` / `agent-browser fill @eN ""` to interact. Fall back to CSS selectors or `find role --name "..."` semantic locators when refs are insufficient. 
Save every screenshot to `/tmp/shots/step_<N>.png`
We pre-provision a +browser-use-cloud session and pass its WebSocket CDP URL via `BU_CDP_WS`; the +agent connects with `browser-use --cdp-url "$BU_CDP_WS" open ` and the +per-session daemon reuses that attachment for all subsequent commands. + +The joint system being benchmarked is (Claude Code + browser-use CLI + Claude +model). Pin `claude_code_version` and `framework_ref` (the `browser-use/browser-use` +ref) for reproducible comparisons against `claude-code-harness`, +`claude-code-harness-js`, and `claude-code-harness-ab`. +""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_turns": "Max Claude Code agentic turns (default: 100)", + "max_budget_usd": "Per-task API budget cap in USD (default: 10)", + "claude_code_version": "Claude Code npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-use install (e.g. fork/browser-use). Consumed by the workflow install step.", + "use_bare": "Pass --bare to claude to skip hook/MCP/plugin auto-discovery (true/false, default: true)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +WORK_DIR = Path("/tmp/cch-bu-cli-workdir") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# Subtypes Claude Code emits in the terminal `result` event. 
Anything other than +# 'success' means the agent did not complete the task (usually a limit was hit). +RESULT_SUCCESS = "success" +LIMIT_SUBTYPES = { + "error_max_turns", + "error_max_tokens", + "error_max_budget_usd", + "error_during_execution", + "error_api_error", +} + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models (Claude Code requires them).""" + if not model_name.startswith("claude-"): + raise ValueError( + f"claude-code-harness-bu-cli requires a Claude model. Got: {model_name!r}. " + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_dir(p: Path) -> None: + if p.exists(): + shutil.rmtree(p) + p.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +# ---- Browser-Use Cloud session provisioning (mirrors cch-js / cch-ab / bcode) ---- + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser(browser_name: str) -> tuple[str, str]: + """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for claude-code-harness-bu-cli: {browser_name}") + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +# ---- Claude Code invocation (identical to CCH/CCH-JS/CCH-AB except for cwd) ---- + +def _build_claude_cmd( + task_description: str, + model_name: str, + max_turns: int, + max_budget_usd: float, + use_bare: bool, +) -> list[str]: + cmd = [ + "claude", + "-p", + task_description, + "--model", + model_name, + "--dangerously-skip-permissions", + "--output-format", + "stream-json", + "--verbose", + "--max-turns", + str(max_turns), + "--max-budget-usd", + str(max_budget_usd), + "--append-system-prompt-file", + str(SYSTEM_PROMPT_FILE), + "--no-session-persistence", + ] + if use_bare: + cmd.append("--bare") + return cmd + + +def _format_assistant_block(block: dict) -> str | None: + btype = block.get("type") + if btype == "tool_use": + name = block.get("name", "?") + inp = block.get("input", {}) or {} + if name == "Bash": + return f"Bash: {(inp.get('command') or '').strip()[:2000]}" + if name in ("Edit", "Write", "Read"): + path = inp.get("file_path") or inp.get("path") or "" + return f"{name}: {path}" + try: + return f"{name}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return name + if btype == "text": + text = (block.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if btype == "thinking": + text = (block.get("thinking") or "").strip() + if not text: + return None + return f"thinking: {text[:2000]}" + return 
None + + +def _format_tool_result_block(block: dict) -> str | None: + if block.get("type") != "tool_result": + return None + content = block.get("content") + is_error = bool(block.get("is_error")) + prefix = "tool_error" if is_error else "tool_result" + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + if c.get("type") == "text": + parts.append(c.get("text", "")) + elif c.get("type") == "image": + parts.append("") + content = "\n".join(parts) + if not isinstance(content, str): + try: + content = json.dumps(content, default=str) + except Exception: + content = str(content) + content = content.strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +def _format_event_steps(event: dict) -> list[str]: + etype = event.get("type") + if etype == "assistant": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_assistant_block(block) + if s: + steps.append(s) + return steps + if etype == "user": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_tool_result_block(block) + if s: + steps.append(s) + return steps + return [] + + +def _summarize_result_event(event: dict) -> tuple[str, bool, list[str]]: + subtype = event.get("subtype") or RESULT_SUCCESS + is_error = bool(event.get("is_error")) + errors_raw = event.get("errors") or [] + errors = [str(e) for e in errors_raw] if isinstance(errors_raw, list) else [str(errors_raw)] + return subtype, is_error, errors + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + print(f"[claude-stderr] {s}", flush=True) + + +async def _close_browser_use_sessions() -> None: + """Best-effort: tell 
browser-use to shut down all daemons. + + The browser-use CLI spawns a per-session background daemon (one per + `--session` name; default is "default"). `close --all` quits every + active session so a leaked daemon does not survive across tasks on the + same runner. + """ + try: + stop_proc = await asyncio.create_subprocess_exec( + "browser-use", "close", "--all", + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await asyncio.wait_for(stop_proc.wait(), timeout=15) + except Exception: + pass + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + max_turns = int(params.get("max_turns", "100")) + max_budget_usd = float(params.get("max_budget_usd", "10")) + use_bare = params.get("use_bare", "true").lower() != "false" + + _reset_dir(SHOTS_DIR) + _reset_dir(WORK_DIR) + + # Pre-provision a remote browser; pass its WS URL to the agent via env. + # The agent runs: browser-use --cdp-url "$BU_CDP_WS" open + browser_id, cdp_ws = _start_browser(browser_name) + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + # browser-use CLI: don't auto-launch the setup wizard / installer on + # first call. The workflow already ran `browser-use install` and + # `browser-use doctor`, and we're attaching via --cdp-url so the + # local Chromium is not used to drive the page. 
+ "BROWSER_USE_SETUP_LOGGING": "false", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + result_subtype: str | None = None + result_is_error = False + result_errors: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[claude-stdout-raw] {line}", flush=True) + continue + + new_steps = _format_event_steps(event) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + if event.get("type") == "result": + final_text = event.get("result") or "" + total_cost = float(event.get("total_cost_usd") or 0.0) + result_subtype, result_is_error, result_errors = _summarize_result_event(event) + print( + f"[claude-result] subtype={result_subtype} is_error={result_is_error} " + f"cost=${total_cost:.4f} errors={result_errors}", + flush=True, + ) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[claude-runner] proc did not exit within 60s of stdout close; killing", 
flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + # Best-effort: close any browser-use daemon(s) the agent left running + # so they don't leak across tasks on the same runner. + await _close_browser_use_sessions() + _stop_browser(browser_id) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + if result_subtype is None and proc.returncode not in (0, None): + raise RuntimeError( + f"claude exited with code {proc.returncode} before emitting a result event. " + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + if result_subtype and result_subtype != RESULT_SUCCESS: + err_suffix = f" errors={result_errors}" if result_errors else "" + if answer: + final_result = f"[{result_subtype}] {answer}{err_suffix}" + else: + final_result = f"[{result_subtype}] Agent did not complete task.{err_suffix}" + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + 
task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/claude_code_harness_bu_cli/system_prompt.md b/frameworks/claude_code_harness_bu_cli/system_prompt.md new file mode 100644 index 0000000..fa9789d --- /dev/null +++ b/frameworks/claude_code_harness_bu_cli/system_prompt.md @@ -0,0 +1,25 @@ +You are evaluating a benchmark task by driving a real browser through the `browser-use` CLI from `browser-use/browser-use`. + +Hard rules: +- Use the `browser-use` CLI for every browser interaction. It is on your PATH (aliases: `bu`, `browser`, `browseruse` all work). Do NOT install other browser tools, do NOT use Playwright/Puppeteer directly, do NOT call any built-in WebFetch -- drive the live browser via `browser-use` only. +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from your environment and running: + ``` + browser-use --cdp-url "$BU_CDP_WS" open + ``` + All subsequent `browser-use ` calls automatically reuse the running daemon over the same CDP attachment -- you do NOT need to pass `--cdp-url` again, and you should NOT call `browser-use open` a second time without a URL. Just issue the next verb (`state`, `click 5`, `input 3 "text"`, `screenshot`, etc.). +- Before issuing your first interaction command, read the bundled SKILL.md so you know the full command surface, common workflows, and troubleshooting tips. It is at `~/.claude/skills/browser-use/SKILL.md`. If you have a Read tool, read that file. Otherwise: `cat ~/.claude/skills/browser-use/SKILL.md`. 
+- Standard workflow per the SKILL: (1) `browser-use --cdp-url "$BU_CDP_WS" open ` to attach + navigate, (2) `browser-use state` to see clickable elements with indices, (3) `browser-use click ` / `browser-use input "text"` to interact, (4) `browser-use state` or `browser-use screenshot` to verify, (5) repeat. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot. Pass an explicit path to `browser-use screenshot`: + ``` + browser-use screenshot /tmp/shots/step_001.png + browser-use screenshot /tmp/shots/step_002.png + ``` + Never overwrite a previous screenshot path. +- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Do not edit files outside the current working directory. +- Do not spawn or kill any browser processes; the remote Chrome is managed by the eval harness. Do not call `browser-use cloud connect` or `browser-use connect` -- the browser is already provisioned and attached via `--cdp-url`. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. 
diff --git a/frameworks/claude_code_harness_js/__init__.py b/frameworks/claude_code_harness_js/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/claude_code_harness_js/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/claude_code_harness_js/run_task.py b/frameworks/claude_code_harness_js/run_task.py new file mode 100644 index 0000000..8141989 --- /dev/null +++ b/frameworks/claude_code_harness_js/run_task.py @@ -0,0 +1,445 @@ +"""Run a single benchmark task using Claude Code driving browser-harness-js. + +This is the JavaScript-CDP variant of `claude-code-harness`. Claude Code owns the +agent loop; the agent drives a remote Chrome via the `browser-harness-js` CLI +(typed CDP wrappers exposed as a single-process bun REPL). We pre-provision a +browser-use-cloud session and pass its WebSocket CDP URL via `BU_CDP_WS`; the +agent calls `session.connect({ wsUrl: process.env.BU_CDP_WS })` to attach. + +The joint system being benchmarked is (Claude Code + browser-harness-js + Claude +model). Pin `claude_code_version` and `framework_ref` for reproducible +comparisons against the Python `claude-code-harness` framework. 
+""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "max_turns": "Max Claude Code agentic turns (default: 100)", + "max_budget_usd": "Per-task API budget cap in USD (default: 10)", + "claude_code_version": "Claude Code npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-harness-js install (e.g. fork/browser-harness-js). Consumed by the workflow install step.", + "use_bare": "Pass --bare to claude to skip hook/MCP/plugin auto-discovery (true/false, default: true)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +WORK_DIR = Path("/tmp/cch-js-workdir") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# Subtypes Claude Code emits in the terminal `result` event. Anything other than +# 'success' means the agent did not complete the task (usually a limit was hit). +RESULT_SUCCESS = "success" +LIMIT_SUBTYPES = { + "error_max_turns", + "error_max_tokens", + "error_max_budget_usd", + "error_during_execution", + "error_api_error", +} + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models (Claude Code requires them).""" + if not model_name.startswith("claude-"): + raise ValueError( + f"claude-code-harness-js requires a Claude model. Got: {model_name!r}. 
" + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_dir(p: Path) -> None: + if p.exists(): + shutil.rmtree(p) + p.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +# ---- Browser-Use Cloud session provisioning (mirrors bcode runner) ---- + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={"X-Browser-Use-API-Key": _bu_api_key(), "Content-Type": "application/json"}, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser(browser_name: str) -> tuple[str, str]: + """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for claude-code-harness-js: {browser_name}") + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +# ---- Claude Code invocation (identical to CCH except for cwd) ---- + +def _build_claude_cmd( + task_description: str, + model_name: str, + max_turns: int, + max_budget_usd: float, + use_bare: bool, +) -> list[str]: + cmd = [ + "claude", + "-p", + task_description, + "--model", + model_name, + "--dangerously-skip-permissions", + "--output-format", + "stream-json", + "--verbose", + "--max-turns", + str(max_turns), + "--max-budget-usd", + str(max_budget_usd), + "--append-system-prompt-file", + str(SYSTEM_PROMPT_FILE), + "--no-session-persistence", + ] + if use_bare: + cmd.append("--bare") + return cmd + + +def _format_assistant_block(block: dict) -> str | None: + btype = block.get("type") + if btype == "tool_use": + name = block.get("name", "?") + inp = block.get("input", {}) or {} + if name == "Bash": + return f"Bash: {(inp.get('command') or '').strip()[:2000]}" + if name in ("Edit", "Write", "Read"): + path = inp.get("file_path") or inp.get("path") or "" + return f"{name}: {path}" + try: + return f"{name}: {json.dumps(inp, separators=(',', ':'))[:2000]}" + except Exception: + return name + if btype == "text": + text = (block.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if btype == "thinking": + text = (block.get("thinking") or "").strip() + if not text: + return None + return f"thinking: {text[:2000]}" + return None + + +def 
_format_tool_result_block(block: dict) -> str | None: + if block.get("type") != "tool_result": + return None + content = block.get("content") + is_error = bool(block.get("is_error")) + prefix = "tool_error" if is_error else "tool_result" + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + if c.get("type") == "text": + parts.append(c.get("text", "")) + elif c.get("type") == "image": + parts.append("") + content = "\n".join(parts) + if not isinstance(content, str): + try: + content = json.dumps(content, default=str) + except Exception: + content = str(content) + content = content.strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +def _format_event_steps(event: dict) -> list[str]: + etype = event.get("type") + if etype == "assistant": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_assistant_block(block) + if s: + steps.append(s) + return steps + if etype == "user": + msg = event.get("message", {}) or {} + steps = [] + for block in msg.get("content", []) or []: + s = _format_tool_result_block(block) + if s: + steps.append(s) + return steps + return [] + + +def _summarize_result_event(event: dict) -> tuple[str, bool, list[str]]: + subtype = event.get("subtype") or RESULT_SUCCESS + is_error = bool(event.get("is_error")) + errors_raw = event.get("errors") or [] + errors = [str(e) for e in errors_raw] if isinstance(errors_raw, list) else [str(errors_raw)] + return subtype, is_error, errors + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + print(f"[claude-stderr] {s}", flush=True) + + +async def execute(task_description: str) -> ExecutionResult: + params = 
validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + max_turns = int(params.get("max_turns", "100")) + max_budget_usd = float(params.get("max_budget_usd", "10")) + use_bare = params.get("use_bare", "true").lower() != "false" + + _reset_dir(SHOTS_DIR) + _reset_dir(WORK_DIR) + + # Pre-provision a remote browser; pass its WS URL to the agent via env. + # The agent connects with `session.connect({ wsUrl: process.env.BU_CDP_WS })`. + browser_id, cdp_ws = _start_browser(browser_name) + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "DISABLE_TELEMETRY": "1", + "DISABLE_AUTOUPDATER": "1", + "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", + # browser-harness-js auto-installs bun on first run if missing; we + # pre-installed bun in the workflow, so opt out of any check-in noise. + "BROWSER_HARNESS_SKIP_BUN_INSTALL": "1", + } + + cmd = _build_claude_cmd( + task_description, model_name, max_turns, max_budget_usd, use_bare + ) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + result_subtype: str | None = None + result_is_error = False + result_errors: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + 
continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[claude-stdout-raw] {line}", flush=True) + continue + + new_steps = _format_event_steps(event) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + if event.get("type") == "result": + final_text = event.get("result") or "" + total_cost = float(event.get("total_cost_usd") or 0.0) + result_subtype, result_is_error, result_errors = _summarize_result_event(event) + print( + f"[claude-result] subtype={result_subtype} is_error={result_is_error} " + f"cost=${total_cost:.4f} errors={result_errors}", + flush=True, + ) + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[claude-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + # Best-effort: stop the bun REPL server so it doesn't leak across tasks. + try: + stop_proc = await asyncio.create_subprocess_exec( + "browser-harness-js", "--stop", + stdout=asyncio.subprocess.DEVNULL, + stderr=asyncio.subprocess.DEVNULL, + ) + await asyncio.wait_for(stop_proc.wait(), timeout=10) + except Exception: + pass + _stop_browser(browser_id) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + if result_subtype is None and proc.returncode not in (0, None): + raise RuntimeError( + f"claude exited with code {proc.returncode} before emitting a result event. 
" + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + if result_subtype and result_subtype != RESULT_SUCCESS: + err_suffix = f" errors={result_errors}" if result_errors else "" + if answer: + final_result = f"[{result_subtype}] {answer}{err_suffix}" + else: + final_result = f"[{result_subtype}] Agent did not complete task.{err_suffix}" + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/claude_code_harness_js/system_prompt.md b/frameworks/claude_code_harness_js/system_prompt.md new file mode 100644 index 0000000..b5d375d --- /dev/null +++ b/frameworks/claude_code_harness_js/system_prompt.md @@ -0,0 +1,21 @@ +You are evaluating a benchmark task by driving a real browser through the browser-harness-js CDP skill. + +Hard rules: +- Use the harness. Read `SKILL.md` first (under `~/.claude/skills/cdp/SKILL.md`). Drive the browser by running `browser-harness-js ''` on the shell, or by piping multi-line snippets via heredoc. 
Do not install other browser tools, do not use Playwright directly, do not open a different repo. +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from the environment and calling `await session.connect({ wsUrl: process.env.BU_CDP_WS })`. Do NOT call `session.connect()` with no arguments (no local Chrome to auto-detect). Do NOT spawn or kill any browser processes. +- After connecting, list page targets with `await listPageTargets()` and call `await session.use(targetInfo.targetId)` to bind to a tab before issuing Page/DOM/Runtime/Network calls. Globals (`session`, `globalThis.*`) persist across `browser-harness-js` invocations because the CLI auto-spawns a single long-lived bun server. Reuse them. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot. Decode the base64 returned by `Page.captureScreenshot` and write it to disk yourself; never overwrite a previous screenshot path. Example: + ``` + browser-harness-js <<'JS' + const { data } = await session.Page.captureScreenshot({ format: 'png' }); + require('fs').writeFileSync('/tmp/shots/step_001.png', Buffer.from(data, 'base64')); + return 'ok'; + JS + ``` +- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Do not edit files outside the current working directory. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. 
diff --git a/frameworks/claude_cua/__init__.py b/frameworks/claude_cua/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/claude_cua/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/claude_cua/run_task.py b/frameworks/claude_cua/run_task.py new file mode 100644 index 0000000..737d231 --- /dev/null +++ b/frameworks/claude_cua/run_task.py @@ -0,0 +1,97 @@ +"""Run a single benchmark task using Claude Computer Use Agent. + +CUA controls its own desktop environment (Xvfb + browser). The browser parameter +is meaningless for this framework -- it uses "integrated" as a placeholder. + +The agent loop: +1. Launch Xvfb virtual display + browser +2. Send task to Claude with the computer tool +3. Loop: Claude emits actions -> execute on desktop -> screenshot -> send back +4. Collect steps and final result into ExecutionResult +""" + +import asyncio +import os +import sys +import time +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = {} + + +async def execute(task_description: str) -> ExecutionResult: + """Run the Claude CUA agent loop on a task. + + TODO: Implement the full CUA agent loop: + 1. Start Xvfb + browser via subprocess + 2. Take initial screenshot + 3. Send to Anthropic Messages API with computer_20251124 tool + 4. Loop: parse tool_use blocks, execute actions, screenshot, send tool_result + 5. 
Collect all steps and final text response + """ + start = time.time() + + # import anthropic + # client = anthropic.Anthropic() + # + # tools = [{"type": "computer_20251124", "name": "computer", + # "display_width_px": 1920, "display_height_px": 1080}] + # messages = [{"role": "user", "content": task_description}] + # + # steps = [] + # screenshots_b64 = [] + # for _ in range(50): # max iterations + # response = client.beta.messages.create( + # model="claude-sonnet-4-20250514", max_tokens=4096, + # tools=tools, messages=messages, betas=["computer-use-2025-11-24"], + # ) + # ... execute actions, collect screenshots, break on end_turn ... + + duration = time.time() - start + + return ExecutionResult( + final_result="NOT IMPLEMENTED", + steps=[], + screenshots_b64=[], + num_steps=0, + duration_seconds=duration, + cost=0, + ) + + +async def main(): + validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/codex_harness/__init__.py b/frameworks/codex_harness/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/codex_harness/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/codex_harness/run_task.py b/frameworks/codex_harness/run_task.py new file mode 100644 index 0000000..5838e3c --- /dev/null +++ b/frameworks/codex_harness/run_task.py @@ -0,0 +1,489 @@ +"""Run a single benchmark task using OpenAI Codex CLI driving browser-harness. 
+ +This framework wraps OpenAI's Codex CLI (the coding agent) around the +browser-harness repo: Codex owns the agent loop, we hand it a task and a +workdir pre-loaded with the harness + a live browser daemon, then stream-parse +its `codex exec --json` JSONL output. + +The joint system being benchmarked is (Codex CLI + browser-harness + OpenAI +model). Pin `codex_version` and `framework_ref` for reproducible comparisons. + +Mirrors `frameworks/claude_code_harness/run_task.py` -- same browser +provisioning (admin.start_remote_daemon under BU_NAME), same /tmp/shots +screenshot drain, same FINAL ANSWER convention -- swapping out the agent CLI +and its event schema. + +Codex JSON event schema (see https://developers.openai.com/codex/noninteractive): +- `thread.started` {thread_id} +- `turn.started` +- `turn.completed` {usage: {input_tokens, cached_input_tokens, output_tokens, + reasoning_output_tokens}} +- `turn.failed` {error: {...}} +- `item.started` {item: {id, type, ...}} +- `item.updated` {item: {...}} +- `item.completed` {item: {id, type, ...}} + item.type in {agent_message, reasoning, command_execution, file_change, + mcp_tool_call, web_search, plan_update, todo_list, ...} +- `error` {message} + +Codex does NOT emit a per-turn cost field; we compute cost from token counts +via a small static price map (see _MODEL_PRICES). Models not in the map +report cost=0. +""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +# Harness is installed via `uv pip install /tmp/browser-harness` in the workflow, +# which exposes `admin`, `helpers`, `run`, `daemon` as top-level modules. 
+HARNESS_DIR = "/tmp/browser-harness" + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "codex_version": "Codex CLI npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-harness install (e.g. fork/browser-harness). Consumed by the workflow install step.", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", + "sandbox": "Codex sandbox policy (read-only | workspace-write | danger-full-access; default: danger-full-access).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + +# USD/token prices for cost estimation. Codex does not surface per-turn cost; +# we compute total_cost = input * input_price + output * output_price. Cached +# input tokens are charged at the cached rate when known, otherwise full rate. +# Update as OpenAI publishes new prices. Models absent here report cost=0. +_MODEL_PRICES: dict[str, dict[str, float]] = { + "gpt-5": {"input": 1.25e-6, "cached_input": 0.125e-6, "output": 10e-6}, +} + + +def _model_price(model_name: str) -> dict[str, float] | None: + if model_name in _MODEL_PRICES: + return _MODEL_PRICES[model_name] + # Best-effort: strip common dated suffixes. 
+ for key in _MODEL_PRICES: + if model_name.startswith(key): + return _MODEL_PRICES[key] + return None + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _start_browser(browser_name: str, bu_name: str) -> dict: + """Provision a browser for the harness to attach to.""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for codex-harness: {browser_name}") + sys.path.insert(0, HARNESS_DIR) + from admin import start_remote_daemon # type: ignore + + return start_remote_daemon(name=bu_name) + + +def _stop_browser(browser_name: str, bu_name: str) -> None: + try: + sys.path.insert(0, HARNESS_DIR) + from admin import stop_remote_daemon # type: ignore + + if browser_name == "browser-use-cloud": + stop_remote_daemon(name=bu_name) + except Exception as e: + print(f"Warning: failed to stop harness daemon: {e}") + + +def _build_codex_cmd(model_name: str, sandbox: str) -> list[str]: + """Build the `codex exec` command. Prompt is passed via stdin. + + Notes on flags: + - `--ask-for-approval` is NOT accepted at `codex exec` level in published + builds (despite docs suggesting global flags propagate). `codex exec` + is non-interactive by construction; no approval gating runs anyway. + - `--sandbox`, `--skip-git-repo-check`, `--ignore-user-config`, and + `--json` are exec-level flags. + - The prompt comes on stdin via `-` so we don't have to worry about + shell-escaping multi-MB prompts. 
+ """ + return [ + "codex", + "exec", + "--json", + "--model", + model_name, + "--sandbox", + sandbox, + "--skip-git-repo-check", + "--ignore-user-config", + "-", # read prompt from stdin + ] + + +def _format_item(item: dict) -> str | None: + """Turn a single Codex `item.completed` payload into a step string.""" + itype = item.get("type") + if itype == "agent_message": + text = (item.get("text") or "").strip() + if not text: + return None + return f"text: {text[:2000]}" + if itype == "reasoning": + # Codex emits a short summary; can also be in a `summary` array. + text = (item.get("text") or item.get("summary") or "").strip() if isinstance( + item.get("text") or item.get("summary"), str + ) else "" + if not text: + # Sometimes reasoning has a list of summary strings. + summary = item.get("summary") + if isinstance(summary, list): + text = " ".join(s for s in summary if isinstance(s, str)).strip() + if not text: + return "reasoning" + return f"reasoning: {text[:2000]}" + if itype == "command_execution": + cmd = item.get("command") or "" + if isinstance(cmd, list): + cmd = " ".join(str(c) for c in cmd) + cmd = (cmd or "").strip() + status = item.get("status") or "" + exit_code = item.get("exit_code") + out = item.get("aggregated_output") or item.get("output") or "" + if isinstance(out, dict): + out = out.get("text") or json.dumps(out, default=str) + out = (out or "").strip() + # Two-step: emit command itself, then result tag for visibility. + # We compact into a single step entry so step counts stay reasonable. 
+ head = f"Bash: {cmd[:1500]}" + tail = "" + if status and status != "completed": + tail = f" [{status}]" + if exit_code is not None and exit_code != 0: + tail += f" exit={exit_code}" + if out: + tail += f"\n-> {out[:500]}" + return (head + tail)[:2000] + if itype == "file_change": + path = item.get("path") or "" + action = item.get("action") or "write" + return f"{action}: {path}" + if itype == "mcp_tool_call": + name = item.get("name") or item.get("tool") or "mcp" + args = item.get("arguments") or item.get("input") or {} + try: + blob = json.dumps(args, separators=(",", ":"))[:1500] + except Exception: + blob = str(args)[:1500] + return f"mcp:{name}: {blob}" + if itype == "web_search": + q = item.get("query") or "" + return f"web_search: {q[:500]}" + if itype == "plan_update" or itype == "todo_list": + try: + return f"{itype}: {json.dumps(item, default=str)[:1500]}" + except Exception: + return itype + if itype: + # Unknown but well-formed item type -- keep a breadcrumb. + try: + return f"{itype}: {json.dumps(item, default=str)[:1500]}" + except Exception: + return itype + return None + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + """Read stderr line-by-line, echo to our stdout, and buffer for later reporting.""" + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + # Surface to GitHub Actions log in real time. + print(f"[codex-stderr] {s}", flush=True) + + +def _compose_prompt(task_description: str) -> str: + """Combine our system prompt with the task. Codex CLI doesn't have + `--append-system-prompt-file`; we prepend the system prompt to the user + prompt instead. 
Codex also auto-loads `AGENTS.md` from the workdir, but + we put the rules in the prompt to be explicit + version-pinned.""" + system = SYSTEM_PROMPT_FILE.read_text() + return f"{system}\n\n---\n\nTask:\n{task_description}\n" + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = os.environ["MODEL"] + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + task_index = os.environ.get("TASK_INDEX", "0") + sandbox = params.get("sandbox", "danger-full-access") + # task_timeout is consumed in main() before run_and_judge wraps execute. + + bu_name = f"eval-{task_index}" + _reset_shots_dir() + + # Pre-provision the browser so Codex starts with a live CDP attach. + _start_browser(browser_name, bu_name) + + # Codex CLI auth: `codex exec` reuses saved auth (~/.codex/auth.json) by + # default but accepts `CODEX_API_KEY` env explicitly (the only auth env + # supported by `codex exec` per docs). `OPENAI_API_KEY` alone is NOT read + # by codex (it's for the OpenAI Python SDK). We mirror the workflow's + # OPENAI_API_KEY into CODEX_API_KEY here so the same repo secret unlocks + # both bcode (which uses OPENAI_API_KEY directly) and codex-harness. + # + # PATH: `uv pip install /tmp/browser-harness` puts the `browser-harness` + # console_script at /tmp/browser-harness/.venv/bin/browser-harness, but + # codex subprocess doesn't inherit the `uv run` PATH boost. Smoke #4 + # showed the agent self-recovered by prepending the venv dir, but that + # cost ~4 steps. Prepend explicitly so the bare `browser-harness` heredoc + # in our system prompt + SKILL.md works on the first try. 
+ harness_venv_bin = f"{HARNESS_DIR}/.venv/bin" + existing_path = os.environ.get("PATH", "") + env = { + **os.environ, + "BU_NAME": bu_name, + "CODEX_API_KEY": os.environ.get("CODEX_API_KEY") or os.environ.get("OPENAI_API_KEY", ""), + "PATH": f"{harness_venv_bin}:{existing_path}" if existing_path else harness_venv_bin, + } + + cmd = _build_codex_cmd(model_name, sandbox) + prompt = _compose_prompt(task_description) + + start = time.time() + steps: list[str] = [] + final_text = "" + total_input_tokens = 0 + total_cached_input_tokens = 0 + total_output_tokens = 0 + total_reasoning_tokens = 0 + turn_failed_error: str | None = None + error_events: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap on line buffer + ) + + # Pipe the prompt in on stdin and close. + assert proc.stdin is not None + proc.stdin.write(prompt.encode("utf-8")) + await proc.stdin.drain() + proc.stdin.close() + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + """Yield one JSONL line at a time. 
Codex item.completed payloads for + command_execution events can include large aggregated_output blobs -- + read raw chunks and split on newlines, no per-line cap.""" + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[codex-stdout-raw] {line}", flush=True) + continue + + etype = event.get("type") + if etype == "item.completed": + item = event.get("item") or {} + s = _format_item(item) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + # Track latest agent_message for FINAL ANSWER extraction. 
+ if item.get("type") == "agent_message": + text = (item.get("text") or "").strip() + if text: + final_text = text + elif etype == "turn.completed": + usage = event.get("usage") or {} + total_input_tokens += int(usage.get("input_tokens") or 0) + total_cached_input_tokens += int(usage.get("cached_input_tokens") or 0) + total_output_tokens += int(usage.get("output_tokens") or 0) + total_reasoning_tokens += int(usage.get("reasoning_output_tokens") or 0) + print( + f"[codex-turn] in={usage.get('input_tokens')} " + f"cached={usage.get('cached_input_tokens')} " + f"out={usage.get('output_tokens')} " + f"reasoning={usage.get('reasoning_output_tokens')}", + flush=True, + ) + elif etype == "turn.failed": + err = event.get("error") or {} + turn_failed_error = json.dumps(err, default=str)[:500] + print(f"[codex-turn-failed] {turn_failed_error}", flush=True) + elif etype == "error": + msg = event.get("message") or json.dumps(event, default=str) + error_events.append(str(msg)[:500]) + print(f"[codex-error] {msg}", flush=True) + elif etype == "thread.started": + tid = event.get("thread_id") + print(f"[codex-thread] {tid}", flush=True) + + # Wait for the process to exit cleanly. + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[codex-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_name, bu_name) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + # Cost estimate from token counts. + prices = _model_price(model_name) + if prices: + # Non-cached input is total_input_tokens - cached_input_tokens. 
+ non_cached = max(0, total_input_tokens - total_cached_input_tokens) + cost = ( + non_cached * prices["input"] + + total_cached_input_tokens * prices.get("cached_input", prices["input"]) + + total_output_tokens * prices["output"] + ) + else: + cost = 0.0 + + # Determine final result text. Prefer the FINAL ANSWER line. + match = FINAL_ANSWER_RE.search(final_text or "") + answer = match.group(1).strip() if match else (final_text.strip() or "") + + # Failure precedence: turn.failed > error events > no agent_message > clean. + if turn_failed_error and not answer: + final_result = f"[codex_turn_failed] {turn_failed_error}" + elif error_events and not answer: + final_result = f"[codex_error] {error_events[-1]}" + elif not final_text: + if proc.returncode not in (0, None): + raise RuntimeError( + f"codex exited with code {proc.returncode} and emitted no agent_message. " + f"steps_captured={len(steps)} duration={duration:.1f}s " + f"stderr_tail:\n{stderr_tail[-2000:]}" + ) + final_result = "Agent did not emit any output" + else: + final_result = answer or final_text.strip() + # If FINAL ANSWER missing but we had output, surface as fallback. + if not match: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + # Propagate task_timeout param to run_and_judge before it wraps execute(). 
+ early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/codex_harness/system_prompt.md b/frameworks/codex_harness/system_prompt.md new file mode 100644 index 0000000..7908e81 --- /dev/null +++ b/frameworks/codex_harness/system_prompt.md @@ -0,0 +1,15 @@ +You are evaluating a benchmark task by driving a real browser through the browser-harness in the current working directory. + +Hard rules: +- Use the harness. Read `SKILL.md` and `helpers.py` first. Drive the browser via `browser-harness <<'PY' ... PY` heredocs -- do not install other browser tools, do not use Playwright directly, do not open a different repo. +- The `browser-harness` CLI lives in the workdir venv at `./.venv/bin/browser-harness`. The shell's `PATH` already includes that directory (prepended by the runner). If you ever get `browser-harness: command not found`, you can also invoke it directly as `./.venv/bin/browser-harness <<'PY' ... PY` or run it via `uv run browser-harness <<'PY' ... PY` from the workdir. +- A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. +- Save every screenshot to `/tmp/shots/step_<N>.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. +- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed.
+- Do not edit files outside the current working directory. +- Work fully autonomously. Do not stop early to summarize partial progress -- keep driving the browser until the task is genuinely complete (or you have hit a dead end). When you reach an answer, deliver it in the format below and exit. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: <answer> + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. diff --git a/frameworks/pi_harness/__init__.py b/frameworks/pi_harness/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/pi_harness/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/pi_harness/run_task.py b/frameworks/pi_harness/run_task.py new file mode 100644 index 0000000..785a27f --- /dev/null +++ b/frameworks/pi_harness/run_task.py @@ -0,0 +1,447 @@ +"""Run a single benchmark task using pi (the @earendil-works/pi-coding-agent CLI) +driving browser-harness. + +This framework is a near-mirror of `claude_code_harness`, except the coding +agent is `pi` instead of Claude Code. The browser side is unchanged: we still +pre-provision a live browser-harness daemon (Python `admin.start_remote_daemon`) +and let the agent drive it via `browser-harness <<'PY' ... PY` heredocs. + +Joint system being benchmarked: (pi + browser-harness + Claude model). +Restricted to Claude models, mirroring CCH. Pin `pi_version` and +`framework_ref` for reproducible comparisons. + +Pi event-stream notes (`pi --mode json`): +- First line is a `session` header (`{"type":"session",...}`). +- `tool_execution_start` / `tool_execution_end` carry tool calls + results.
+- `message_end` carries finished assistant messages with `content` blocks + (text/thinking) -- same shape as Claude's content-block list. +- There is no terminal `result` event with `total_cost_usd`. We therefore + collect cost per-turn from `turn_end.message.usage` if present and 0 otherwise. +- `agent_end` is the terminal lifecycle event. +""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +# Harness is installed via `uv pip install /tmp/browser-harness` in the workflow, +# which exposes `admin`, `helpers`, `run`, `daemon` as top-level modules. +HARNESS_DIR = "/tmp/browser-harness" + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "pi_version": "pi (@earendil-works/pi-coding-agent) npm version; consumed by the workflow install step (default: latest)", + "framework_repo": "Override GitHub repo for browser-harness install (e.g. fork/browser-harness). Consumed by the workflow install step.", + "thinking": "pi thinking level: off|minimal|low|medium|high|xhigh (default: high)", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) + + +def _require_claude_model(model_name: str) -> str: + """This framework only supports Claude models, mirroring CCH.""" + if not model_name.startswith("claude-"): + raise ValueError( + f"pi-harness requires a Claude model. Got: {model_name!r}. 
" + f"Supported model aliases start with 'claude-' (see models.py)." + ) + return model_name + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + """Read every PNG written to /tmp/shots in step order as base64.""" + if not SHOTS_DIR.exists(): + return [] + paths = sorted(p for p in SHOTS_DIR.glob("*.png") if p.is_file()) + return [base64.b64encode(p.read_bytes()).decode() for p in paths] + + +def _start_browser(browser_name: str, bu_name: str) -> dict: + """Provision a browser for the harness to attach to. Returns the cloud browser dict.""" + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for pi-harness: {browser_name}") + sys.path.insert(0, HARNESS_DIR) + from admin import start_remote_daemon # type: ignore + + return start_remote_daemon(name=bu_name) + + +def _stop_browser(browser_name: str, bu_name: str) -> None: + try: + sys.path.insert(0, HARNESS_DIR) + from admin import stop_remote_daemon # type: ignore + + if browser_name == "browser-use-cloud": + stop_remote_daemon(name=bu_name) + except Exception as e: + print(f"Warning: failed to stop harness daemon: {e}") + + +def _build_pi_cmd( + task_description: str, + model_name: str, + thinking: str, + system_prompt: str, +) -> list[str]: + cmd = [ + "pi", + "--mode", + "json", + "--provider", + "anthropic", + "--model", + model_name, + "--thinking", + thinking, + "--no-session", + "--no-context-files", + "--no-extensions", + "--no-skills", + "--no-prompt-templates", + "--no-themes", + "--offline", + "--append-system-prompt", + system_prompt, + task_description, + ] + return cmd + + +def _stringify_content(content) -> str: + """Flatten a content-block list (or string) to a single string.""" + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + t = c.get("type") + if t == "text": + 
parts.append(c.get("text", "")) + elif t == "thinking": + parts.append(c.get("thinking", "")) + elif t == "image": + parts.append("") + else: + try: + parts.append(json.dumps(c, separators=(",", ":"))) + except Exception: + parts.append(str(c)) + else: + parts.append(str(c)) + return "\n".join(parts) + try: + return json.dumps(content, default=str) + except Exception: + return str(content) + + +def _format_assistant_message(message: dict) -> list[str]: + """Turn an assistant message_end's content blocks into step strings. + + Tool-use blocks are skipped here (they are emitted as `tool_execution_*` + events separately) so we don't double-count them. + """ + steps: list[str] = [] + content = message.get("content", []) or [] + if not isinstance(content, list): + return steps + for block in content: + if not isinstance(block, dict): + continue + btype = block.get("type") + if btype == "text": + text = (block.get("text") or "").strip() + if text: + steps.append(f"text: {text[:2000]}") + elif btype == "thinking": + text = (block.get("thinking") or "").strip() + if text: + steps.append(f"thinking: {text[:2000]}") + # tool_use blocks are handled by tool_execution_* events. 
+ return steps + + +def _format_tool_call(tool_name: str, args) -> str: + """Format a tool_execution_start event into a step string (matching CCH).""" + if not isinstance(args, dict): + try: + return f"{tool_name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return tool_name + if tool_name == "bash": + return f"Bash: {(args.get('command') or '').strip()[:2000]}" + if tool_name in ("edit", "write", "read"): + path = args.get("file_path") or args.get("path") or "" + return f"{tool_name.capitalize()}: {path}" + try: + return f"{tool_name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return tool_name + + +def _format_tool_result(tool_name: str, result, is_error: bool) -> str | None: + """Format a tool_execution_end event into a step string.""" + prefix = "tool_error" if is_error else "tool_result" + content = _stringify_content(result).strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + """Read stderr line-by-line, echo to our stdout, and buffer for later reporting.""" + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + # Surface to GitHub Actions log in real time. + print(f"[pi-stderr] {s}", flush=True) + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + task_index = os.environ.get("TASK_INDEX", "0") + thinking = params.get("thinking", "high") + # task_timeout is consumed in main() before run_and_judge wraps execute. 
+ + bu_name = f"eval-{task_index}" + _reset_shots_dir() + + # Pre-provision the browser so pi starts with a live CDP attach. + _start_browser(browser_name, bu_name) + + env = { + **os.environ, + "BU_NAME": bu_name, + "DISABLE_TELEMETRY": "1", + # Pi-specific: skip startup network ops so a flaky pi.dev doesn't + # block the run, and disable install/update telemetry. + "PI_OFFLINE": "1", + "PI_SKIP_VERSION_CHECK": "1", + "PI_TELEMETRY": "0", + } + + system_prompt = SYSTEM_PROMPT_FILE.read_text() + cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + + start = time.time() + steps: list[str] = [] + last_assistant_text = "" + total_cost = 0.0 + saw_agent_end = False + stderr_buf: list[str] = [] + + # pi stream-json lines can be huge (tool results with full page HTML/text). + # Same workaround as CCH: read raw chunks and split on newlines. + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + """Yield one stream-json line at a time, regardless of line length.""" + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 # 64 KiB + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[pi-stdout-raw] {line[:500]}", flush=True) + continue + + etype = event.get("type") + + if etype == "tool_execution_start": + 
s = _format_tool_call(event.get("toolName") or "?", event.get("args")) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + elif etype == "tool_execution_end": + s = _format_tool_result( + event.get("toolName") or "?", + event.get("result"), + bool(event.get("isError")), + ) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + elif etype == "message_end": + msg = event.get("message", {}) or {} + if msg.get("role") == "assistant": + new_steps = _format_assistant_message(msg) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + # Track latest assistant text for FINAL ANSWER extraction. + txt = _stringify_content(msg.get("content")) + if txt: + last_assistant_text = txt + + elif etype == "turn_end": + # Some pi providers carry usage on the final assistant message. + msg = event.get("message", {}) or {} + usage = msg.get("usage") or {} + cost = usage.get("cost") or usage.get("total_cost") or usage.get("total_cost_usd") + if isinstance(cost, (int, float)): + total_cost += float(cost) + + elif etype == "agent_end": + saw_agent_end = True + # Final fallback: scan the full message list for the last + # assistant message in case message_end was missed. 
+ msgs = event.get("messages", []) or [] + for m in reversed(msgs): + if isinstance(m, dict) and m.get("role") == "assistant": + txt = _stringify_content(m.get("content")) + if txt: + last_assistant_text = last_assistant_text or txt + break + + # Wait for the process (stdout closed implies near-exit) + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[pi-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_name, bu_name) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + # Hard error: pi exited non-zero AND we never saw agent_end. + if not saw_agent_end and proc.returncode not in (0, None): + raise RuntimeError( + f"pi exited with code {proc.returncode} before emitting agent_end. " + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + # Extract FINAL ANSWER from the last assistant text. + match = FINAL_ANSWER_RE.search(last_assistant_text or "") + answer = match.group(1).strip() if match else (last_assistant_text.strip() or "") + + if not saw_agent_end: + # Soft error: pi exited 0 but never emitted agent_end. Surface but keep data. + final_result = f"[pi_no_agent_end] {answer}" if answer else "[pi_no_agent_end] Agent did not complete task." 
+ else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + # Propagate task_timeout param to run_and_judge before it wraps execute(). + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/pi_harness/system_prompt.md b/frameworks/pi_harness/system_prompt.md new file mode 100644 index 0000000..d1566de --- /dev/null +++ b/frameworks/pi_harness/system_prompt.md @@ -0,0 +1,13 @@ +You are evaluating a benchmark task by driving a real browser through the browser-harness in the current working directory. + +Hard rules: +- Use the harness. Read `SKILL.md` and `helpers.py` first. Drive the browser via `browser-harness <<'PY' ... PY` heredocs -- do not install other browser tools, do not use Playwright directly, do not open a different repo. +- A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. 
+- Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. +- Do not edit files outside the current working directory. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. diff --git a/frameworks/pibt/__init__.py b/frameworks/pibt/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/pibt/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/pibt/run_task.py b/frameworks/pibt/run_task.py new file mode 100644 index 0000000..d8fd61b --- /dev/null +++ b/frameworks/pibt/run_task.py @@ -0,0 +1,466 @@ +"""Run a single benchmark task using PIBT (pi browser terminal). + +PIBT = pi (the @earendil-works/pi-coding-agent CLI) + the +`pi-agent-extensions` package, which provides built-in browser tools via a +vendored `browser-harness-js` (CDP-based). No external Python harness, no +heredocs -- pi drives the browser through its own `cdp_*` tool surface. + +Joint system being benchmarked: (pi + pi-agent-extensions + Claude model). +Restricted to Claude models, mirroring CCH/PIH. + +Browser wiring: we pre-allocate a `browser-use-cloud` session via the v3 API +(same path as bcode/cch-js), resolve the CDP WebSocket URL, and pass it as +`BU_CDP_WS` in the pi subprocess env. The system prompt instructs the agent +to call `cdp_connect({ wsUrl: process.env.BU_CDP_WS })` once at the start. + +Pi event-stream parsing follows PIH (`tool_execution_start/end`, `message_end`, +`turn_end.message.usage`, `agent_end`). 
+""" + +import asyncio +import base64 +import json +import os +import re +import shutil +import sys +import time +import urllib.request +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = { + "pi_version": "pi (@earendil-works/pi-coding-agent) npm version; consumed by the workflow install step (default: latest).", + "framework_repo": "Override GitHub repo for pi-agent-extensions install (e.g. fork/pi-agent-extensions). Consumed by the workflow install step.", + "thinking": "pi thinking level: off|minimal|low|medium|high|xhigh (default: high).", + "task_timeout": "Per-task wall-clock timeout in seconds, sets TASK_TIMEOUT for run_and_judge (default: 1800).", +} + +SYSTEM_PROMPT_FILE = Path(__file__).resolve().parent / "system_prompt.md" +SHOTS_DIR = Path("/tmp/shots") +FINAL_ANSWER_RE = re.compile(r"FINAL ANSWER:\s*(.+?)\s*$", re.MULTILINE) +# pi-agent-extensions are installed by the workflow install step; the runner +# uses the pi `--extensions ` flag (or default loader) to pick them up. +EXTENSIONS_DIR = "/tmp/pi-agent-extensions" +# Tools we surface to pi. Includes the cdp_* tools from pi-agent-extensions +# plus a minimal builtin set (bash for shots dir mgmt, read/write for general +# scaffolding). Subagent tools are intentionally omitted. +PIBT_TOOLS = "bash,read,write,cdp_connect,cdp_eval,cdp_status,cdp_targets,cdp_use_target" + + +def _require_claude_model(model_name: str) -> str: + if not model_name.startswith("claude-"): + raise ValueError( + f"pibt requires a Claude model. Got: {model_name!r}. " + f"Supported model aliases start with 'claude-' (see models.py)." 
+ ) + return model_name + + +def _bu_api_base() -> str: + base = os.environ.get("BU_CLOUD_API_BASE", "https://api.browser-use.com").rstrip("/") + version = os.environ.get("BU_CLOUD_API_VERSION", "v3") + return f"{base}/api/{version}" + + +def _bu_api_key() -> str: + return os.environ.get("BU_CLOUD_API_KEY") or os.environ["BROWSER_USE_API_KEY"] + + +def _bu(path: str, method: str, body: dict | None = None) -> dict: + req = urllib.request.Request( + f"{_bu_api_base()}{path}", + method=method, + data=(json.dumps(body).encode() if body is not None else None), + headers={ + "X-Browser-Use-API-Key": _bu_api_key(), + "Content-Type": "application/json", + }, + ) + return json.loads(urllib.request.urlopen(req, timeout=90).read() or b"{}") + + +def _start_browser() -> tuple[str, str]: + """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" + info = _bu("/browsers", "POST", {}) + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return info["id"], cdp_ws + + +def _stop_browser(browser_id: str | None) -> None: + if not browser_id: + return + try: + _bu(f"/browsers/{browser_id}", "PATCH", {"action": "stop"}) + except Exception as e: + print(f"Warning: failed to stop browser {browser_id}: {e}") + + +def _reset_shots_dir() -> None: + if SHOTS_DIR.exists(): + shutil.rmtree(SHOTS_DIR) + SHOTS_DIR.mkdir(parents=True) + + +def _collect_screenshots() -> list[str]: + if not SHOTS_DIR.exists(): + return [] + return [ + base64.b64encode(p.read_bytes()).decode() + for p in sorted(SHOTS_DIR.glob("*.png")) + if p.is_file() + ] + + +def _build_pi_cmd( + task_description: str, + model_name: str, + thinking: str, + system_prompt: str, +) -> list[str]: + # NOTE: NOT --no-extensions (we want pi-agent-extensions to load). + # Still pass --no-context-files / --no-skills / --no-prompt-templates / + # --no-themes / --no-session for hermeticity. --offline disables the + # update-check network call. 
--tools restricts the model to the + # cdp_* surface plus minimal scaffolding. + cmd = [ + "pi", + "--mode", + "json", + "--provider", + "anthropic", + "--model", + model_name, + "--thinking", + thinking, + "--tools", + PIBT_TOOLS, + "--no-session", + "--no-context-files", + "--no-skills", + "--no-prompt-templates", + "--no-themes", + "--offline", + "--append-system-prompt", + system_prompt, + task_description, + ] + return cmd + + +def _stringify_content(content) -> str: + if isinstance(content, str): + return content + if isinstance(content, list): + parts = [] + for c in content: + if isinstance(c, dict): + t = c.get("type") + if t == "text": + parts.append(c.get("text", "")) + elif t == "thinking": + parts.append(c.get("thinking", "")) + elif t == "image": + parts.append("") + else: + try: + parts.append(json.dumps(c, separators=(",", ":"))) + except Exception: + parts.append(str(c)) + else: + parts.append(str(c)) + return "\n".join(parts) + try: + return json.dumps(content, default=str) + except Exception: + return str(content) + + +def _format_assistant_message(message: dict) -> list[str]: + steps: list[str] = [] + content = message.get("content", []) or [] + if not isinstance(content, list): + return steps + for block in content: + if not isinstance(block, dict): + continue + btype = block.get("type") + if btype == "text": + text = (block.get("text") or "").strip() + if text: + steps.append(f"text: {text[:2000]}") + elif btype == "thinking": + text = (block.get("thinking") or "").strip() + if text: + steps.append(f"thinking: {text[:2000]}") + return steps + + +def _format_tool_call(tool_name: str, args) -> str: + if not isinstance(args, dict): + try: + return f"{tool_name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return tool_name + if tool_name == "bash": + return f"Bash: {(args.get('command') or '').strip()[:2000]}" + if tool_name == "cdp_eval": + return f"cdp_eval: {(args.get('code') or '').strip()[:2000]}" + if tool_name == 
"cdp_connect": + url = args.get("wsUrl") or args.get("profileDir") or "" + return f"cdp_connect: {url[:500]}" + if tool_name in ("cdp_status", "cdp_targets"): + return tool_name + if tool_name == "cdp_use_target": + return f"cdp_use_target: {args.get('targetId') or ''}" + if tool_name in ("read", "write", "edit"): + path = args.get("file_path") or args.get("path") or "" + return f"{tool_name.capitalize()}: {path}" + try: + return f"{tool_name}: {json.dumps(args, separators=(',', ':'))[:2000]}" + except Exception: + return tool_name + + +def _format_tool_result(tool_name: str, result, is_error: bool) -> str | None: + prefix = "tool_error" if is_error else "tool_result" + content = _stringify_content(result).strip() + if not content: + return None + return f"{prefix}: {content[:2000]}" + + +async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> None: + assert proc.stderr is not None + while True: + line = await proc.stderr.readline() + if not line: + break + try: + s = line.decode("utf-8", errors="replace").rstrip("\n") + except Exception: + s = repr(line) + buf.append(s) + print(f"[pi-stderr] {s}", flush=True) + + +async def execute(task_description: str) -> ExecutionResult: + params = validate_params(parse_params(), ACCEPTED_PARAMS) + model_name = _require_claude_model(os.environ["MODEL"]) + browser_name = os.environ.get("BROWSER", "browser-use-cloud") + if browser_name != "browser-use-cloud": + raise ValueError(f"Unsupported browser for pibt: {browser_name}") + thinking = params.get("thinking", "high") + + _reset_shots_dir() + + # Provision a remote browser; pi attaches over CDP via cdp_connect with the + # WS URL we hand it through env. 
+ browser_id, cdp_ws = _start_browser() + + env = { + **os.environ, + "BU_CDP_WS": cdp_ws, + "DISABLE_TELEMETRY": "1", + "PI_OFFLINE": "1", + "PI_SKIP_VERSION_CHECK": "1", + "PI_TELEMETRY": "0", + } + + system_prompt = SYSTEM_PROMPT_FILE.read_text() + cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + + start = time.time() + steps: list[str] = [] + last_assistant_text = "" + total_cost = 0.0 + saw_agent_end = False + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=EXTENSIONS_DIR, # pi loads the package.json `pi.extensions` from CWD + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + + async def _iter_stdout_lines(): + assert proc.stdout is not None + buf = bytearray() + CHUNK = 1 << 16 + while True: + chunk = await proc.stdout.read(CHUNK) + if not chunk: + if buf: + yield bytes(buf) + buf.clear() + return + buf.extend(chunk) + while True: + nl = buf.find(b"\n") + if nl < 0: + break + line_bytes = bytes(buf[:nl]) + del buf[: nl + 1] + yield line_bytes + + try: + assert proc.stdout is not None + async for raw in _iter_stdout_lines(): + if not raw: + continue + line = raw.decode("utf-8", errors="replace").rstrip("\n") + if not line: + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + print(f"[pi-stdout-raw] {line[:500]}", flush=True) + continue + + etype = event.get("type") + + if etype == "tool_execution_start": + s = _format_tool_call(event.get("toolName") or "?", event.get("args")) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + elif etype == "tool_execution_end": + s = _format_tool_result( + event.get("toolName") or "?", + event.get("result"), + bool(event.get("isError")), + ) + if s: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + + elif etype == "message_end": + msg = 
event.get("message", {}) or {} + if msg.get("role") == "assistant": + new_steps = _format_assistant_message(msg) + for s in new_steps: + steps.append(s) + print(f"[step {len(steps):>3}] {s[:500]}", flush=True) + txt = _stringify_content(msg.get("content")) + if txt: + last_assistant_text = txt + + elif etype == "turn_end": + msg = event.get("message", {}) or {} + usage = msg.get("usage") or {} + cost = ( + usage.get("cost") + or usage.get("total_cost") + or usage.get("total_cost_usd") + ) + if isinstance(cost, (int, float)): + total_cost += float(cost) + + elif etype == "agent_end": + saw_agent_end = True + msgs = event.get("messages", []) or [] + for m in reversed(msgs): + if isinstance(m, dict) and m.get("role") == "assistant": + txt = _stringify_content(m.get("content")) + if txt: + last_assistant_text = last_assistant_text or txt + break + + try: + await asyncio.wait_for(proc.wait(), timeout=60) + except asyncio.TimeoutError: + print("[pibt-runner] proc did not exit within 60s of stdout close; killing", flush=True) + proc.kill() + await proc.wait() + + try: + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + stderr_task.cancel() + finally: + if proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + if not stderr_task.done(): + stderr_task.cancel() + _stop_browser(browser_id) + + duration = time.time() - start + stderr_tail = "\n".join(stderr_buf[-50:]) + + if not saw_agent_end and proc.returncode not in (0, None): + raise RuntimeError( + f"pi exited with code {proc.returncode} before emitting agent_end. 
" + f"steps_captured={len(steps)} duration={duration:.1f}s stderr_tail:\n{stderr_tail[-2000:]}" + ) + + match = FINAL_ANSWER_RE.search(last_assistant_text or "") + answer = match.group(1).strip() if match else (last_assistant_text.strip() or "") + + if not saw_agent_end: + final_result = ( + f"[pi_no_agent_end] {answer}" + if answer + else "[pi_no_agent_end] Agent did not complete task." + ) + else: + final_result = answer or "Agent did not emit FINAL ANSWER line" + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=_collect_screenshots(), + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + + +async def main(): + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + early_params = parse_params() + if "task_timeout" in early_params: + os.environ["TASK_TIMEOUT"] = early_params["task_timeout"] + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/frameworks/pibt/system_prompt.md b/frameworks/pibt/system_prompt.md new file mode 100644 index 0000000..e3ec34a --- /dev/null +++ b/frameworks/pibt/system_prompt.md @@ -0,0 +1,13 @@ +You are evaluating a benchmark task by driving a real browser via the pi browser-harness CDP tools (`cdp_connect`, `cdp_eval`, `cdp_status`, `cdp_targets`, `cdp_use_target`). The browser-harness-js extension is already installed. + +Hard rules: +- Connect once at the start by calling `cdp_connect` with `wsUrl` set to `process.env.BU_CDP_WS` (the env var holds the WebSocket URL of a live browser-use cloud browser). Example: `cdp_connect({ "wsUrl": "" })`. 
Read the env var with `cdp_eval` first if you need to: `return process.env.BU_CDP_WS`. +- Drive the browser exclusively through `cdp_eval`. Use idiomatic helpers: `gotoUrl(url)`, `waitForLoad()`, `js("...")` or `js(() => ...)`, `pageInfo()`, `clickAtXY(x, y)`, `typeText(text)`, `pressKey(key)`, `scroll({dy})`, `captureScreenshot({path})`. For raw CDP, use `cdp("Domain.method", params)`. NEVER use `session.send(...)` or `session..(...)` -- that is not the contract. +- Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing each shot. Pass it as the path: `await captureScreenshot({ path: "/tmp/shots/step_001.png" })`. Never overwrite a previous screenshot path. The PNG is also attached inline to the tool result automatically. +- Do not install other browser tools, do not start a different browser, do not use Playwright. +- Do not ask clarifying questions. If ambiguous, pick the most reasonable interpretation and proceed. +- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: + +FINAL ANSWER: + +If the task has no textual answer (e.g. "book a flight"), write `FINAL ANSWER: done` and describe what you did in the preceding text. The judge reads your full transcript, not just this line -- but the line must be present for the run to be scored. diff --git a/frameworks/stagehand/__init__.py b/frameworks/stagehand/__init__.py new file mode 100644 index 0000000..5f28270 --- /dev/null +++ b/frameworks/stagehand/__init__.py @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/frameworks/stagehand/executor.js b/frameworks/stagehand/executor.js new file mode 100644 index 0000000..61eca97 --- /dev/null +++ b/frameworks/stagehand/executor.js @@ -0,0 +1,65 @@ +/** + * Stagehand agent executor. + * + * Reads TASK_DESCRIPTION and BROWSER from env. + * Runs the Stagehand agent and prints a JSON result to stdout. 
+ * + * Expected stdout format: + * { + * "final_result": "...", + * "steps": ["step 1", "step 2", ...], + * "screenshots_b64": ["base64...", ...], + * "num_steps": 10, + * "duration_seconds": 45.2, + * "cost": 0.05 + * } + */ + +// TODO: Implement Stagehand execution +// const { Stagehand } = require("@browserbasehq/stagehand"); + +async function main() { + const taskDescription = process.env.TASK_DESCRIPTION; + const browser = process.env.BROWSER || "browserbase"; + + if (!taskDescription) { + console.error("TASK_DESCRIPTION env var is required"); + process.exit(1); + } + + const startTime = Date.now(); + + // TODO: Initialize Stagehand with appropriate env (BROWSERBASE or LOCAL) + // const stagehand = new Stagehand({ + // env: browser === "browserbase" ? "BROWSERBASE" : "LOCAL", + // modelName: "anthropic/claude-sonnet-4-20250514", + // modelClientOptions: { apiKey: process.env.ANTHROPIC_API_KEY }, + // }); + // await stagehand.init(); + // + // const page = stagehand.context.pages()[0]; + // const agent = stagehand.agent({ modelName: "anthropic/claude-sonnet-4-20250514" }); + // const result = await agent.execute({ instruction: taskDescription }); + // + // await stagehand.close(); + + const durationSeconds = (Date.now() - startTime) / 1000; + + // TODO: Map Stagehand result to standard format + const output = { + final_result: "NOT IMPLEMENTED", + steps: [], + screenshots_b64: [], + num_steps: 0, + duration_seconds: durationSeconds, + cost: 0, + }; + + // Print JSON to stdout for the Python wrapper to parse + console.log(JSON.stringify(output)); +} + +main().catch((err) => { + console.error(err); + process.exit(1); +}); diff --git a/frameworks/stagehand/package.json b/frameworks/stagehand/package.json new file mode 100644 index 0000000..7a963ae --- /dev/null +++ b/frameworks/stagehand/package.json @@ -0,0 +1,8 @@ +{ + "name": "benchmark-stagehand-executor", + "private": true, + "type": "commonjs", + "dependencies": { + "@browserbasehq/stagehand": "^2.0.0" + } 
+} diff --git a/frameworks/stagehand/run_task.py b/frameworks/stagehand/run_task.py new file mode 100644 index 0000000..1bc2b36 --- /dev/null +++ b/frameworks/stagehand/run_task.py @@ -0,0 +1,86 @@ +"""Run a single benchmark task using the Stagehand agent framework. + +Stagehand is a TypeScript framework. This Python entry point: +1. Loads the task and wires up Laminar (shared infra) +2. Shells out to node executor.js which runs the Stagehand agent +3. Parses the JSON result from stdout into ExecutionResult +4. Feeds it into the shared judge flow +""" + +import json +import os +import subprocess +import sys +import asyncio +from pathlib import Path + +# Add project root to path for sibling imports +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from dotenv import load_dotenv +from laminar import LaminarService +from frameworks import ( + ExecutionResult, + load_tasks, + interleave, + run_and_judge, + parse_params, + validate_params, +) + +load_dotenv() + +ACCEPTED_PARAMS: dict[str, str] = {} + +EXECUTOR_DIR = Path(__file__).resolve().parent +EXECUTOR_SCRIPT = EXECUTOR_DIR / "executor.js" + + +async def execute(task_description: str) -> ExecutionResult: + """Run the Stagehand agent via node subprocess.""" + browser_name = os.environ.get("BROWSER", "browserbase") + + env = {**os.environ, "TASK_DESCRIPTION": task_description, "BROWSER": browser_name} + proc = subprocess.run( + ["node", str(EXECUTOR_SCRIPT)], + capture_output=True, + text=True, + timeout=900, + env=env, + cwd=str(EXECUTOR_DIR), + ) + + if proc.returncode != 0: + raise RuntimeError(f"Stagehand executor failed: {proc.stderr}") + + data = json.loads(proc.stdout) + return ExecutionResult( + final_result=data.get("final_result", ""), + steps=data.get("steps", []), + screenshots_b64=data.get("screenshots_b64", []), + num_steps=data.get("num_steps", 0), + duration_seconds=data.get("duration_seconds", 0), + cost=data.get("cost", 0), + ) + + +async def main(): + 
validate_params(parse_params(), ACCEPTED_PARAMS) + task_index = int(os.environ["TASK_INDEX"]) + eval_id = os.environ["EVAL_ID"] + benchmark = os.environ.get("BENCHMARK", "BU_Bench_V1") + + tasks = load_tasks(benchmark) + if len(tasks) == 100: + tasks = interleave(tasks) + task = tasks[task_index] + task["_index"] = task_index + + LaminarService.initialize() + LaminarService.attach_evaluation(eval_id) + + await run_and_judge(task, execute) + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/laminar.py b/laminar.py new file mode 100644 index 0000000..1cb7d9e --- /dev/null +++ b/laminar.py @@ -0,0 +1,49 @@ +"""No-op Laminar compatibility layer for public local verification.""" + +from typing import Any + + +class LaminarService: + @classmethod + def initialize(cls) -> bool: + return False + + @classmethod + def is_enabled(cls) -> bool: + return False + + @classmethod + def create_evaluation(cls, *args: Any, **kwargs: Any) -> None: + return None + + @classmethod + def attach_evaluation(cls, eval_id: str) -> None: + return None + + @classmethod + def get_eval_id(cls) -> None: + return None + + @classmethod + def get_eval_url(cls) -> None: + return None + + @classmethod + def create_datapoint(cls, task: dict[str, Any]) -> None: + return None + + @classmethod + def set_datapoint_score( + cls, + datapoint_id: str | None, + score: int, + final_result: str, + agent_steps: list[str], + metrics: dict[str, Any], + judgement: dict[str, Any], + ) -> None: + return None + + @classmethod + def set_datapoint_error(cls, datapoint_id: str | None, error_msg: str) -> None: + return None diff --git a/lmnr.py b/lmnr.py new file mode 100644 index 0000000..c454556 --- /dev/null +++ b/lmnr.py @@ -0,0 +1,38 @@ +"""Local no-op subset of lmnr used by the public benchmark runners. + +The remote benchmark runner can attach traces to Laminar. Public +verification writes local JSON artifacts instead, so these hooks are inert. 
+""" + +from collections.abc import Callable +from typing import Any, TypeVar + +F = TypeVar("F", bound=Callable[..., Any]) + + +def observe(*args: Any, **kwargs: Any): + if args and callable(args[0]) and len(args) == 1 and not kwargs: + return args[0] + + def decorator(fn: F) -> F: + return fn + + return decorator + + +class Laminar: + @staticmethod + def initialize(*args: Any, **kwargs: Any) -> None: + return None + + @staticmethod + def serialize_span_context() -> None: + return None + + @staticmethod + def get_trace_id() -> None: + return None + + +class LaminarClient: + pass diff --git a/models.py b/models.py new file mode 100644 index 0000000..9ad0e2c --- /dev/null +++ b/models.py @@ -0,0 +1,50 @@ +"""Public model registry for local benchmark verification.""" + +import os + +from browser_use import ChatGoogle +from browser_use.llm import ChatAnthropic, ChatBrowserUse, ChatOpenAI + + +def _openai(model: str): + return ChatOpenAI(model=model, api_key=os.getenv("OPENAI_API_KEY")) + + +def _anthropic(model: str): + return ChatAnthropic(model=model, api_key=os.getenv("ANTHROPIC_API_KEY")) + + +def _google(model: str): + return ChatGoogle(model=model, api_key=os.getenv("GOOGLE_API_KEY")) + + +def _openrouter(model: str): + return ChatOpenAI( + model=model, + base_url="https://openrouter.ai/api/v1", + api_key=os.getenv("OPENROUTER_API_KEY"), + ) + + +MODELS = { + "bu-1-0": lambda: ChatBrowserUse(model="bu-1-0"), + "bu-2-0": lambda: ChatBrowserUse(model="bu-2-0"), + "gpt-4.1": lambda: _openai("gpt-4.1"), + "gpt-5": lambda: _openai("gpt-5"), + "gpt-5-mini": lambda: _openai("gpt-5-mini"), + "gpt-5.1-codex-mini": lambda: _openai("gpt-5.1-codex-mini"), + "claude-3-5-haiku": lambda: _anthropic("claude-3-5-haiku"), + "claude-haiku-4-5": lambda: _anthropic("claude-haiku-4-5"), + "claude-sonnet-4-5": lambda: _anthropic("claude-sonnet-4-5"), + "claude-sonnet-4-6": lambda: _anthropic("claude-sonnet-4-6"), + "claude-opus-4-5": lambda: _anthropic("claude-opus-4-5"), + 
"claude-opus-4-6": lambda: _anthropic("claude-opus-4-6"), + "claude-opus-4-7": lambda: _anthropic("claude-opus-4-7"), + "gemini-2.5-flash-lite": lambda: _google("gemini-2.5-flash-lite"), + "gemini-2.5-flash": lambda: _google("gemini-2.5-flash"), + "gemini-3-flash-preview": lambda: _google("gemini-3-flash-preview"), + "gemini-3-pro-preview": lambda: _google("gemini-3-pro-preview"), + "gemini-3.1-pro-preview": lambda: _google("gemini-3.1-pro-preview"), + "gemini-3-1-pro-preview": lambda: _google("gemini-3.1-pro-preview"), + "kimi-k2.5": lambda: _openrouter("moonshotai/kimi-k2.5"), +} diff --git a/run_framework_eval.py b/run_framework_eval.py new file mode 100644 index 0000000..8114b4e --- /dev/null +++ b/run_framework_eval.py @@ -0,0 +1,295 @@ +"""Run BU_Bench_V1 through any registered framework adapter. + +The public verifier decrypts BU_Bench_V1.enc in memory, runs the selected +adapter, judges each trace, writes summaries under ignored results/, and writes +task-level traces under ignored run_data/. Do not publish run_data/ artifacts: +they include decrypted task text, ground truth, model outputs, and screenshots. + +Examples: + uv run python run_framework_eval.py --list-frameworks + uv run python run_framework_eval.py --framework browser-use --browser browser-use-cloud --model bu-2-0 + uv run python run_framework_eval.py --framework browser-use-cloud-api-v3 --model bu-ultra + uv run python run_framework_eval.py --framework bcode-v012 --framework-ref v0.1.2 --model gpt-5 --tasks 5 + +Useful options: + --framework + --framework-ref + --browser + --model + --tasks 10 + --parallel 3 + --params key=value,key=value + +Adapter prerequisites: + browser-use: + Install the desired browser-use package/ref into the uv environment. + browser-use-cloud-api-v2, browser-use-cloud-api-v3: + Set BROWSER_USE_API_KEY; no browser provider is needed. + bcode, bcode-v012: + Install bcode at $HOME/.bcode/bin/bcode; set BROWSER_USE_API_KEY and + model provider keys. 
+ browserbase-agent: + Run `npm install --prefix frameworks/browserbase_agent`; set + BROWSERBASE_API_KEY and BROWSERBASE_PROJECT_ID. + claude-code-harness, codex-harness, pi-harness: + Install the agent CLI, clone browser-use/browser-harness at the desired + ref to /tmp/browser-harness, and install it into the uv environment. + claude-code-harness-js: + Install Claude Code, clone/install browser-use/browser-harness-js, and + put browser-harness-js on PATH. + claude-code-harness-ab: + Install Claude Code and agent-browser, then install its browser assets. + claude-code-harness-bu-cli: + Install Claude Code and browser-use CLI at the desired ref. + pibt: + Install pi, clone/install browser-use/pi-agent-extensions to + /tmp/pi-agent-extensions, and install its JS dependencies. + but: + Install browser-use/browser-use-terminal to /tmp/but with + `uv sync --project /tmp/but`. + but-rust: + Build /tmp/but-rust/target/release/browser-use-terminal and provide + browser-harness to the worker. +""" + +import argparse +import asyncio +import json +import os +import re +import sys +from datetime import datetime +from pathlib import Path + +from dotenv import load_dotenv + +from frameworks import FRAMEWORKS, framework_to_module, interleave, load_tasks + +ROOT_DIR = Path(__file__).resolve().parent + + +def _safe_part(value: str) -> str: + return re.sub(r"[^A-Za-z0-9_.-]+", "-", value).strip("-") or "unknown" + + +def _selected_indices(total: int, args: argparse.Namespace) -> list[int]: + if args.task_indices: + indices = [int(x.strip()) for x in args.task_indices.split(",") if x.strip()] + else: + count = args.tasks if args.tasks is not None else total - args.task_start + indices = list(range(args.task_start, min(args.task_start + count, total))) + bad = [i for i in indices if i < 0 or i >= total] + if bad: + raise SystemExit(f"Task index out of range: {bad[:5]} for benchmark size {total}") + return indices + + +async def _run_one( + *, + task_index: int, + framework: str, + 
model: str, + browser: str, + benchmark: str, + params: str, + run_data_dir: Path, + task_results_dir: Path, + task_timeout: int | None, +) -> dict: + module_name = framework_to_module(framework) + runner = ROOT_DIR / "frameworks" / module_name / "run_task.py" + if not runner.exists(): + return { + "task_index": task_index, + "task_id": None, + "score": 0, + "steps": 0, + "duration": 0, + "cost": 0, + "error": f"Missing framework runner: {runner}", + } + + result_file = task_results_dir / f"task_{task_index}.json" + env = os.environ.copy() + env.update( + { + "MODEL": model, + "TASK_INDEX": str(task_index), + "EVAL_ID": "local", + "FRAMEWORK": framework, + "BROWSER": browser, + "BENCHMARK": benchmark, + "PARAMS": params, + "LOCAL_RESULT_FILE": str(result_file), + "RUN_DATA_DIR": str(run_data_dir), + "BROWSER_USE_SETUP_LOGGING": "false", + } + ) + if task_timeout is not None: + env["TASK_TIMEOUT"] = str(task_timeout) + if os.environ.get("NO_INTERLEAVE") == "1": + env["NO_INTERLEAVE"] = "1" + + proc = await asyncio.create_subprocess_exec( + sys.executable, + str(runner), + cwd=str(ROOT_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout, stderr = await proc.communicate() + if stdout: + print(stdout.decode("utf-8", errors="replace"), end="") + if stderr: + print(stderr.decode("utf-8", errors="replace"), end="", file=sys.stderr) + + if result_file.exists(): + return json.loads(result_file.read_text(encoding="utf-8")) + + return { + "task_index": task_index, + "task_id": None, + "score": 0, + "steps": 0, + "duration": 0, + "cost": 0, + "error": f"Runner exited {proc.returncode} without writing {result_file}", + } + + +async def _run_all(args: argparse.Namespace) -> list[dict]: + tasks = load_tasks(args.benchmark) + if not args.no_interleave: + tasks = interleave(tasks) + indices = _selected_indices(len(tasks), args) + + framework_info = FRAMEWORKS[args.framework] + browser = args.browser or framework_info.browsers[0] + if 
browser not in framework_info.browsers: + valid = ", ".join(framework_info.browsers) + raise SystemExit( + f"Browser {browser!r} is not supported by {args.framework!r}. " + f"Valid browsers: {valid}" + ) + + run_start = datetime.now().strftime("%Y%m%d_%H%M%S") + run_key = ( + f"{args.benchmark}_framework_{_safe_part(args.framework)}" + f"_browser_{_safe_part(browser)}" + f"_model_{_safe_part(args.model)}" + ) + run_data_dir = ROOT_DIR / "run_data" / f"{run_key}_start_at_{run_start}" + task_results_dir = run_data_dir / "_task_results" + results_file = ROOT_DIR / "results" / f"{run_key}.json" + + print( + f"Running {len(indices)} task(s): benchmark={args.benchmark} " + f"framework={args.framework} browser={browser} model={args.model}" + ) + if framework_info.repo: + print(f"Framework repo: {framework_info.repo} ref={args.framework_ref}") + + semaphore = asyncio.Semaphore(args.parallel) + + async def guarded(i: int) -> dict: + async with semaphore: + return await _run_one( + task_index=i, + framework=args.framework, + model=args.model, + browser=browser, + benchmark=args.benchmark, + params=args.params, + run_data_dir=run_data_dir, + task_results_dir=task_results_dir, + task_timeout=args.task_timeout, + ) + + results = await asyncio.gather(*(guarded(i) for i in indices)) + + run_entry = { + "run_start": run_start, + "benchmark": args.benchmark, + "framework": args.framework, + "framework_ref": args.framework_ref, + "browser": browser, + "model": args.model, + "params": args.params, + "task_indices": indices, + "tasks_completed": len(results), + "tasks_successful": sum(1 for r in results if r.get("score") == 1), + "total_steps": sum(int(r.get("steps", 0) or 0) for r in results), + "total_duration": sum(float(r.get("duration", 0) or 0) for r in results), + "total_cost": sum(float(r.get("cost", 0) or 0) for r in results), + "task_results": [ + { + "task_id": r.get("task_id"), + "task_index": r.get("task_index"), + "score": r.get("score"), + "steps": r.get("steps", 0), + 
"duration": r.get("duration", 0), + "cost": r.get("cost", 0), + **({"error": r["error"]} if r.get("error") else {}), + } + for r in results + ], + } + + results_file.parent.mkdir(parents=True, exist_ok=True) + previous = json.loads(results_file.read_text()) if results_file.exists() else [] + previous.append(run_entry) + results_file.write_text(json.dumps(previous, indent=2), encoding="utf-8") + + print( + f"Run complete: {run_entry['tasks_successful']}/{run_entry['tasks_completed']} " + f"successful, {run_entry['total_steps']} steps, " + f"{run_entry['total_duration']:.1f}s, ${run_entry['total_cost']:.2f}" + ) + print(f"Summary: {results_file}") + print(f"Trace artifacts: {run_data_dir}") + return results + + +def _print_frameworks() -> None: + for name, info in sorted(FRAMEWORKS.items()): + browsers = ", ".join(info.browsers) + repo = f" repo={info.repo}" if info.repo else "" + notes = f" ({info.notes})" if info.notes else "" + print(f"{name}: browsers=[{browsers}]{repo}{notes}") + + +def main() -> None: + load_dotenv() + parser = argparse.ArgumentParser(description="Run public BU_Bench_V1 reverification") + parser.add_argument("--benchmark", default="BU_Bench_V1") + parser.add_argument("--framework", choices=sorted(FRAMEWORKS), default="browser-use") + parser.add_argument("--framework-ref", default="installed") + parser.add_argument("--browser", default=None) + parser.add_argument("--model", default="bu-2-0") + parser.add_argument("--params", default="") + parser.add_argument("--tasks", type=int, default=None) + parser.add_argument("--task-start", type=int, default=0) + parser.add_argument("--task-indices", default="") + parser.add_argument("--parallel", type=int, default=1) + parser.add_argument("--task-timeout", type=int, default=None) + parser.add_argument( + "--no-interleave", + action="store_true", + help="Use raw encrypted task order instead of the distributed runner order.", + ) + parser.add_argument("--list-frameworks", action="store_true") + args = 
parser.parse_args() + + if args.list_frameworks: + _print_frameworks() + return + + if args.no_interleave: + os.environ["NO_INTERLEAVE"] = "1" + + asyncio.run(_run_all(args)) + + +if __name__ == "__main__": + main() From e3a3ebf647666024a0de5728ee17ece6b0c77ffe Mon Sep 17 00:00:00 2001 From: Alezander9 Date: Tue, 12 May 2026 16:28:06 -0700 Subject: [PATCH 2/2] Address framework verifier review comments --- frameworks/__init__.py | 2 +- frameworks/bcode/run_task.py | 36 +- frameworks/browser_use/run_task.py | 37 +- frameworks/browserbase_agent/package.json | 2 +- frameworks/but/run_task.py | 47 ++- frameworks/but_rust/run_task.py | 329 +++++++++--------- .../claude_code_harness/system_prompt.md | 2 +- frameworks/claude_code_harness_ab/run_task.py | 58 +-- .../claude_code_harness_ab/system_prompt.md | 14 +- .../claude_code_harness_bu_cli/run_task.py | 59 ++-- .../system_prompt.md | 14 +- frameworks/claude_code_harness_js/run_task.py | 37 +- .../claude_code_harness_js/system_prompt.md | 2 +- frameworks/claude_cua/run_task.py | 15 +- frameworks/codex_harness/run_task.py | 93 ++--- frameworks/codex_harness/system_prompt.md | 2 +- frameworks/pi_harness/system_prompt.md | 2 +- frameworks/pibt/run_task.py | 45 ++- frameworks/stagehand/executor.js | 20 +- run_framework_eval.py | 2 + 20 files changed, 442 insertions(+), 376 deletions(-) diff --git a/frameworks/__init__.py b/frameworks/__init__.py index 18c6390..ca44e7f 100644 --- a/frameworks/__init__.py +++ b/frameworks/__init__.py @@ -306,7 +306,7 @@ async def run_and_judge( _maybe_write_local_result(data) return data - except BaseException as e: + except Exception as e: error_msg = f"{type(e).__name__}: {e}" tb = traceback.format_exc() print(f"Task {task_id} failed: {error_msg}") diff --git a/frameworks/bcode/run_task.py b/frameworks/bcode/run_task.py index 434fb55..f0ce2b9 100644 --- a/frameworks/bcode/run_task.py +++ b/frameworks/bcode/run_task.py @@ -148,11 +148,17 @@ def _bu(path: str, method: str, body: dict | None = 
None) -> dict: def _start_browser() -> tuple[str, str]: """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -274,15 +280,19 @@ async def execute(task_description: str) -> ExecutionResult: errors: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd="/tmp", - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd="/tmp", + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + _stop_browser(browser_id) + raise try: async for raw in _iter_lines(proc.stdout): diff --git a/frameworks/browser_use/run_task.py b/frameworks/browser_use/run_task.py index ac521ab..9ade8a5 100644 --- a/frameworks/browser_use/run_task.py +++ b/frameworks/browser_use/run_task.py @@ -48,28 +48,33 @@ async def execute( ) -> ExecutionResult: """Run a browser-use agent on the task and return a standardized result.""" provider = BROWSERS[browser_name] - cdp_url = await provider.connect() - if cdp_url: - browser = Browser(cdp_url=cdp_url) - else: - headless = getattr(provider, "HEADLESS", True) - browser = Browser(headless=headless) - - agent = Agent( - task=task_description, - llm=llm, - 
browser=browser, - use_judge=False, - use_vision=use_vision, - ) + browser = None try: + cdp_url = await provider.connect() + if cdp_url: + browser = Browser(cdp_url=cdp_url) + else: + headless = getattr(provider, "HEADLESS", True) + browser = Browser(headless=headless) + + agent = Agent( + task=task_description, + llm=llm, + browser=browser, + use_judge=False, + use_vision=use_vision, + ) history = await agent.run() finally: + if browser is not None: + try: + await browser.kill() + except Exception: + pass try: - await browser.kill() + await provider.disconnect() except Exception: pass - await provider.disconnect() return ExecutionResult( final_result=history.final_result() or "Agent did not return a result", diff --git a/frameworks/browserbase_agent/package.json b/frameworks/browserbase_agent/package.json index 58d1e87..55b6c5d 100644 --- a/frameworks/browserbase_agent/package.json +++ b/frameworks/browserbase_agent/package.json @@ -4,6 +4,6 @@ "type": "module", "description": "Node executor for the browserbase-agent eval framework: drives Stagehand SDK against Browserbase cloud. Pinned to a Stagehand version that has the opus-4-7 temperature fix (PRs #2006/#2018, shipped in client 3.4.0).", "dependencies": { - "@browserbasehq/stagehand": "^3.4.0" + "@browserbasehq/stagehand": "3.4.0" } } diff --git a/frameworks/but/run_task.py b/frameworks/but/run_task.py index 30066cb..0864789 100644 --- a/frameworks/but/run_task.py +++ b/frameworks/but/run_task.py @@ -126,11 +126,17 @@ def _bu(path: str, method: str, body: dict | None = None) -> dict: def _start_browser() -> tuple[str, str]: """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -260,8 +266,12 @@ async def execute(task_description: str) -> ExecutionResult: # passing the env var costs nothing on the current version. parent_span_context = Laminar.serialize_span_context() - system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") - full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + try: + system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + except Exception: + _stop_browser(browser_id) + raise env = { **os.environ, @@ -300,15 +310,19 @@ async def execute(task_description: str) -> ExecutionResult: stdout_chunks: list[str] = [] session_id: str | None = None - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=BUT_PROJECT_DIR, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=BUT_PROJECT_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + _stop_browser(browser_id) + raise try: async for raw in _iter_lines(proc.stdout): @@ -385,7 +399,8 @@ async def execute(task_description: str) -> ExecutionResult: if not final_text: for event in reversed(events): if 
(event.get("type") or "") in ("assistant.message", "message.assistant"): - text = ((event.get("payload") or {}).get("text") or "").strip() + payload = event.get("payload") or {} + text = (payload.get("text") or payload.get("content") or "").strip() if text: final_text = text break diff --git a/frameworks/but_rust/run_task.py b/frameworks/but_rust/run_task.py index f54b9b1..8627770 100644 --- a/frameworks/but_rust/run_task.py +++ b/frameworks/but_rust/run_task.py @@ -118,11 +118,17 @@ def _bu(path: str, method: str, body: dict | None = None) -> dict: def _start_browser() -> tuple[str, str]: + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -190,171 +196,172 @@ async def execute(task_description: str) -> ExecutionResult: task_idx = os.environ.get("TASK_INDEX", "0") browser_id, cdp_ws = _start_browser() - - state_dir = STATE_ROOT / f"task-{task_idx}-{int(time.time() * 1000)}" - if state_dir.exists(): - shutil.rmtree(state_dir) - state_dir.mkdir(parents=True) - - parent_span_context = Laminar.serialize_span_context() - - system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") - full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" - - env = { - **os.environ, - # The Python worker (spawned by the Rust agent loop) honors BU_CDP_WS - # directly via `_ensure_managed_chrome`/`_ensure_cloud_browser` short - # circuits. Pass both URL forms for robustness. 
- "BU_CDP_WS": cdp_ws, - # Force flush on one-shot CLI runs so OTLP spans actually leave the - # process before exit (see docs/README on this branch). - "LLM_BROWSER_LAMINAR_FLUSH_ON_FINISH": "1", - } - if parent_span_context: - # Forward-compat: but-rust telemetry doesn't honor this yet, but it - # doesn't error on unknown env either. - env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context - - # `--state-dir` is a TOP-LEVEL arg on the Rust CLI -- must come BEFORE - # the subcommand. - cmd_run = [ - BUT_RUST_BIN, - "--state-dir", str(state_dir), - subcommand, - full_task, - "--model", model, - ] - start = time.time() - stdout_buf: list[str] = [] - stderr_buf: list[str] = [] - - proc = await asyncio.create_subprocess_exec( - *cmd_run, - cwd=BUT_RUST_REPO_DIR, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - stdout_task = asyncio.create_task(_read_stream(proc.stdout, "but-rust-stdout", stdout_buf)) - stderr_task = asyncio.create_task(_read_stream(proc.stderr, "but-rust-stderr", stderr_buf)) - try: - await proc.wait() - await asyncio.wait_for(stdout_task, timeout=10) - await asyncio.wait_for(stderr_task, timeout=10) - except asyncio.TimeoutError: - for t in (stdout_task, stderr_task): - if not t.done(): - t.cancel() - finally: - if proc.returncode is None: - proc.kill() - try: - await asyncio.wait_for(proc.wait(), timeout=10) - except asyncio.TimeoutError: - pass - - # `run-openai`/etc print the session_id as the final non-empty stdout line. - session_id = "" - for line in reversed(stdout_buf): - line = line.strip() - if line and not line.startswith("{"): - session_id = line - break - - if not session_id: - _stop_browser(browser_id) - raise RuntimeError( - f"but-rust: no session_id captured from stdout (exit={proc.returncode}). 
" - f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + state_dir = STATE_ROOT / f"task-{task_idx}-{int(time.time() * 1000)}" + if state_dir.exists(): + shutil.rmtree(state_dir) + state_dir.mkdir(parents=True) + + parent_span_context = Laminar.serialize_span_context() + + system_prompt = SYSTEM_PROMPT_PATH.read_text(encoding="utf-8") + full_task = f"{system_prompt.strip()}\n\nTask:\n{task_description}" + + env = { + **os.environ, + # The Python worker (spawned by the Rust agent loop) honors BU_CDP_WS + # directly via `_ensure_managed_chrome`/`_ensure_cloud_browser` short + # circuits. Pass both URL forms for robustness. + "BU_CDP_WS": cdp_ws, + # Force flush on one-shot CLI runs so OTLP spans actually leave the + # process before exit (see docs/README on this branch). + "LLM_BROWSER_LAMINAR_FLUSH_ON_FINISH": "1", + } + if parent_span_context: + # Forward-compat: but-rust telemetry doesn't honor this yet, but it + # doesn't error on unknown env either. + env["LMNR_PARENT_SPAN_CONTEXT"] = parent_span_context + + # `--state-dir` is a TOP-LEVEL arg on the Rust CLI -- must come BEFORE + # the subcommand. + cmd_run = [ + BUT_RUST_BIN, + "--state-dir", str(state_dir), + subcommand, + full_task, + "--model", model, + ] + + stdout_buf: list[str] = [] + stderr_buf: list[str] = [] + + proc = await asyncio.create_subprocess_exec( + *cmd_run, + cwd=BUT_RUST_REPO_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, ) + stdout_task = asyncio.create_task(_read_stream(proc.stdout, "but-rust-stdout", stdout_buf)) + stderr_task = asyncio.create_task(_read_stream(proc.stderr, "but-rust-stderr", stderr_buf)) - # Dump events for this session. 
- cmd_events = [BUT_RUST_BIN, "--state-dir", str(state_dir), "events", session_id] - events_proc = await asyncio.create_subprocess_exec( - *cmd_events, - cwd=BUT_RUST_REPO_DIR, - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - events_stdout, events_stderr = await events_proc.communicate() - _stop_browser(browser_id) - duration = time.time() - start - - events: list[dict] = [] - for line in events_stdout.decode("utf-8", errors="replace").splitlines(): - line = line.strip() - if not line: - continue try: - events.append(json.loads(line)) - except json.JSONDecodeError: - continue - - steps: list[str] = [] - final_text = "" - total_cost = 0.0 - errors: list[str] = [] - - for event in events: - if (s := _format_step_from_event(event)): - steps.append(s) - etype = event.get("type") or "" - payload = event.get("payload") or {} - if etype == "session.done": - done_result = (payload.get("result") or "").strip() - if done_result: - final_text = done_result - elif etype in ("model.usage", "llm.usage"): - cost_usd = payload.get("cost_usd") or payload.get("cost") - if cost_usd is not None: + await proc.wait() + await asyncio.wait_for(stdout_task, timeout=10) + await asyncio.wait_for(stderr_task, timeout=10) + except asyncio.TimeoutError: + for t in (stdout_task, stderr_task): + if not t.done(): + t.cancel() + finally: + if proc.returncode is None: + proc.kill() try: - total_cost += float(cost_usd) - except (TypeError, ValueError): + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: pass - elif etype in ("tool.failed", "session.failed", "error"): - err = payload.get("error") or payload.get("message") or "" - if err: - errors.append(str(err)) - print(f"[but-rust-error] {str(err)[:500]}", flush=True) - - if not final_text: - for event in reversed(events): - if (event.get("type") or "") in ("assistant.message", "message.assistant"): - text = ((event.get("payload") or {}).get("text") or 
"").strip() - if text: - final_text = text - break - - if proc.returncode not in (0, None) and not final_text and not steps: - raise RuntimeError( - f"but-rust exited with code {proc.returncode} before producing output. " - f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" - ) - answer = (final_text or "").strip() - if errors and not answer: - final_result = f"[but_rust_error] {errors[0][:500]}" - elif errors: - final_result = f"[but_rust_error_recovered] {answer}" - else: - final_result = answer or "[but_rust_no_output]" - - screenshots = _collect_screenshots(state_dir, session_id) - - return ExecutionResult( - final_result=final_result, - steps=steps, - screenshots_b64=screenshots, - num_steps=len(steps), - duration_seconds=duration, - cost=total_cost, - ) + # `run-openai`/etc print the session_id as the final non-empty stdout line. + session_id = "" + for line in reversed(stdout_buf): + line = line.strip() + if line and not line.startswith("{"): + session_id = line + break + + if not session_id: + raise RuntimeError( + f"but-rust: no session_id captured from stdout (exit={proc.returncode}). " + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + # Dump events for this session. 
+ cmd_events = [BUT_RUST_BIN, "--state-dir", str(state_dir), "events", session_id] + events_proc = await asyncio.create_subprocess_exec( + *cmd_events, + cwd=BUT_RUST_REPO_DIR, + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + events_stdout, _events_stderr = await events_proc.communicate() + duration = time.time() - start + + events: list[dict] = [] + for line in events_stdout.decode("utf-8", errors="replace").splitlines(): + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + + steps: list[str] = [] + final_text = "" + total_cost = 0.0 + errors: list[str] = [] + + for event in events: + if (s := _format_step_from_event(event)): + steps.append(s) + etype = event.get("type") or "" + payload = event.get("payload") or {} + if etype == "session.done": + done_result = (payload.get("result") or "").strip() + if done_result: + final_text = done_result + elif etype in ("model.usage", "llm.usage"): + cost_usd = payload.get("cost_usd") or payload.get("cost") + if cost_usd is not None: + try: + total_cost += float(cost_usd) + except (TypeError, ValueError): + pass + elif etype in ("tool.failed", "session.failed", "error"): + err = payload.get("error") or payload.get("message") or "" + if err: + errors.append(str(err)) + print(f"[but-rust-error] {str(err)[:500]}", flush=True) + + if not final_text: + for event in reversed(events): + if (event.get("type") or "") in ("assistant.message", "message.assistant"): + payload = event.get("payload") or {} + text = (payload.get("text") or payload.get("content") or "").strip() + if text: + final_text = text + break + + if proc.returncode not in (0, None) and not final_text and not steps: + raise RuntimeError( + f"but-rust exited with code {proc.returncode} before producing output. 
" + f"stderr_tail:\n{chr(10).join(stderr_buf[-50:])[-2000:]}" + ) + + answer = (final_text or "").strip() + if errors and not answer: + final_result = f"[but_rust_error] {errors[0][:500]}" + elif errors: + final_result = f"[but_rust_error_recovered] {answer}" + else: + final_result = answer or "[but_rust_no_output]" + + screenshots = _collect_screenshots(state_dir, session_id) + + return ExecutionResult( + final_result=final_result, + steps=steps, + screenshots_b64=screenshots, + num_steps=len(steps), + duration_seconds=duration, + cost=total_cost, + ) + finally: + _stop_browser(browser_id) async def main(): diff --git a/frameworks/claude_code_harness/system_prompt.md b/frameworks/claude_code_harness/system_prompt.md index d1566de..5520ae4 100644 --- a/frameworks/claude_code_harness/system_prompt.md +++ b/frameworks/claude_code_harness/system_prompt.md @@ -5,7 +5,7 @@ Hard rules: - A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. - Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. 
- When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: FINAL ANSWER: diff --git a/frameworks/claude_code_harness_ab/run_task.py b/frameworks/claude_code_harness_ab/run_task.py index baddc77..307c50b 100644 --- a/frameworks/claude_code_harness_ab/run_task.py +++ b/frameworks/claude_code_harness_ab/run_task.py @@ -117,11 +117,17 @@ def _start_browser(browser_name: str) -> tuple[str, str]: """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" if browser_name != "browser-use-cloud": raise ValueError(f"Unsupported browser for claude-code-harness-ab: {browser_name}") + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -261,16 +267,11 @@ async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> Non print(f"[claude-stderr] {s}", flush=True) -async def _close_agent_browser_sessions() -> None: - """Best-effort: tell agent-browser to shut down all daemons. - - agent-browser spawns a per-session background daemon (one per - `--session` name). `close --all` quits every active session so a - leaked daemon does not survive across tasks on the same runner. 
- """ +async def _close_agent_browser_session(session_name: str) -> None: + """Best-effort: tell agent-browser to shut down this task's daemon.""" try: stop_proc = await asyncio.create_subprocess_exec( - "agent-browser", "close", "--all", + "agent-browser", "--session", session_name, "close", stdout=asyncio.subprocess.DEVNULL, stderr=asyncio.subprocess.DEVNULL, ) @@ -291,12 +292,15 @@ async def execute(task_description: str) -> ExecutionResult: _reset_dir(WORK_DIR) # Pre-provision a remote browser; pass its WS URL to the agent via env. - # The agent runs: agent-browser --cdp "$BU_CDP_WS" open + # The agent runs: + # agent-browser --session "$AB_SESSION" --cdp "$BU_CDP_WS" open browser_id, cdp_ws = _start_browser(browser_name) + session_name = f"eval-{os.environ.get('TASK_INDEX', '0')}-{os.getpid()}" env = { **os.environ, "BU_CDP_WS": cdp_ws, + "AB_SESSION": session_name, "DISABLE_TELEMETRY": "1", "DISABLE_AUTOUPDATER": "1", "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", @@ -319,16 +323,20 @@ async def execute(task_description: str) -> ExecutionResult: result_errors: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=str(WORK_DIR), - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + await _close_agent_browser_session(session_name) + _stop_browser(browser_id) + raise async def _iter_stdout_lines(): assert proc.stdout is not None @@ -399,9 +407,7 @@ async def _iter_stdout_lines(): pass if not stderr_task.done(): stderr_task.cancel() - # Best-effort: close any agent-browser daemon(s) the agent left running - # 
so they don't leak across tasks on the same runner. - await _close_agent_browser_sessions() + await _close_agent_browser_session(session_name) _stop_browser(browser_id) duration = time.time() - start diff --git a/frameworks/claude_code_harness_ab/system_prompt.md b/frameworks/claude_code_harness_ab/system_prompt.md index eec2715..fcdcfea 100644 --- a/frameworks/claude_code_harness_ab/system_prompt.md +++ b/frameworks/claude_code_harness_ab/system_prompt.md @@ -2,25 +2,25 @@ Hard rules: - Use the `agent-browser` CLI for every browser interaction. It is on your PATH. Do NOT install other browser tools, do NOT use Playwright/Puppeteer directly, do NOT call any built-in WebFetch -- drive the live browser via `agent-browser` only. -- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from your environment and running: +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` and `AB_SESSION` from your environment and running: ``` - agent-browser --cdp "$BU_CDP_WS" open + agent-browser --session "$AB_SESSION" --cdp "$BU_CDP_WS" open ``` - All subsequent `agent-browser ` calls automatically reuse this daemon -- you do NOT need to pass `--cdp` again, and you should NOT call `agent-browser open` a second time without a URL. Just issue the next verb (`snapshot`, `click @e2`, `screenshot`, etc.). + All subsequent `agent-browser ` calls must include `--session "$AB_SESSION"` and automatically reuse this daemon -- you do NOT need to pass `--cdp` again, and you should NOT call `agent-browser open` a second time without a URL. Just issue the next verb (`--session "$AB_SESSION" snapshot`, `--session "$AB_SESSION" click @e2`, `--session "$AB_SESSION" screenshot`, etc.). 
- Before issuing your first command, read the bundled skill so you know the full command surface and current best-practice workflow: ``` agent-browser skills get core ``` Use `agent-browser skills get core --full` for the complete command reference. The CLI also accepts `--help` on any subcommand. -- Prefer the accessibility-tree workflow: `agent-browser snapshot -i` to list interactive elements with stable `@eN` refs, then `agent-browser click @eN` / `agent-browser fill @eN ""` to interact. Fall back to CSS selectors or `find role --name "..."` semantic locators when refs are insufficient. +- Prefer the accessibility-tree workflow: `agent-browser --session "$AB_SESSION" snapshot -i` to list interactive elements with stable `@eN` refs, then `agent-browser --session "$AB_SESSION" click @eN` / `agent-browser --session "$AB_SESSION" fill @eN ""` to interact. Fall back to CSS selectors or `find role --name "..."` semantic locators when refs are insufficient. - Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot. Use the `--screenshot-dir` / explicit-path form so files land on disk and the judge can see them: ``` - agent-browser screenshot /tmp/shots/step_001.png - agent-browser screenshot /tmp/shots/step_002.png + agent-browser --session "$AB_SESSION" screenshot /tmp/shots/step_001.png + agent-browser --session "$AB_SESSION" screenshot /tmp/shots/step_002.png ``` Never overwrite a previous screenshot path. Annotated screenshots (`--annotate`) are fine for visual reasoning, but still write to a new numbered filename. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. 
- Do not spawn or kill any browser processes; the remote Chrome is managed by the eval harness. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: diff --git a/frameworks/claude_code_harness_bu_cli/run_task.py b/frameworks/claude_code_harness_bu_cli/run_task.py index 1f8f164..dc0adad 100644 --- a/frameworks/claude_code_harness_bu_cli/run_task.py +++ b/frameworks/claude_code_harness_bu_cli/run_task.py @@ -116,11 +116,17 @@ def _start_browser(browser_name: str) -> tuple[str, str]: """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" if browser_name != "browser-use-cloud": raise ValueError(f"Unsupported browser for claude-code-harness-bu-cli: {browser_name}") + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -260,17 +266,11 @@ async def _drain_stderr(proc: asyncio.subprocess.Process, buf: list[str]) -> Non print(f"[claude-stderr] {s}", flush=True) -async def _close_browser_use_sessions() -> None: - """Best-effort: tell browser-use to shut down all daemons. - - The browser-use CLI spawns a per-session background daemon (one per - `--session` name; default is "default"). `close --all` quits every - active session so a leaked daemon does not survive across tasks on the - same runner. 
- """ +async def _close_browser_use_session(session_name: str) -> None: + """Best-effort: tell browser-use to shut down this task's daemon.""" try: stop_proc = await asyncio.create_subprocess_exec( - "browser-use", "close", "--all", + "browser-use", "--session", session_name, "close", stdout=asyncio.subprocess.DEVNULL, stderr=asyncio.subprocess.DEVNULL, ) @@ -291,12 +291,15 @@ async def execute(task_description: str) -> ExecutionResult: _reset_dir(WORK_DIR) # Pre-provision a remote browser; pass its WS URL to the agent via env. - # The agent runs: browser-use --cdp-url "$BU_CDP_WS" open + # The agent runs: + # browser-use --session "$BU_SESSION" --cdp-url "$BU_CDP_WS" open browser_id, cdp_ws = _start_browser(browser_name) + session_name = f"eval-{os.environ.get('TASK_INDEX', '0')}-{os.getpid()}" env = { **os.environ, "BU_CDP_WS": cdp_ws, + "BU_SESSION": session_name, "DISABLE_TELEMETRY": "1", "DISABLE_AUTOUPDATER": "1", "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1", @@ -320,16 +323,20 @@ async def execute(task_description: str) -> ExecutionResult: result_errors: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=str(WORK_DIR), - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + await _close_browser_use_session(session_name) + _stop_browser(browser_id) + raise async def _iter_stdout_lines(): assert proc.stdout is not None @@ -400,9 +407,7 @@ async def _iter_stdout_lines(): pass if not stderr_task.done(): stderr_task.cancel() - # Best-effort: close any browser-use daemon(s) the agent left running - # so they 
don't leak across tasks on the same runner. - await _close_browser_use_sessions() + await _close_browser_use_session(session_name) _stop_browser(browser_id) duration = time.time() - start diff --git a/frameworks/claude_code_harness_bu_cli/system_prompt.md b/frameworks/claude_code_harness_bu_cli/system_prompt.md index fa9789d..48c3887 100644 --- a/frameworks/claude_code_harness_bu_cli/system_prompt.md +++ b/frameworks/claude_code_harness_bu_cli/system_prompt.md @@ -2,21 +2,21 @@ Hard rules: - Use the `browser-use` CLI for every browser interaction. It is on your PATH (aliases: `bu`, `browser`, `browseruse` all work). Do NOT install other browser tools, do NOT use Playwright/Puppeteer directly, do NOT call any built-in WebFetch -- drive the live browser via `browser-use` only. -- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` from your environment and running: +- A live remote browser is already attached. Connect to it once, at the start, by reading `BU_CDP_WS` and `BU_SESSION` from your environment and running: ``` - browser-use --cdp-url "$BU_CDP_WS" open + browser-use --session "$BU_SESSION" --cdp-url "$BU_CDP_WS" open ``` - All subsequent `browser-use <verb>` calls automatically reuse the running daemon over the same CDP attachment -- you do NOT need to pass `--cdp-url` again, and you should NOT call `browser-use open` a second time without a URL. Just issue the next verb (`state`, `click 5`, `input 3 "text"`, `screenshot`, etc.). + All subsequent `browser-use <verb>` calls must include `--session "$BU_SESSION"` and automatically reuse the running daemon over the same CDP attachment -- you do NOT need to pass `--cdp-url` again, and you should NOT call `browser-use open` a second time without a URL. Just issue the next verb (`--session "$BU_SESSION" state`, `--session "$BU_SESSION" click 5`, `--session "$BU_SESSION" input 3 "text"`, `--session "$BU_SESSION" screenshot`, etc.).
- Before issuing your first interaction command, read the bundled SKILL.md so you know the full command surface, common workflows, and troubleshooting tips. It is at `~/.claude/skills/browser-use/SKILL.md`. If you have a Read tool, read that file. Otherwise: `cat ~/.claude/skills/browser-use/SKILL.md`. -- Standard workflow per the SKILL: (1) `browser-use --cdp-url "$BU_CDP_WS" open <url>` to attach + navigate, (2) `browser-use state` to see clickable elements with indices, (3) `browser-use click <index>` / `browser-use input <index> "text"` to interact, (4) `browser-use state` or `browser-use screenshot` to verify, (5) repeat. +- Standard workflow per the SKILL: (1) `browser-use --session "$BU_SESSION" --cdp-url "$BU_CDP_WS" open <url>` to attach + navigate, (2) `browser-use --session "$BU_SESSION" state` to see clickable elements with indices, (3) `browser-use --session "$BU_SESSION" click <index>` / `browser-use --session "$BU_SESSION" input <index> "text"` to interact, (4) `browser-use --session "$BU_SESSION" state` or `browser-use --session "$BU_SESSION" screenshot` to verify, (5) repeat. - Save every screenshot to `/tmp/shots/step_<N>.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot. Pass an explicit path to `browser-use screenshot`: ``` - browser-use screenshot /tmp/shots/step_001.png - browser-use screenshot /tmp/shots/step_002.png + browser-use --session "$BU_SESSION" screenshot /tmp/shots/step_001.png + browser-use --session "$BU_SESSION" screenshot /tmp/shots/step_002.png ``` Never overwrite a previous screenshot path. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. - Do not spawn or kill any browser processes; the remote Chrome is managed by the eval harness.
Do not call `browser-use cloud connect` or `browser-use connect` -- the browser is already provisioned and attached via `--cdp-url`. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: diff --git a/frameworks/claude_code_harness_js/run_task.py b/frameworks/claude_code_harness_js/run_task.py index 8141989..7d08911 100644 --- a/frameworks/claude_code_harness_js/run_task.py +++ b/frameworks/claude_code_harness_js/run_task.py @@ -114,11 +114,17 @@ def _start_browser(browser_name: str) -> tuple[str, str]: """Allocate a browser-use-cloud session. Returns (browser_id, cdp_ws).""" if browser_name != "browser-use-cloud": raise ValueError(f"Unsupported browser for claude-code-harness-js: {browser_name}") + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -297,16 +303,19 @@ async def execute(task_description: str) -> ExecutionResult: result_errors: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=str(WORK_DIR), - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=str(WORK_DIR), + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + _stop_browser(browser_id) + raise 
async def _iter_stdout_lines(): assert proc.stdout is not None diff --git a/frameworks/claude_code_harness_js/system_prompt.md b/frameworks/claude_code_harness_js/system_prompt.md index b5d375d..4eaca11 100644 --- a/frameworks/claude_code_harness_js/system_prompt.md +++ b/frameworks/claude_code_harness_js/system_prompt.md @@ -13,7 +13,7 @@ Hard rules: JS ``` - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: FINAL ANSWER: diff --git a/frameworks/claude_cua/run_task.py b/frameworks/claude_cua/run_task.py index 737d231..43461ab 100644 --- a/frameworks/claude_cua/run_task.py +++ b/frameworks/claude_cua/run_task.py @@ -13,7 +13,6 @@ import asyncio import os import sys -import time from pathlib import Path # Add project root to path for sibling imports @@ -45,8 +44,6 @@ async def execute(task_description: str) -> ExecutionResult: 4. Loop: parse tool_use blocks, execute actions, screenshot, send tool_result 5. Collect all steps and final text response """ - start = time.time() - # import anthropic # client = anthropic.Anthropic() # @@ -63,15 +60,9 @@ async def execute(task_description: str) -> ExecutionResult: # ) # ... execute actions, collect screenshots, break on end_turn ... - duration = time.time() - start - - return ExecutionResult( - final_result="NOT IMPLEMENTED", - steps=[], - screenshots_b64=[], - num_steps=0, - duration_seconds=duration, - cost=0, + raise NotImplementedError( + "claude-cua is not implemented in this public verifier. " + "Do not enable this adapter until it returns real task traces." 
) diff --git a/frameworks/codex_harness/run_task.py b/frameworks/codex_harness/run_task.py index 5838e3c..4932681 100644 --- a/frameworks/codex_harness/run_task.py +++ b/frameworks/codex_harness/run_task.py @@ -263,30 +263,34 @@ async def execute(task_description: str) -> ExecutionResult: # Pre-provision the browser so Codex starts with a live CDP attach. _start_browser(browser_name, bu_name) - # Codex CLI auth: `codex exec` reuses saved auth (~/.codex/auth.json) by - # default but accepts `CODEX_API_KEY` env explicitly (the only auth env - # supported by `codex exec` per docs). `OPENAI_API_KEY` alone is NOT read - # by codex (it's for the OpenAI Python SDK). We mirror the workflow's - # OPENAI_API_KEY into CODEX_API_KEY here so the same repo secret unlocks - # both bcode (which uses OPENAI_API_KEY directly) and codex-harness. - # - # PATH: `uv pip install /tmp/browser-harness` puts the `browser-harness` - # console_script at /tmp/browser-harness/.venv/bin/browser-harness, but - # codex subprocess doesn't inherit the `uv run` PATH boost. Smoke #4 - # showed the agent self-recovered by prepending the venv dir, but that - # cost ~4 steps. Prepend explicitly so the bare `browser-harness` heredoc - # in our system prompt + SKILL.md works on the first try. - harness_venv_bin = f"{HARNESS_DIR}/.venv/bin" - existing_path = os.environ.get("PATH", "") - env = { - **os.environ, - "BU_NAME": bu_name, - "CODEX_API_KEY": os.environ.get("CODEX_API_KEY") or os.environ.get("OPENAI_API_KEY", ""), - "PATH": f"{harness_venv_bin}:{existing_path}" if existing_path else harness_venv_bin, - } - - cmd = _build_codex_cmd(model_name, sandbox) - prompt = _compose_prompt(task_description) + try: + # Codex CLI auth: `codex exec` reuses saved auth (~/.codex/auth.json) by + # default but accepts `CODEX_API_KEY` env explicitly (the only auth env + # supported by `codex exec` per docs). `OPENAI_API_KEY` alone is NOT read + # by codex (it's for the OpenAI Python SDK). 
We mirror the workflow's + # OPENAI_API_KEY into CODEX_API_KEY here so the same repo secret unlocks + # both bcode (which uses OPENAI_API_KEY directly) and codex-harness. + # + # PATH: `uv pip install /tmp/browser-harness` puts the `browser-harness` + # console_script at /tmp/browser-harness/.venv/bin/browser-harness, but + # codex subprocess doesn't inherit the `uv run` PATH boost. Smoke #4 + # showed the agent self-recovered by prepending the venv dir, but that + # cost ~4 steps. Prepend explicitly so the bare `browser-harness` heredoc + # in our system prompt + SKILL.md works on the first try. + harness_venv_bin = f"{HARNESS_DIR}/.venv/bin" + existing_path = os.environ.get("PATH", "") + env = { + **os.environ, + "BU_NAME": bu_name, + "CODEX_API_KEY": os.environ.get("CODEX_API_KEY") or os.environ.get("OPENAI_API_KEY", ""), + "PATH": f"{harness_venv_bin}:{existing_path}" if existing_path else harness_venv_bin, + } + + cmd = _build_codex_cmd(model_name, sandbox) + prompt = _compose_prompt(task_description) + except Exception: + _stop_browser(browser_name, bu_name) + raise start = time.time() steps: list[str] = [] @@ -299,23 +303,34 @@ async def execute(task_description: str) -> ExecutionResult: error_events: list[str] = [] stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=HARNESS_DIR, - env=env, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, # 256 MiB safety cap on line buffer - ) + proc: asyncio.subprocess.Process | None = None + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=HARNESS_DIR, + env=env, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, # 256 MiB safety cap on line buffer + ) - # Pipe the prompt in on stdin and close. 
- assert proc.stdin is not None - proc.stdin.write(prompt.encode("utf-8")) - await proc.stdin.drain() - proc.stdin.close() + # Pipe the prompt in on stdin and close. + assert proc.stdin is not None + proc.stdin.write(prompt.encode("utf-8")) + await proc.stdin.drain() + proc.stdin.close() - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + if proc is not None and proc.returncode is None: + proc.kill() + try: + await asyncio.wait_for(proc.wait(), timeout=10) + except asyncio.TimeoutError: + pass + _stop_browser(browser_name, bu_name) + raise async def _iter_stdout_lines(): """Yield one JSONL line at a time. Codex item.completed payloads for diff --git a/frameworks/codex_harness/system_prompt.md b/frameworks/codex_harness/system_prompt.md index 7908e81..9807d51 100644 --- a/frameworks/codex_harness/system_prompt.md +++ b/frameworks/codex_harness/system_prompt.md @@ -6,7 +6,7 @@ Hard rules: - A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. - Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. - Work fully autonomously. Do not stop early to summarize partial progress -- keep driving the browser until the task is genuinely complete (or you have hit a dead end). 
When you reach an answer, deliver it in the format below and exit. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: diff --git a/frameworks/pi_harness/system_prompt.md b/frameworks/pi_harness/system_prompt.md index d1566de..5520ae4 100644 --- a/frameworks/pi_harness/system_prompt.md +++ b/frameworks/pi_harness/system_prompt.md @@ -5,7 +5,7 @@ Hard rules: - A browser daemon is already running under the `BU_NAME` in the environment and is attached to a live browser. Do not start, stop, or restart daemons. Do not call `start_remote_daemon` or `stop_remote_daemon`. Do not run `pkill`. - Save every screenshot to `/tmp/shots/step_.png` where N is a zero-padded 3-digit integer starting at 001 and incrementing on each shot (e.g. `screenshot("/tmp/shots/step_001.png")`). Never overwrite a previous screenshot path. - Do not ask the user clarifying questions. If the task is ambiguous, pick the most reasonable interpretation and proceed. -- Do not edit files outside the current working directory. +- Do not edit files outside the current working directory, except for the required screenshots under /tmp/shots. - When the task is complete, end your final assistant message with exactly one line in this format and nothing after it: FINAL ANSWER: diff --git a/frameworks/pibt/run_task.py b/frameworks/pibt/run_task.py index d8fd61b..71ede68 100644 --- a/frameworks/pibt/run_task.py +++ b/frameworks/pibt/run_task.py @@ -96,11 +96,17 @@ def _bu(path: str, method: str, body: dict | None = None) -> dict: def _start_browser() -> tuple[str, str]: """Allocate a browser-use-cloud session. 
Returns (browser_id, cdp_ws).""" + browser_id = None info = _bu("/browsers", "POST", {}) - cdp_ws = json.loads( - urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() - )["webSocketDebuggerUrl"] - return info["id"], cdp_ws + browser_id = info["id"] + try: + cdp_ws = json.loads( + urllib.request.urlopen(f"{info['cdpUrl']}/json/version", timeout=15).read() + )["webSocketDebuggerUrl"] + return browser_id, cdp_ws + except Exception: + _stop_browser(browser_id) + raise def _stop_browser(browser_id: str | None) -> None: @@ -283,8 +289,12 @@ async def execute(task_description: str) -> ExecutionResult: "PI_TELEMETRY": "0", } - system_prompt = SYSTEM_PROMPT_FILE.read_text() - cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + try: + system_prompt = SYSTEM_PROMPT_FILE.read_text() + cmd = _build_pi_cmd(task_description, model_name, thinking, system_prompt) + except Exception: + _stop_browser(browser_id) + raise start = time.time() steps: list[str] = [] @@ -293,16 +303,19 @@ async def execute(task_description: str) -> ExecutionResult: saw_agent_end = False stderr_buf: list[str] = [] - proc = await asyncio.create_subprocess_exec( - *cmd, - cwd=EXTENSIONS_DIR, # pi loads the package.json `pi.extensions` from CWD - env=env, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - limit=256 * 1024 * 1024, - ) - - stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + try: + proc = await asyncio.create_subprocess_exec( + *cmd, + cwd=EXTENSIONS_DIR, # pi loads the package.json `pi.extensions` from CWD + env=env, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + limit=256 * 1024 * 1024, + ) + stderr_task = asyncio.create_task(_drain_stderr(proc, stderr_buf)) + except Exception: + _stop_browser(browser_id) + raise async def _iter_stdout_lines(): assert proc.stdout is not None diff --git a/frameworks/stagehand/executor.js b/frameworks/stagehand/executor.js index 61eca97..c4fdef8 100644 --- 
a/frameworks/stagehand/executor.js +++ b/frameworks/stagehand/executor.js @@ -27,8 +27,6 @@ async function main() { process.exit(1); } - const startTime = Date.now(); - // TODO: Initialize Stagehand with appropriate env (BROWSERBASE or LOCAL) // const stagehand = new Stagehand({ // env: browser === "browserbase" ? "BROWSERBASE" : "LOCAL", @@ -43,20 +41,10 @@ async function main() { // // await stagehand.close(); - const durationSeconds = (Date.now() - startTime) / 1000; - - // TODO: Map Stagehand result to standard format - const output = { - final_result: "NOT IMPLEMENTED", - steps: [], - screenshots_b64: [], - num_steps: 0, - duration_seconds: durationSeconds, - cost: 0, - }; - - // Print JSON to stdout for the Python wrapper to parse - console.log(JSON.stringify(output)); + throw new Error( + `Stagehand executor is not implemented for browser=${browser}. ` + + "Use browserbase-agent for Stagehand SDK reverification or implement frameworks/stagehand/executor.js before enabling this adapter." + ); } main().catch((err) => { diff --git a/run_framework_eval.py b/run_framework_eval.py index 8114b4e..f80f85a 100644 --- a/run_framework_eval.py +++ b/run_framework_eval.py @@ -190,6 +190,8 @@ async def _run_all(args: argparse.Namespace) -> list[dict]: if framework_info.repo: print(f"Framework repo: {framework_info.repo} ref={args.framework_ref}") + if args.parallel < 1: + raise SystemExit("--parallel must be >= 1") semaphore = asyncio.Semaphore(args.parallel) async def guarded(i: int) -> dict: