Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
BROWSER_USE_API_KEY=
GOOGLE_API_KEY=
OPENAI_API_KEY=
ANTHROPIC_API_KEY=

# Optional model providers used by some framework/model combinations
GOOGLE_GENERATIVE_AI_API_KEY=
FIREWORKS_API_KEY=
OPENROUTER_API_KEY=
DEEPSEEK_API_KEY=
MOONSHOT_API_KEY=
DASHSCOPE_API_KEY=

# Optional: only needed if using --browser with the corresponding provider
ANCHORBROWSER_API_KEY=
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,7 @@ __pycache__/
.env
run_data/
DESIGN.MD
results/
results/
BU_Bench_V1.json
Stealth_Bench_V1.json
benchmarks/*.json
18 changes: 18 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,24 @@ uv run python run_eval.py

Results are saved to `results/` and detailed traces to `run_data/`.

### Re-verifying Framework Results

Use `run_framework_eval.py` to rerun BU_Bench_V1 through a framework adapter.
It decrypts `BU_Bench_V1.enc` in memory and writes local outputs to ignored
`results/` and `run_data/`.

```bash
uv run python run_framework_eval.py --list-frameworks
uv run python run_framework_eval.py --framework browser-use --browser browser-use-cloud --model bu-2-0
```

See the comment at the top of `run_framework_eval.py` for framework-specific
setup, options, and examples.

Important: `run_data/` traces include decrypted task text, ground truth, model
outputs, and screenshots. They are gitignored for local verification only. Do
not publish or commit them.

### Swapping Models

Edit `run_eval.py` to change the model:
Expand Down
57 changes: 29 additions & 28 deletions browsers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
"""Browser provider registry.
"""Browser provider registry."""

Each provider module exports:
async def connect() -> str -- returns a CDP WebSocket URL
async def disconnect() -> None -- cleans up the session

Usage:
from browsers import get_provider
provider = get_provider("anchor")
cdp_url = await provider.connect()
...
await provider.disconnect()
"""

import asyncio
import importlib

import httpx
from browsers.util import retry_on_429
from browsers import (
anchor,
browser_use_cloud,
browserbase,
browserless,
driver,
hyperbrowser,
local_headful,
local_headless,
onkernel,
rebrowser,
steel,
)

PROVIDERS = [
"anchor",
Expand All @@ -30,22 +30,23 @@ async def disconnect() -> None -- cleans up the session
"steel",
]

# Maps each browser provider name to the provider module imported at the top
# of this package, so a lookup by name needs no further import.
# NOTE(review): "browser-use-cloud" is hyphenated while the other multi-word
# keys ("local_headful", "local_headless") use underscores -- confirm this is
# intentional and matches the names callers actually pass in.
BROWSERS = {
"anchor": anchor,
"browser-use-cloud": browser_use_cloud,
"browserbase": browserbase,
"browserless": browserless,
"driver": driver,
"hyperbrowser": hyperbrowser,
"local_headful": local_headful,
"local_headless": local_headless,
"onkernel": onkernel,
"rebrowser": rebrowser,
"steel": steel,
}


def get_provider(name: str):
    """Return the ``browsers.<name>`` provider module for a registered name.

    Args:
        name: Provider identifier; must be one of the entries in ``PROVIDERS``.

    Returns:
        The module object produced by importing ``browsers.<name>``.

    Raises:
        ValueError: If ``name`` is not listed in ``PROVIDERS``.
    """
    if name in PROVIDERS:
        module_path = f"browsers.{name}"
        return importlib.import_module(module_path)
    raise ValueError(f"Unknown browser provider: {name}. Available: {PROVIDERS}")


async def retry_on_429(fn, max_retries=10, max_wait=30):
    """Call ``fn()``, retrying with capped exponential backoff on HTTP 429.

    Args:
        fn: Zero-argument callable returning an awaitable. It is invoked
            afresh for every attempt.
        max_retries: Maximum number of retries after the first attempt.
            Negative values are treated as 0 so that ``fn`` is always
            attempted at least once instead of silently returning ``None``.
        max_wait: Upper bound, in seconds, on the delay between attempts.

    Returns:
        Whatever ``await fn()`` returns on the first successful attempt.

    Raises:
        httpx.HTTPStatusError: Immediately for any status other than 429, or
            for a 429 once all retries have been used.
    """
    # Clamp so range() below is never empty; an empty range would skip fn().
    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            return await fn()
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429 or attempt == retries:
                raise
            # Delay doubles each attempt (1s, 2s, 4s, ...) up to max_wait.
            wait = min(2**attempt, max_wait)
            print(f"[429] Rate limited, retry {attempt + 1}/{retries} in {wait}s")
            await asyncio.sleep(wait)
30 changes: 20 additions & 10 deletions browsers/browser_use_cloud.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
"""browser-use Cloud -- https://browser-use.com

Requires: BROWSER_USE_API_KEY env var.
"""
"""browser-use cloud browser provider."""

import os

import httpx

from browsers import retry_on_429
from browsers.util import retry_on_429

MAX_CONCURRENT = 200

_session_id: str | None = None


def _api_base() -> str:
    """Return the versioned root URL of the browser-use cloud API.

    Reads ``BU_CLOUD_API_BASE`` (default ``https://api.browser-use.com``) and
    ``BU_CLOUD_API_VERSION`` (default ``v2``) from the environment on every
    call. A variable that is set but empty is treated the same as unset, so a
    blank ``NAME=`` entry in a ``.env`` file cannot produce a host-less URL.

    Returns:
        ``"{base}/api/{version}"`` with trailing slashes removed from the base.
    """
    # `or` (rather than a .get() default) also covers the empty-string case.
    base = os.environ.get("BU_CLOUD_API_BASE") or "https://api.browser-use.com"
    version = os.environ.get("BU_CLOUD_API_VERSION") or "v2"
    return f"{base.rstrip('/')}/api/{version}"


def _api_key() -> str:
    """Return the API key used for browser-use cloud requests.

    ``BU_CLOUD_API_KEY`` takes precedence; ``BROWSER_USE_API_KEY`` is the
    fallback. A variable that is set but empty counts as unset.

    Returns:
        The first non-empty key found.

    Raises:
        KeyError: If neither variable holds a non-empty value. The message
            names both variables so the missing configuration is obvious.
    """
    key = os.environ.get("BU_CLOUD_API_KEY") or os.environ.get("BROWSER_USE_API_KEY")
    if not key:
        # Still a KeyError (as a failed os.environ[...] lookup would be), but
        # with a message that mentions both accepted variables.
        raise KeyError(
            "No browser-use API key configured: set BU_CLOUD_API_KEY or BROWSER_USE_API_KEY"
        )
    return key


async def connect() -> str:
global _session_id

async def _create():
async with httpx.AsyncClient() as client:
resp = await client.post(
"https://api.browser-use.com/api/v2/browsers",
headers={"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]},
f"{_api_base()}/browsers",
headers={"X-Browser-Use-API-Key": _api_key()},
json={},
timeout=90,
)
Expand All @@ -36,10 +45,11 @@ async def disconnect() -> None:
if not _session_id:
return
async with httpx.AsyncClient() as client:
await client.patch(
f"https://api.browser-use.com/api/v2/browsers/{_session_id}",
headers={"X-Browser-Use-API-Key": os.environ["BROWSER_USE_API_KEY"]},
resp = await client.patch(
f"{_api_base()}/browsers/{_session_id}",
headers={"X-Browser-Use-API-Key": _api_key()},
json={"action": "stop"},
timeout=30,
)
resp.raise_for_status()
_session_id = None
16 changes: 16 additions & 0 deletions browsers/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import asyncio

import httpx


async def retry_on_429(fn, max_retries=10, max_wait=30):
    """Call ``fn()``, retrying with capped exponential backoff on HTTP 429.

    Args:
        fn: Zero-argument callable returning an awaitable. It is invoked
            afresh for every attempt.
        max_retries: Maximum number of retries after the first attempt.
            Negative values are treated as 0 so that ``fn`` is always
            attempted at least once instead of silently returning ``None``.
        max_wait: Upper bound, in seconds, on the delay between attempts.

    Returns:
        Whatever ``await fn()`` returns on the first successful attempt.

    Raises:
        httpx.HTTPStatusError: Immediately for any status other than 429, or
            for a 429 once all retries have been used.
    """
    # Clamp so range() below is never empty; an empty range would skip fn().
    retries = max(max_retries, 0)
    for attempt in range(retries + 1):
        try:
            return await fn()
        except httpx.HTTPStatusError as e:
            if e.response.status_code != 429 or attempt == retries:
                raise
            # Delay doubles each attempt (1s, 2s, 4s, ...) up to max_wait.
            wait = min(2**attempt, max_wait)
            print(f"[429] Rate limited, retry {attempt + 1}/{retries} in {wait}s")
            await asyncio.sleep(wait)
Loading