Skip to content

Commit 0092494

Browse files
author
Shrey Modi
committed
openenvrolloutprocessor
1 parent 467fffb commit 0092494

File tree

3 files changed

+480
-0
lines changed

3 files changed

+480
-0
lines changed
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
import asyncio
2+
import os
3+
import shutil
4+
from typing import Any, Dict, List
5+
6+
import pytest
7+
8+
from eval_protocol.models import EvaluationRow, Message
9+
from eval_protocol.pytest.types import RolloutProcessorConfig
10+
from eval_protocol.pytest.openenv_rollout_processor import OpenEnvRolloutProcessor
11+
12+
# Skip these integration-heavy tests on CI runners by default
# (CI runners set CI=true; local runs with Docker still exercise them).
pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
14+
15+
16+
@pytest.mark.integration
def test_openenv_browsergym_basic():
    """
    Very basic integration test to ensure OpenEnv + BrowserGym can run a single-step rollout.

    Skips automatically if Docker or the OpenEnv BrowserGym bindings are not available.
    """
    if shutil.which("docker") is None:
        pytest.skip("Docker not available on PATH; skipping OpenEnv BrowserGym basic test.")

    # Import lazily and skip (rather than error with ImportError) when OpenEnv is
    # not installed, matching the graceful-skip behavior of the other OpenEnv tests.
    try:
        from envs.browsergym_env import BrowserGymAction  # type: ignore
    except Exception:
        pytest.skip("OpenEnv (envs.browsergym_env) is not installed; skipping BrowserGym basic test.")

    # Build a minimal EvaluationRow (messages can be empty; processor will add user prompts)
    rows: List[EvaluationRow] = [EvaluationRow(messages=[Message(role="user", content="start")])]

    # Use tasks that are known to exist; requires MiniWoB server reachable from containers.
    tasks = ["click-test"]
    miniwob_url = os.getenv("MINIWOB_URL", "http://172.17.0.1:8888/miniwob/")

    # Construct the processor with a trivial action_parser; the model output will still be generated
    # but we parse to a safe noop action to minimize flakiness for the environment step.
    processor = OpenEnvRolloutProcessor(
        env_factory=None,
        prompt_builder=lambda obs, step, history: "Do nothing",
        action_parser=lambda text: BrowserGymAction(action_str="noop()"),
        tasks=tasks,
        miniwob_url=miniwob_url,
        docker_image="browsergym-env:latest",
        benchmark="miniwob",
        timeout_ms=10000,
        num_generations=1,
    )

    # Completion params: rely on an available provider/model in the environment
    completion_params: Dict[str, Any] = {
        "model": os.getenv(
            "OPENENV_TEST_MODEL",
            # Default to a Fireworks public model id used elsewhere in tests; requires FIREWORKS_API_KEY
            "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
        ),
        "temperature": 0.0,
        "max_tokens": 16,
    }

    # Limit to a single step to keep the test fast and robust
    config = RolloutProcessorConfig(
        completion_params=completion_params,
        semaphore=asyncio.Semaphore(1),
        steps=1,
    )

    async def _run_all():
        tasks_ = processor(rows, config)
        return await asyncio.gather(*tasks_)

    # asyncio.run owns loop creation and teardown, replacing the manual
    # new_event_loop / set_event_loop / close sequence.
    completed_rows = asyncio.run(_run_all())

    assert len(completed_rows) == 1
    # Basic sanity checks that a rollout happened and usage is populated
    row = completed_rows[0]
    assert row is not None
    assert row.execution_metadata is not None
    assert getattr(row.execution_metadata, "duration_seconds", 0.0) >= 0.0
83+
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
from typing import Any, Dict, List
2+
import os
3+
import re
4+
5+
import pytest
6+
from eval_protocol.models import EvaluationRow, Message, EvaluateResult
7+
from eval_protocol.pytest import evaluation_test
8+
from eval_protocol.pytest.openenv_rollout_processor import OpenEnvRolloutProcessor
9+
import pytest
10+
11+
# Skip these integration-heavy tests on CI runners by default
# (CI runners set CI=true; local runs with Docker still exercise them).
pytestmark = pytest.mark.skipif(os.getenv("CI") == "true", reason="Skip OpenEnv integration tests on CI")
13+
14+
15+
def openenv_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Adapter: convert simple {"id": "...", "prompt": "..."} rows into EvaluationRows.

    Each input dict contributes one EvaluationRow holding a single user message;
    a missing "prompt" key falls back to the literal "start".
    """
    return [
        EvaluationRow(messages=[Message(role="user", content=str(item.get("prompt", "start")))])
        for item in data
    ]
24+
25+
26+
# ---- prompt_builder and action_parser modeled after browsergym_grpo_evalp.py ----

# Matches a function-call-shaped action string such as click('13') or noop();
# DOTALL lets the argument list span multiple lines of model output.
ACTION_PATTERN = re.compile(r"[A-Za-z_]+\s*\(.*\)", re.DOTALL)
29+
30+
31+
def _as_scalar(x: Any) -> Any:
32+
try:
33+
return x.item()
34+
except Exception:
35+
return x
36+
37+
38+
def _extract_goal_url_title(observation: Any) -> tuple[str, str, str]:
    """Pull (goal, url, active-page title) from an observation.

    Direct attributes win; the raw ``browsergym_obs`` dict inside
    ``observation.metadata`` is the fallback. The title comes from
    ``open_pages_titles`` indexed by ``active_page_index`` (0 when unparsable).
    """
    obs_dict = (getattr(observation, "metadata", {}) or {}).get("browsergym_obs", {}) or {}
    goal = (getattr(observation, "goal", "") or "") or (obs_dict.get("goal") or "")
    url = (getattr(observation, "url", "") or "") or (obs_dict.get("url") or "")

    titles = obs_dict.get("open_pages_titles") or ()
    try:
        page_idx = int(_as_scalar(obs_dict.get("active_page_index")))
    except Exception:
        page_idx = 0

    title = ""
    if isinstance(titles, (list, tuple)) and 0 <= page_idx < len(titles):
        title = titles[page_idx] or ""
    return goal, url, title
57+
58+
59+
def _extract_clickable_elements_lines(observation: Any) -> List[str]:
60+
metadata = getattr(observation, "metadata", {}) or {}
61+
obs_dict = metadata.get("browsergym_obs", {}) or {}
62+
extra_props = obs_dict.get("extra_element_properties", {}) or {}
63+
axtree_object = obs_dict.get("axtree_object") or {}
64+
focused_bid = obs_dict.get("focused_element_bid")
65+
bid_to_desc: Dict[str, tuple[str, str]] = {}
66+
try:
67+
nodes = axtree_object.get("nodes") or []
68+
for node in nodes:
69+
bid = node.get("browsergym_id")
70+
if bid is None:
71+
continue
72+
role = ""
73+
name = ""
74+
rf = node.get("role") or {}
75+
if isinstance(rf, dict):
76+
role = str(rf.get("value", "")).strip()
77+
nf = node.get("name") or {}
78+
if isinstance(nf, dict):
79+
name = str(nf.get("value", "")).strip()
80+
bid_to_desc[str(bid)] = (role, name)
81+
except Exception:
82+
pass
83+
lines: List[str] = []
84+
for bid in sorted(extra_props.keys(), key=lambda x: str(x)):
85+
props = extra_props[bid] or {}
86+
if not props.get("clickable"):
87+
continue
88+
bbox = props.get("bbox") or []
89+
bbox_str = ", ".join(str(v) for v in bbox) if bbox else "?"
90+
role, name = bid_to_desc.get(str(bid), ("", ""))
91+
focus_tag = " [FOCUSED]" if (str(bid) == str(focused_bid)) else ""
92+
rn = (role or "-")
93+
if name:
94+
rn = f"{rn} | {name}"
95+
vis = props.get("visibility")
96+
vis_str = f"{vis:.2f}" if isinstance(vis, (int, float)) else str(vis) if vis is not None else "?"
97+
lines.append(f"- BID {bid}{focus_tag}: {rn} | bbox({bbox_str}) | visibility={vis_str}")
98+
return lines
99+
100+
101+
def _rank_clickables_lines(observation: Any, goal: str, top_n: int = 8) -> tuple[List[str], str | None]:
    """Score clickable elements against the goal and return ranked display lines.

    Scoring (additive): +2.0 when goal and element name substring-match either
    way, +1.0 for role "button", +0.5 for the focused element, plus visibility
    clamped to [0, 1]. Returns up to ``top_n`` formatted lines (best first) and
    the bid of the top-scoring element, or None when nothing is clickable.
    """
    metadata = getattr(observation, "metadata", {}) or {}
    obs_dict = metadata.get("browsergym_obs", {}) or {}
    goal_lc = (goal or "").lower().strip()
    extra_props = obs_dict.get("extra_element_properties", {}) or {}
    axtree_object = obs_dict.get("axtree_object") or {}
    focused_bid = str(obs_dict.get("focused_element_bid") or "")
    # bid -> (role, name) harvested from the accessibility tree; best-effort.
    bid_to_desc: Dict[str, tuple[str, str]] = {}
    try:
        nodes = axtree_object.get("nodes") or []
        for node in nodes:
            bid = node.get("browsergym_id")
            if bid is None:
                continue
            role = ""
            name = ""
            rf = node.get("role") or {}
            if isinstance(rf, dict):
                role = str(rf.get("value", "")).strip()
            nf = node.get("name") or {}
            if isinstance(nf, dict):
                name = str(nf.get("value", "")).strip()
            bid_to_desc[str(bid)] = (role, name)
    except Exception:
        pass
    # (score, bid, "role | name", bbox string, visibility string) per clickable.
    scored: List[tuple[float, str, str, str, str]] = []
    for bid_key in sorted(extra_props.keys(), key=lambda x: str(x)):
        props = extra_props[bid_key] or {}
        if not props.get("clickable"):
            continue
        role, name = bid_to_desc.get(str(bid_key), ("", ""))
        name_lc = (name or "").lower()
        score = 0.0
        # Strong signal: goal text and element name overlap in either direction.
        if goal_lc and name_lc and (goal_lc in name_lc or name_lc in goal_lc):
            score += 2.0
        if (role or "").lower() == "button":
            score += 1.0
        if str(bid_key) == focused_bid:
            score += 0.5
        vis = props.get("visibility")
        try:
            vis_f = float(vis)
            score += max(0.0, min(1.0, vis_f))
        except Exception:
            pass
        bbox = props.get("bbox") or []
        bbox_str = ", ".join(str(v) for v in bbox) if bbox else "?"
        rn = (role or "-")
        if name:
            rn = f"{rn} | {name}"
        vis_str = f"{vis:.2f}" if isinstance(vis, (int, float)) else str(vis) if vis is not None else "?"
        scored.append((score, str(bid_key), rn, bbox_str, vis_str))
    # Stable sort: ties keep the bid-sorted insertion order.
    scored.sort(key=lambda t: t[0], reverse=True)
    lines: List[str] = []
    recommended = scored[0][1] if scored else None
    for idx, (score, bid, rn, bbox_str, vis_str) in enumerate(scored[:top_n], start=1):
        lines.append(f"{idx}. BID {bid}: score={score:.2f} | {rn} | bbox({bbox_str}) | visibility={vis_str}")
    return lines, recommended
159+
160+
161+
def prompt_builder(observation: Any, step: int, history: List[str]) -> str:
    """Compose the per-step LLM prompt from a BrowserGym observation.

    Includes goal/url/title, the last four history entries, the clickable
    element listing, ranked candidates with a recommended BID, fixed
    instructions, and a 2048-char page-text excerpt.
    """
    goal, url, title = _extract_goal_url_title(observation)
    if not url:
        url = "(unknown)"
    error_note = "Yes" if getattr(observation, "last_action_error", False) else "No"
    clickables_block = "\n".join(_extract_clickable_elements_lines(observation)) or "(none detected)"
    ranked_lines, recommended = _rank_clickables_lines(observation, goal, top_n=10)
    ranked_block = "\n".join(ranked_lines) or "(none)"
    excerpt = (getattr(observation, "text", "") or "")[:2048]
    obs_dict = (getattr(observation, "metadata", {}) or {}).get("browsergym_obs", {}) or {}
    focused_bid = obs_dict.get("focused_element_bid") or ""
    last_action = obs_dict.get("last_action") or ""
    history_block = "\n".join(history[-4:]) if history else "None"

    segments = [
        f"Step: {step}\n",
        f"Goal: {goal}\n",
        f"Current URL: {url}\n",
        f"Title: {title}\n",
        "Previous steps:\n",
        history_block,
        "\n",
        f"Last action: {last_action}\n",
        f"Last action error: {error_note}\n",
        f"Focused BID: {focused_bid}\n\n",
        f"Clickable elements (BID: role | name | bbox | visibility):\n{clickables_block}\n\n",
        f"Ranked clickable candidates (best first):\n{ranked_block}\n",
        f"Recommended BID: {recommended or '(none)'}\n\n",
        "Instructions:\n",
        "- Choose the most relevant clickable BID to achieve the goal.\n",
        "- Prefer role=button or elements whose name matches the goal.\n",
        "- Reply with a single action, e.g., click('13') or noop().\n\n",
        f"Page excerpt:\n{excerpt}\n\n",
        "Reply with exactly one BrowserGym action string.",
    ]
    return "".join(segments).strip()
193+
194+
195+
def action_parser(response_text: str):
    """Parse a model reply into a BrowserGymAction.

    Scans the reply line-by-line for the first function-call-shaped action
    (via ACTION_PATTERN), falls back to searching the whole text, and finally
    returns a safe noop(). Internal whitespace in the matched action is
    collapsed to single spaces. Skips the test when OpenEnv is not installed.
    """
    try:
        from envs.browsergym_env import BrowserGymAction  # type: ignore
    except Exception:
        # pytest.skip raises Skipped, so control never continues past this point
        # (the original trailing `raise` here was unreachable and has been removed).
        pytest.skip("OpenEnv (envs.browsergym_env) is not installed; skipping BrowserGym test.")
    if not response_text:
        return BrowserGymAction(action_str="noop()")
    for raw in response_text.splitlines():
        line = raw.strip()
        if not line:
            continue
        m = ACTION_PATTERN.search(line)
        if m:
            parsed = re.sub(r"\s+", " ", m.group(0))
            return BrowserGymAction(action_str=parsed)
    # No per-line hit: try once more across the full (possibly multi-line) text.
    m = ACTION_PATTERN.search(response_text)
    if m:
        parsed = re.sub(r"\s+", " ", m.group(0))
        return BrowserGymAction(action_str=parsed)
    return BrowserGymAction(action_str="noop()")
216+
217+
218+
# Feature-detect the OpenEnv BrowserGym client at import time; the decorated
# test below degrades to a skip (and a None rollout_processor) when absent.
try:
    from envs.browsergym_env import BrowserGymEnv  # type: ignore

    _HAS_BG = True
except Exception:
    _HAS_BG = False
223+
224+
225+
@evaluation_test(  # type: ignore[misc]
    input_dataset=["tests/pytest/data/openenv_browsergym_dataset.jsonl"],
    dataset_adapter=openenv_dataset_to_rows,
    completion_params=[
        {
            "temperature": 0.0,
            "max_tokens": 32,
            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
        }
    ],
    # Keep concurrency and steps low for a quick health-check
    num_runs=1,
    max_concurrent_rollouts=1,
    mode="pointwise",
    rollout_processor=(
        OpenEnvRolloutProcessor(
            env_client_cls=BrowserGymEnv if _HAS_BG else None,
            prompt_builder=prompt_builder,
            action_parser=action_parser,
            tasks=[
                "click-test",
                "click-button",
                "click-button-sequence",
                "click-checkboxes",
                "click-checkboxes-soft",
                "click-checkboxes-large",
                "click-checkboxes-transfer",
            ],
            miniwob_url=os.getenv("MINIWOB_URL", "http://172.17.0.1:8888/miniwob/"),
            docker_image="browsergym-env:latest",
            benchmark="miniwob",
            timeout_ms=10000,
            num_generations=1,
        )
        if _HAS_BG
        else None
    ),
)
def test_openenv_browsergym_eval(row: EvaluationRow) -> EvaluationRow:
    """
    Smoke test to ensure OpenEnv + BrowserGym MiniWoB runs and returns a row.

    The evaluation harness will assert basic invariants (no exceptions, etc.).
    """
    import json as _json  # hoisted from inside the message loop below

    if not _HAS_BG:
        pytest.skip("OpenEnv (envs.browsergym_env) is not installed; skipping BrowserGym test.")
    # Extract step rewards from the sentinel system message injected by the rollout processor
    step_rewards: List[float] = []
    try:
        for msg in row.messages or []:
            if msg.role == "system" and isinstance(msg.content, str) and msg.content.startswith("__ep_step_rewards__:"):
                payload = msg.content.split(":", 1)[1]
                step_rewards = _json.loads(payload) or []
                break
    except Exception:
        # Best-effort parse: a malformed sentinel yields an empty reward list.
        step_rewards = []

    total = float(sum(step_rewards)) if step_rewards else 0.0
    # Map total reward to a score in [0,1]; MiniWoB rewards are typically 0/1 or -1/1
    score = max(0.0, min(1.0, total))
    reason = f"Total reward={total:.2f} across {len(step_rewards)} steps"
    row.evaluation_result = EvaluateResult(score=score, reason=reason)
    return row
288+

0 commit comments

Comments
 (0)