|
| 1 | +from typing import Any, Dict, List |
| 2 | +import os |
| 3 | +import re |
| 4 | + |
| 5 | +import pytest |
| 6 | +from eval_protocol.models import EvaluationRow, Message, EvaluateResult |
| 7 | +from eval_protocol.pytest import evaluation_test |
| 8 | +from eval_protocol.pytest.openenv_rollout_processor import OpenEnvRolloutProcessor |
| 9 | +import pytest |
| 10 | + |
# Module-wide marker: these tests are integration-heavy, so every test in this
# file is skipped whenever the CI environment variable is exactly "true".
pytestmark = pytest.mark.skipif(
    os.environ.get("CI") == "true",
    reason="Skip OpenEnv integration tests on CI",
)
| 13 | + |
| 14 | + |
def openenv_dataset_to_rows(data: List[Dict[str, Any]]) -> List[EvaluationRow]:
    """
    Adapter: convert simple {"id": "...", "prompt": "..."} rows into EvaluationRows.

    Only the "prompt" key is read; each input row becomes one EvaluationRow
    holding a single user message.

    Args:
        data: Dataset rows as dictionaries.

    Returns:
        One EvaluationRow per input row, in input order.
    """
    rows: List[EvaluationRow] = []
    for row in data:
        prompt = row.get("prompt")
        # A missing key AND an explicit null both fall back to "start";
        # str(None) would otherwise send the literal text "None" as the prompt.
        content = "start" if prompt is None else str(prompt)
        rows.append(EvaluationRow(messages=[Message(role="user", content=content)]))
    return rows
| 24 | + |
| 25 | + |
# ---- prompt_builder and action_parser modeled after browsergym_grpo_evalp.py ----

# Matches a call-like action string: an identifier made of letters/underscores,
# optional whitespace, then a parenthesised argument list. The match is greedy
# up to the last ")" and DOTALL lets the arguments span several lines.
ACTION_PATTERN = re.compile(r"[A-Za-z_]+\s*\(.*\)", flags=re.DOTALL)
| 29 | + |
| 30 | + |
def _as_scalar(x: Any) -> Any:
    """Return ``x.item()`` when that call succeeds, otherwise ``x`` unchanged.

    Any exception raised while looking up or calling ``item`` (for example the
    attribute not existing) results in the input being returned as-is.
    """
    try:
        scalar = x.item()
    except Exception:
        return x
    return scalar
| 36 | + |
| 37 | + |
def _extract_goal_url_title(observation: Any) -> tuple[str, str, str]:
    """Collect the goal, URL and active page title from an observation.

    ``goal`` and ``url`` are read from the observation's attributes first and
    fall back to the same keys of ``observation.metadata["browsergym_obs"]``.
    The title is the entry of ``open_pages_titles`` at ``active_page_index``;
    an index that cannot be converted to ``int`` is treated as 0, and an
    out-of-range index yields an empty title.

    Returns:
        A ``(goal, url, title)`` tuple; missing pieces are empty strings.
    """
    meta = getattr(observation, "metadata", {}) or {}
    bg_obs = meta.get("browsergym_obs", {}) or {}

    goal = getattr(observation, "goal", "") or bg_obs.get("goal") or ""
    url = getattr(observation, "url", "") or bg_obs.get("url") or ""

    # Unwrap array-like scalars via .item() when available; keep the raw
    # value when that is not possible.
    raw_index = bg_obs.get("active_page_index")
    try:
        raw_index = raw_index.item()
    except Exception:
        pass
    try:
        page_index = int(raw_index)
    except Exception:
        page_index = 0

    page_titles = bg_obs.get("open_pages_titles") or ()
    if not isinstance(page_titles, (list, tuple)):
        return goal, url, ""
    if page_index < 0 or page_index >= len(page_titles):
        return goal, url, ""
    return goal, url, page_titles[page_index] or ""
| 57 | + |
| 58 | + |
| 59 | +def _extract_clickable_elements_lines(observation: Any) -> List[str]: |
| 60 | + metadata = getattr(observation, "metadata", {}) or {} |
| 61 | + obs_dict = metadata.get("browsergym_obs", {}) or {} |
| 62 | + extra_props = obs_dict.get("extra_element_properties", {}) or {} |
| 63 | + axtree_object = obs_dict.get("axtree_object") or {} |
| 64 | + focused_bid = obs_dict.get("focused_element_bid") |
| 65 | + bid_to_desc: Dict[str, tuple[str, str]] = {} |
| 66 | + try: |
| 67 | + nodes = axtree_object.get("nodes") or [] |
| 68 | + for node in nodes: |
| 69 | + bid = node.get("browsergym_id") |
| 70 | + if bid is None: |
| 71 | + continue |
| 72 | + role = "" |
| 73 | + name = "" |
| 74 | + rf = node.get("role") or {} |
| 75 | + if isinstance(rf, dict): |
| 76 | + role = str(rf.get("value", "")).strip() |
| 77 | + nf = node.get("name") or {} |
| 78 | + if isinstance(nf, dict): |
| 79 | + name = str(nf.get("value", "")).strip() |
| 80 | + bid_to_desc[str(bid)] = (role, name) |
| 81 | + except Exception: |
| 82 | + pass |
| 83 | + lines: List[str] = [] |
| 84 | + for bid in sorted(extra_props.keys(), key=lambda x: str(x)): |
| 85 | + props = extra_props[bid] or {} |
| 86 | + if not props.get("clickable"): |
| 87 | + continue |
| 88 | + bbox = props.get("bbox") or [] |
| 89 | + bbox_str = ", ".join(str(v) for v in bbox) if bbox else "?" |
| 90 | + role, name = bid_to_desc.get(str(bid), ("", "")) |
| 91 | + focus_tag = " [FOCUSED]" if (str(bid) == str(focused_bid)) else "" |
| 92 | + rn = (role or "-") |
| 93 | + if name: |
| 94 | + rn = f"{rn} | {name}" |
| 95 | + vis = props.get("visibility") |
| 96 | + vis_str = f"{vis:.2f}" if isinstance(vis, (int, float)) else str(vis) if vis is not None else "?" |
| 97 | + lines.append(f"- BID {bid}{focus_tag}: {rn} | bbox({bbox_str}) | visibility={vis_str}") |
| 98 | + return lines |
| 99 | + |
| 100 | + |
| 101 | +def _rank_clickables_lines(observation: Any, goal: str, top_n: int = 8) -> tuple[List[str], str | None]: |
| 102 | + metadata = getattr(observation, "metadata", {}) or {} |
| 103 | + obs_dict = metadata.get("browsergym_obs", {}) or {} |
| 104 | + goal_lc = (goal or "").lower().strip() |
| 105 | + extra_props = obs_dict.get("extra_element_properties", {}) or {} |
| 106 | + axtree_object = obs_dict.get("axtree_object") or {} |
| 107 | + focused_bid = str(obs_dict.get("focused_element_bid") or "") |
| 108 | + bid_to_desc: Dict[str, tuple[str, str]] = {} |
| 109 | + try: |
| 110 | + nodes = axtree_object.get("nodes") or [] |
| 111 | + for node in nodes: |
| 112 | + bid = node.get("browsergym_id") |
| 113 | + if bid is None: |
| 114 | + continue |
| 115 | + role = "" |
| 116 | + name = "" |
| 117 | + rf = node.get("role") or {} |
| 118 | + if isinstance(rf, dict): |
| 119 | + role = str(rf.get("value", "")).strip() |
| 120 | + nf = node.get("name") or {} |
| 121 | + if isinstance(nf, dict): |
| 122 | + name = str(nf.get("value", "")).strip() |
| 123 | + bid_to_desc[str(bid)] = (role, name) |
| 124 | + except Exception: |
| 125 | + pass |
| 126 | + scored: List[tuple[float, str, str, str, str]] = [] |
| 127 | + for bid_key in sorted(extra_props.keys(), key=lambda x: str(x)): |
| 128 | + props = extra_props[bid_key] or {} |
| 129 | + if not props.get("clickable"): |
| 130 | + continue |
| 131 | + role, name = bid_to_desc.get(str(bid_key), ("", "")) |
| 132 | + name_lc = (name or "").lower() |
| 133 | + score = 0.0 |
| 134 | + if goal_lc and name_lc and (goal_lc in name_lc or name_lc in goal_lc): |
| 135 | + score += 2.0 |
| 136 | + if (role or "").lower() == "button": |
| 137 | + score += 1.0 |
| 138 | + if str(bid_key) == focused_bid: |
| 139 | + score += 0.5 |
| 140 | + vis = props.get("visibility") |
| 141 | + try: |
| 142 | + vis_f = float(vis) |
| 143 | + score += max(0.0, min(1.0, vis_f)) |
| 144 | + except Exception: |
| 145 | + pass |
| 146 | + bbox = props.get("bbox") or [] |
| 147 | + bbox_str = ", ".join(str(v) for v in bbox) if bbox else "?" |
| 148 | + rn = (role or "-") |
| 149 | + if name: |
| 150 | + rn = f"{rn} | {name}" |
| 151 | + vis_str = f"{vis:.2f}" if isinstance(vis, (int, float)) else str(vis) if vis is not None else "?" |
| 152 | + scored.append((score, str(bid_key), rn, bbox_str, vis_str)) |
| 153 | + scored.sort(key=lambda t: t[0], reverse=True) |
| 154 | + lines: List[str] = [] |
| 155 | + recommended = scored[0][1] if scored else None |
| 156 | + for idx, (score, bid, rn, bbox_str, vis_str) in enumerate(scored[:top_n], start=1): |
| 157 | + lines.append(f"{idx}. BID {bid}: score={score:.2f} | {rn} | bbox({bbox_str}) | visibility={vis_str}") |
| 158 | + return lines, recommended |
| 159 | + |
| 160 | + |
def prompt_builder(observation: Any, step: int, history: List[str]) -> str:
    """Assemble the user prompt shown to the model for one environment step.

    The prompt contains the step number, goal, URL and title, the last four
    history entries, the last action and whether it errored, the focused BID,
    the clickable elements (plain and ranked, with a recommended BID), fixed
    instructions, and the first 2048 characters of the page text.

    Args:
        observation: Environment observation; read via attributes and
            ``metadata["browsergym_obs"]``.
        step: Step number printed on the first line.
        history: Descriptions of earlier steps; only the last four are shown.

    Returns:
        The prompt text with surrounding whitespace stripped.
    """
    goal, page_url, title = _extract_goal_url_title(observation)
    if not page_url:
        page_url = "(unknown)"

    had_error = getattr(observation, "last_action_error", False)
    error_note = "Yes" if had_error else "No"

    clickable_lines = _extract_clickable_elements_lines(observation)
    clickables_block = "\n".join(clickable_lines) or "(none detected)"
    ranked_lines, rec = _rank_clickables_lines(observation, goal, top_n=10)
    ranked_block = "\n".join(ranked_lines) or "(none)"

    # Cap the page excerpt so the prompt stays bounded.
    text = (getattr(observation, "text", "") or "")[:2048]

    meta = getattr(observation, "metadata", {}) or {}
    bg_obs = meta.get("browsergym_obs", {}) or {}
    focused_bid = bg_obs.get("focused_element_bid") or ""
    last_action = bg_obs.get("last_action") or ""

    if history:
        recent_steps = "\n".join(history[-4:])
    else:
        recent_steps = "None"

    pieces = [
        f"Step: {step}\n",
        f"Goal: {goal}\n",
        f"Current URL: {page_url}\n",
        f"Title: {title}\n",
        "Previous steps:\n",
        recent_steps,
        "\n",
        f"Last action: {last_action}\n",
        f"Last action error: {error_note}\n",
        f"Focused BID: {focused_bid}\n\n",
        f"Clickable elements (BID: role | name | bbox | visibility):\n{clickables_block}\n\n",
        f"Ranked clickable candidates (best first):\n{ranked_block}\n",
        f"Recommended BID: {rec or '(none)'}\n\n",
        "Instructions:\n",
        "- Choose the most relevant clickable BID to achieve the goal.\n",
        "- Prefer role=button or elements whose name matches the goal.\n",
        "- Reply with a single action, e.g., click('13') or noop().\n\n",
        f"Page excerpt:\n{text}\n\n",
        "Reply with exactly one BrowserGym action string.",
    ]
    return "".join(pieces).strip()
| 193 | + |
| 194 | + |
def action_parser(response_text: str):
    """Turn a model reply into a ``BrowserGymAction``.

    The first non-empty line containing something shaped like ``name(args)``
    wins. If no single line matches, the whole reply is searched so that an
    action spread over several lines is still found. Whitespace runs inside
    the matched action are collapsed to one space.

    Args:
        response_text: Raw model reply; may be empty or ``None``.

    Returns:
        A ``BrowserGymAction`` carrying the parsed action string, or
        ``noop()`` when the reply is empty or contains no action.

    Note:
        When ``envs.browsergym_env`` cannot be imported, the current test is
        skipped through ``pytest.skip``.
    """
    try:
        from envs.browsergym_env import BrowserGymAction  # type: ignore
    except Exception:
        # pytest.skip() raises, so control never continues past this call.
        pytest.skip("OpenEnv (envs.browsergym_env) is not installed; skipping BrowserGym test.")

    if response_text:
        # Per-line candidates first, then the full text as a multi-line fallback.
        candidates = [raw.strip() for raw in response_text.splitlines()]
        candidates.append(response_text)
        for candidate in candidates:
            match = ACTION_PATTERN.search(candidate)
            if match:
                return BrowserGymAction(action_str=re.sub(r"\s+", " ", match.group(0)))
    return BrowserGymAction(action_str="noop()")
| 216 | + |
| 217 | + |
# Optional dependency probe: _HAS_BG records whether the BrowserGym environment
# client could be imported, so the test below can be skipped when it cannot.
try:
    from envs.browsergym_env import BrowserGymEnv  # type: ignore
except Exception:
    _HAS_BG = False
else:
    _HAS_BG = True
| 223 | + |
| 224 | + |
def _parse_step_rewards(row: Any) -> List[float]:
    """Read the per-step rewards from the first ``__ep_step_rewards__:`` system message.

    The text after the prefix is parsed as JSON. Anything other than a list of
    finite numbers (invalid JSON, a bare number, a dict, ``NaN``, strings, ...)
    is treated as "no rewards" instead of failing later in ``sum()``.

    Returns:
        The rewards as floats, or an empty list when no usable payload exists.
    """
    import json
    import math

    prefix = "__ep_step_rewards__:"
    for msg in getattr(row, "messages", None) or []:
        content = getattr(msg, "content", None)
        if getattr(msg, "role", None) != "system" or not isinstance(content, str):
            continue
        if not content.startswith(prefix):
            continue
        # Only the first sentinel message is considered.
        try:
            payload = json.loads(content[len(prefix):])
        except ValueError:
            return []
        if not isinstance(payload, list):
            return []
        rewards: List[float] = []
        for value in payload:
            # bool is a subclass of int but is not a reward value.
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                return []
            try:
                reward = float(value)
            except OverflowError:
                return []
            if not math.isfinite(reward):
                return []
            rewards.append(reward)
        return rewards
    return []


@evaluation_test(  # type: ignore[misc]
    input_dataset=["tests/pytest/data/openenv_browsergym_dataset.jsonl"],
    dataset_adapter=openenv_dataset_to_rows,
    completion_params=[
        {
            "temperature": 0.0,
            "max_tokens": 32,
            "model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct",
        }
    ],
    # Keep concurrency and steps low for a quick health-check
    num_runs=1,
    max_concurrent_rollouts=1,
    mode="pointwise",
    rollout_processor=(
        OpenEnvRolloutProcessor(
            # Only evaluated when _HAS_BG is true, so BrowserGymEnv is bound here.
            env_client_cls=BrowserGymEnv,
            prompt_builder=prompt_builder,
            action_parser=action_parser,
            tasks=[
                "click-test",
                "click-button",
                "click-button-sequence",
                "click-checkboxes",
                "click-checkboxes-soft",
                "click-checkboxes-large",
                "click-checkboxes-transfer",
            ],
            miniwob_url=os.getenv("MINIWOB_URL", "http://172.17.0.1:8888/miniwob/"),
            docker_image="browsergym-env:latest",
            benchmark="miniwob",
            timeout_ms=10000,
            num_generations=1,
        )
        if _HAS_BG
        else None
    ),
)
def test_openenv_browsergym_eval(row: EvaluationRow) -> EvaluationRow:
    """
    Smoke test to ensure OpenEnv + BrowserGym MiniWoB runs and returns a row.
    The evaluation harness will assert basic invariants (no exceptions, etc.).

    The score is the sum of the step rewards found in the row's sentinel
    system message, clamped to [0, 1]; a missing or malformed payload scores 0.
    """
    if not _HAS_BG:
        pytest.skip("OpenEnv (envs.browsergym_env) is not installed; skipping BrowserGym test.")
    # Extract step rewards from the sentinel system message injected by the rollout processor
    step_rewards = _parse_step_rewards(row)

    total = float(sum(step_rewards)) if step_rewards else 0.0
    # Map total reward to a score in [0,1]; MiniWoB rewards are typically 0/1 or -1/1
    score = max(0.0, min(1.0, total))
    reason = f"Total reward={total:.2f} across {len(step_rewards)} steps"
    row.evaluation_result = EvaluateResult(score=score, reason=reason)
    return row
| 288 | + |
0 commit comments