From 5ceeb60369954f4986a078b79b3a853846140a01 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 31 Oct 2025 17:18:49 +0000 Subject: [PATCH 01/61] miniwob with mcp browser backend, first draft --- experiments/run_miniwob.py | 25 ++++ experiments/test_mcp.py | 42 ++++++ src/agentlab/backends/browser/base.py | 40 ++++++ src/agentlab/backends/browser/env.py | 82 +++++++++++ .../backends/browser/mcp_playwright.json | 17 +++ .../backends/browser/mcp_playwright.py | 14 ++ src/agentlab/benchmarks/miniwob/__init__.py | 4 + src/agentlab/benchmarks/miniwob/benchmark.py | 50 +++++++ src/agentlab/benchmarks/miniwob/task.py | 131 ++++++++++++++++++ src/agentlab/benchmarks/web_task.py | 22 +++ 10 files changed, 427 insertions(+) create mode 100644 experiments/run_miniwob.py create mode 100644 experiments/test_mcp.py create mode 100644 src/agentlab/backends/browser/base.py create mode 100644 src/agentlab/backends/browser/env.py create mode 100644 src/agentlab/backends/browser/mcp_playwright.json create mode 100644 src/agentlab/backends/browser/mcp_playwright.py create mode 100644 src/agentlab/benchmarks/miniwob/__init__.py create mode 100644 src/agentlab/benchmarks/miniwob/benchmark.py create mode 100644 src/agentlab/benchmarks/miniwob/task.py create mode 100644 src/agentlab/benchmarks/web_task.py diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py new file mode 100644 index 00000000..3d33a0b9 --- /dev/null +++ b/experiments/run_miniwob.py @@ -0,0 +1,25 @@ +import logging +import os + +from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config +from agentlab.backends.browser.mcp_playwright import MCPPlaywright +from agentlab.benchmarks.miniwob import MiniWobBenchmark +from agentlab.experiments.study import make_study + +fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s" +logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()]) + +if __name__ == "__main__": + config = load_config("gaia_l1") + study = make_study( + benchmark=MiniWobBenchmark(backend=MCPPlaywright()), + agent_args=TapeAgentArgs(agent_name=config.name, config=config), + comment=config.comment, + logging_level=logging.INFO, + logging_level_stdout=logging.INFO, + ) + if os.environ.get("AGENTLAB_DEBUG"): + study.exp_args_list = study.exp_args_list[:3] + study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential") + else: + study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend) diff --git a/experiments/test_mcp.py b/experiments/test_mcp.py new file mode 100644 index 00000000..34b9734a --- /dev/null +++ b/experiments/test_mcp.py @@ -0,0 +1,42 @@ +from tapeagents.environment import FunctionCall +from tapeagents.mcp import ToolCallAction + +from agentlab.backends.browser.mcp_playwright import MCPPlaywright +from agentlab.benchmarks.miniwob.task import get_miniwob_tasks + + +def main(): + tasks = get_miniwob_tasks() + task = tasks[0] + setup_js = task.get_setup_js() + + backend = MCPPlaywright() + print("="*100) + # 1. goto task url + print("URL: ", task.url) + obs = backend.call_tool("browser_navigate", {"url": task.url}) + print("------") + print(obs) + print("-"*100) + + # 2. eval js + obs = backend.run_js(setup_js) + print("------") + print(obs) + print("-"*100) + + # 3. validate + print("\n\nVALIDATE") + js = task.get_task_validate_js() + print(js) + obs = backend.run_js(js) + print("------") + print(obs) + print("-"*100) + +if __name__ == "__main__": + main() + + + + \ No newline at end of file diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py new file mode 100644 index 00000000..0bfbcbe9 --- /dev/null +++ b/src/agentlab/backends/browser/base.py @@ -0,0 +1,40 @@ +from tapeagents.environment import FunctionCall +from tapeagents.mcp import MCPEnvironment, ToolCallAction +from tapeagents.tool_calling import as_openai_tool + + +class BrowserBackend(): + def run_js(self, js: str): + raise NotImplementedError + + def call_tool(self, tool_name: str, arguments: dict) -> str: + raise NotImplementedError + + def tools_description(self) -> str: + raise NotImplementedError + + def tools(self) -> list[dict]: + raise NotImplementedError + + +class MCPBrowserBackend(BrowserBackend): + def __init__(self, config_path: str): + self.config_path = config_path + self.mcp = MCPEnvironment(config_path=self.config_path) + self.mcp.initialize() + + def call_tool(self, tool_name: str, arguments: dict) -> str: + action = ToolCallAction( + function=FunctionCall(name=tool_name, arguments=arguments) + ) + tool_result = self.mcp.step(action) + return tool_result.content.content[0].text + + + def tools_description(self) -> str: + return self.mcp.tools_description() + + def tools(self) -> list[dict]: + actions = self.mcp.actions() + tools = [as_openai_tool(a).model_dump() for a in actions] + return tools diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py new file mode 100644 index 00000000..48375255 --- /dev/null +++ b/src/agentlab/backends/browser/env.py @@ -0,0 +1,82 @@ +import logging +import time +from typing import Any, Literal + +from tapeagents.core import Action, Observation, StopStep + +from agentlab.backends.browser.base import BrowserBackend +from agentlab.benchmarks.abstract_env import AbstractEnv +from agentlab.benchmarks.miniwob.task import AbstractWebTask + +logger = logging.getLogger(__name__) + + +class PageObservation(Observation): + kind: Literal["page_observation"] = "page_observation" + content: str + +class BrowserAction(Action): + kind: Literal["browser_action"] = "browser_action" + name: str + arguments: dict[str, Any] + + +class BrowserEnv(AbstractEnv): + def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0): + self.task_name = task_name + self.task = task + self.seed = seed + self.backend = backend + self._turns = 0 + + def reset(self, seed: int): + self.seed = seed + setup_js = self.task.get_setup_js() + if setup_js: + js_result_str = self.backend.run_js(setup_js) + logger.info(f"Task reset result: {js_result_str}") + + def step(self, action: BrowserAction) -> tuple[Observation, float, bool, bool, dict]: + logger.info(f"BrowserEnv.step() called with action {type(action)}") + + action_exec_start = time.time() + finished = isinstance(action, StopStep) + if finished: + observation = Observation() # empty observation + else: + observation = self._step(action) + action_exec_stop = time.time() + self._turns += 1 + + truncated = self._turns >= self.max_turns + + if self.task.validate_per_step or finished or truncated: + reward = self.calculate_reward(action, observation) + else: + reward = None + + env_info = { + "step_metadata": observation.metadata, + "action_exec_start": action_exec_start, + "action_exec_stop": action_exec_stop, + "action_exec_timeout": 0.0, + } + obs_view = observation.short_view() if isinstance(observation, Observation) else observation + logger.info(f"Action result in observation: {obs_view}") + return observation, reward, finished, truncated, env_info + + def _step(self, action: Action) -> PageObservation: + tool_result = self.backend.call_tool(action.name, action.arguments) + return PageObservation(content=tool_result) + + def calculate_reward(self, action: Action, observation: PageObservation) -> float: + validate_js = self.task.get_step_validate_js() + validate_result = self.backend.run_js(validate_js) + reward, other = self.task.parse_validation_result(validate_result) + return reward + + def close(self): + teardown_js = self.task.get_teardown_js() + if teardown_js: + js_result_str = self.backend.run_js(teardown_js) + logger.info(f"Task teardown result: {js_result_str}") diff --git a/src/agentlab/backends/browser/mcp_playwright.json b/src/agentlab/backends/browser/mcp_playwright.json new file mode 100644 index 00000000..f184712b --- /dev/null +++ b/src/agentlab/backends/browser/mcp_playwright.json @@ -0,0 +1,17 @@ +{ + "mcpServers": { + "playwright": { + "command": "npx", + "args": [ + "@playwright/mcp@latest", + "--browser", + "chromium", + "--headless", + "--isolated" + ], + "env": { + "PLAYWRIGHT_BROWSERS_PATH": "" + } + } + } +} diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py new file mode 100644 index 00000000..cced4b95 --- /dev/null +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -0,0 +1,14 @@ +from agentlab.backends.browser.base import MCPBrowserBackend + +DEFAULT_CONFIG_PATH = "src/agentlab/backends/browser/mcp_playwright.json" + +class MCPPlaywright(MCPBrowserBackend): + def __init__(self, config_path: str | None = None): + super().__init__(config_path or DEFAULT_CONFIG_PATH) + + def run_js(self, js: str): + raw_response = self.call_tool("browser_evaluate", {"function": js}) + _, half_response = raw_response.split("### Result", maxsplit=1) + result_str, _ = half_response.split("\n### Ran", maxsplit=1) + result_str = result_str.strip() + return result_str diff --git a/src/agentlab/benchmarks/miniwob/__init__.py b/src/agentlab/benchmarks/miniwob/__init__.py new file mode 100644 index 00000000..558ed21e --- /dev/null +++ b/src/agentlab/benchmarks/miniwob/__init__.py @@ -0,0 +1,4 @@ +from .benchmark import MiniWobBenchmark +from .task import MiniWobTask + +__all__ = ["MiniWobBenchmark", "MiniWobTask"] \ No newline at end of file diff --git a/src/agentlab/benchmarks/miniwob/benchmark.py b/src/agentlab/benchmarks/miniwob/benchmark.py new file mode 100644 index 00000000..3808a697 --- /dev/null +++ b/src/agentlab/benchmarks/miniwob/benchmark.py @@ -0,0 +1,50 @@ +import logging +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from pydantic import ConfigDict + +from agentlab.backends.browser.base import BrowserBackend +from agentlab.backends.browser.env import BrowserEnv +from agentlab.benchmarks.abstract_env import AbstractBenchmark, AbstractEnvArgs +from agentlab.benchmarks.miniwob.task import MiniWobTask, get_miniwob_tasks + +logger = logging.getLogger(__name__) + + +@dataclass +class MiniwobArgs(AbstractEnvArgs): + task: MiniWobTask + task_seed: int + task_name: str + backend: BrowserBackend + + def __init__(self, task_name: str, task: MiniWobTask, backend: BrowserBackend, task_seed: int = 0): + self.task_name = task_name + self.task = task + self.task_seed = task_seed + self.backend = backend + + def make_env(self, exp_dir: Path, action_mapping=None) -> BrowserEnv: + env = BrowserEnv(task_name=self.task_name, task=self.task, backend=self.backend, seed=self.task_seed) + return env + + +class MiniWobBenchmark(AbstractBenchmark): + model_config = ConfigDict(arbitrary_types_allowed=True) + + backend: BrowserBackend + name: str = "miniwob" + env_args_list: list[MiniwobArgs] = None # type: ignore + dataset: list[MiniWobTask] = None # type: ignore + + def model_post_init(self, __context: Any) -> None: + self.env_args_list = [] + if self.dataset is None: + self.dataset = get_miniwob_tasks() + for task in self.dataset: + name = f"miniwob.{task.task_id}" + env_args = MiniwobArgs(task_name=name, task=task, backend=self.backend) + self.env_args_list.append(env_args) + logger.info(f"Loaded {len(self.env_args_list)} miniwob tasks") diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py new file mode 100644 index 00000000..6d17cffe --- /dev/null +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -0,0 +1,131 @@ + +import os +from typing import Any + +from browsergym.miniwob import ALL_MINIWOB_TASKS + +from agentlab.benchmarks.web_task import AbstractWebTask + + +class MiniWobTask(AbstractWebTask): + dataset: str = "miniwob" + task_id: str + desc: str + subdomain: str + base_url: str = None + url: str = None + remove_human_display: bool = True + episode_max_time: int = 1000000 + + def model_post_init(self, __context: Any): + self.url = f"{self.base_url}/{self.subdomain}.html" + + def get_setup_js(self) -> str: + if self.remove_human_display: + js = r""" +let __display_ids = ['reward-display', 'click-canvas', 'sync-task-cover']; +let __display_divs = {}; +let __query_div_hidden_copy = null; + +removeDisplay = function() { + core.clearTimer(); + document.body.removeEventListener('click', core.canvasDrawClick); + + __query_div_hidden_copy = document.getElementById('query').cloneNode(true); + document.getElementById('query').innerHTML = ''; + + for (i in __display_ids) { + elem_id = __display_ids[i]; + elem = document.getElementById(elem_id); + // remove elem from the document + elem.remove(); + // but keep it stored somewhere to bring back later + __display_divs[elem_id] = elem; + } +}; + +bringBackDisplay = function() { + document.getElementById('query').innerHTML = __query_div_hidden_copy.innerHTML; + for (var elem_id in __display_divs){ + document.body.appendChild(__display_divs[elem_id]); + } + core.createDisplay(); +}; + +core.endEpisode_legacy = core.endEpisode; +core.startEpisodeReal_legacy = core.startEpisodeReal; +core.getUtterance_legacy = core.getUtterance; + +core.getUtterance = function () { + bringBackDisplay(); + utterance = core.getUtterance_legacy(); + removeDisplay(); + return utterance; +}; + +core.endEpisode = function(reward, time_proportional, reason){ + bringBackDisplay(); + core.endEpisode_legacy(reward, time_proportional, reason); + removeDisplay(); +}; + +core.startEpisodeReal = function() { + bringBackDisplay(); + core.startEpisodeReal_legacy(); + removeDisplay(); +}; + +removeDisplay(); +""" + else: + js = "" + js += f""" +Math.seedrandom(42); +core.EPISODE_MAX_TIME = {self.episode_max_time}; +core.startEpisodeReal(); +while (!WOB_TASK_READY) {{ + await new Promise(resolve => setTimeout(resolve, 100)); +}} +return core.getUtterance(); + """ + return f"async () => {{{js}}}" + + def get_teardown_js(self) -> str: + return "" + + def get_step_validate_js(self) -> str: + return """() => { +return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY]; +}""" + + def get_task_validate_js(self) -> str: + return """() => { +return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY]; +}""" + + + def parse_validation_result(self, validation_result: str) -> tuple[float, dict]: + chunks = [c.strip() for c in validation_result.split(",")] + raw_reward = float(chunks[1]) + done = bool(chunks[3]) + reward = float(raw_reward > 0) + return reward, { + "raw_reward": raw_reward, + "reward_reason": chunks[2], + "done": done, + } + +def get_miniwob_tasks(base_url: str | None = None, remove_human_display: bool = True, episode_max_time: int = 1000000) -> list[MiniWobTask]: + if base_url is None: + base_url = os.environ.get("MINIWOB_URL") + if base_url is None: + raise ValueError("MINIWOB_URL environment variable is not set") + return [ + MiniWobTask( + task_id=task.subdomain, + desc=task.desc, + subdomain=task.subdomain, + base_url=base_url, + remove_human_display=remove_human_display, + episode_max_time=episode_max_time) for task in ALL_MINIWOB_TASKS + ] \ No newline at end of file diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py new file mode 100644 index 00000000..220c99e9 --- /dev/null +++ b/src/agentlab/benchmarks/web_task.py @@ -0,0 +1,22 @@ + +from pydantic import BaseModel + + +class AbstractWebTask(BaseModel): + name: str + validate_per_step: bool = False + + def get_setup_js(self) -> str: + raise NotImplementedError + + def get_teardown_js(self) -> str: + raise NotImplementedError + + def get_task_validate_js(self) -> str: + raise NotImplementedError + + def get_step_validate_js(self) -> str: + raise NotImplementedError + + def parse_validation_result(self, validate_result: str) -> tuple[float, dict]: + raise NotImplementedError From 450dacf8486324d09b44e543f92029bc1c5fd999 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 31 Oct 2025 19:18:08 +0000 Subject: [PATCH 02/61] actions whitelist, fixes, support new order of the agent env creation in the loop --- experiments/run_miniwob.py | 1 + src/agentlab/agents/tapeagent/agent.py | 9 +++- src/agentlab/backends/browser/base.py | 54 ++++++++++--------- src/agentlab/backends/browser/env.py | 54 ++++++++++++++----- .../backends/browser/mcp_playwright.py | 20 ++++++- src/agentlab/benchmarks/miniwob/benchmark.py | 32 ++--------- src/agentlab/benchmarks/miniwob/task.py | 11 +++- src/agentlab/benchmarks/web_task.py | 12 ++++- src/agentlab/experiments/loop.py | 30 ++++++----- 9 files changed, 139 insertions(+), 84 deletions(-) diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index 3d33a0b9..0f5bd0cf 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -8,6 +8,7 @@ fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s" logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()]) +logger = logging.getLogger(__name__) if __name__ == "__main__": config = load_config("gaia_l1") diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py index eefda1d1..4844d46b 100644 --- a/src/agentlab/agents/tapeagent/agent.py +++ b/src/agentlab/agents/tapeagent/agent.py @@ -9,6 +9,7 @@ from tapeagents.agent import Agent from tapeagents.core import Action, Observation, StopStep, TapeMetadata, Thought from tapeagents.core import Tape as BaseTape +from tapeagents.tool_calling import ToolSpec from agentlab.agents.agent_args import AgentArgs @@ -40,8 +41,12 @@ def load_config(config_name: str) -> DictConfig: class TapeAgentArgs(AgentArgs): config: DictConfig = None # type: ignore - def make_agent(self) -> bgym.Agent: - agent: Agent = hydra.utils.instantiate(self.config.agent) + def make_agent(self, known_actions: tuple[ToolSpec, ...] | None) -> bgym.Agent: + if known_actions is None: + agent = hydra.utils.instantiate(self.config.agent) + else: + tools_description = "\n".join([action.description() for action in known_actions]) + agent = hydra.utils.instantiate(self.config.agent, known_actions=known_actions, tools_description=tools_description) return TapeAgent(agent=agent) diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index 0bfbcbe9..636eb3fe 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -1,40 +1,44 @@ -from tapeagents.environment import FunctionCall -from tapeagents.mcp import MCPEnvironment, ToolCallAction -from tapeagents.tool_calling import as_openai_tool +from pydantic import BaseModel +from tapeagents.mcp import MCPEnvironment +from tapeagents.tool_calling import FunctionCall, ToolCallAction, ToolSpec -class BrowserBackend(): - def run_js(self, js: str): +class BrowserBackend(BaseModel): + def initialize(self) -> None: raise NotImplementedError - def call_tool(self, tool_name: str, arguments: dict) -> str: + def run_js(self, js: str): raise NotImplementedError - def tools_description(self) -> str: + def goto(self, url: str) -> str: raise NotImplementedError - def tools(self) -> list[dict]: + def step(self, action: ToolCallAction) -> str: raise NotImplementedError + def actions(self) -> tuple[ToolSpec]: + raise NotImplementedError -class MCPBrowserBackend(BrowserBackend): - def __init__(self, config_path: str): - self.config_path = config_path - self.mcp = MCPEnvironment(config_path=self.config_path) - self.mcp.initialize() - def call_tool(self, tool_name: str, arguments: dict) -> str: - action = ToolCallAction( - function=FunctionCall(name=tool_name, arguments=arguments) - ) - tool_result = self.mcp.step(action) - return tool_result.content.content[0].text +class MCPBrowserBackend(BrowserBackend): + config_path: str + _mcp = None + + def initialize(self) -> None: + self._mcp = MCPEnvironment(config_path=self.config_path) + self._mcp.initialize() - def tools_description(self) -> str: - return self.mcp.tools_description() + def step(self, action: ToolCallAction) -> str: + return self._call_mcp(action) - def tools(self) -> list[dict]: - actions = self.mcp.actions() - tools = [as_openai_tool(a).model_dump() for a in actions] - return tools + def call_tool(self, tool_name: str, arguments: dict) -> str: + return self._call_mcp(ToolCallAction(function=FunctionCall(name=tool_name, arguments=arguments))) + + def _call_mcp(self, action: ToolCallAction) -> str: + tool_result = self._mcp.step(action) + texts = [c.text for c in tool_result.content.content] + return "\n\n".join(texts) + + def actions(self) -> tuple[ToolSpec]: + return self._mcp.actions() \ No newline at end of file diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 48375255..9661fde5 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -1,43 +1,48 @@ import logging import time -from typing import Any, Literal +from dataclasses import dataclass +from pathlib import Path +from typing import Literal from tapeagents.core import Action, Observation, StopStep +from tapeagents.tool_calling import ToolCallAction, ToolSpec from agentlab.backends.browser.base import BrowserBackend -from agentlab.benchmarks.abstract_env import AbstractEnv +from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs from agentlab.benchmarks.miniwob.task import AbstractWebTask logger = logging.getLogger(__name__) +class GoalObservation(Observation): + kind: Literal["goal_observation"] = "goal_observation" + goal: str class PageObservation(Observation): kind: Literal["page_observation"] = "page_observation" content: str -class BrowserAction(Action): - kind: Literal["browser_action"] = "browser_action" - name: str - arguments: dict[str, Any] - class BrowserEnv(AbstractEnv): def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0): self.task_name = task_name self.task = task self.seed = seed - self.backend = backend self._turns = 0 + self.backend = backend + self.backend.initialize() def reset(self, seed: int): self.seed = seed + logger.info(f"Open task URL: {self.task.url}") + page_content = self.backend.goto(self.task.url) setup_js = self.task.get_setup_js() if setup_js: js_result_str = self.backend.run_js(setup_js) logger.info(f"Task reset result: {js_result_str}") + return [GoalObservation(goal=js_result_str), PageObservation(content=page_content)], {} - def step(self, action: BrowserAction) -> tuple[Observation, float, bool, bool, dict]: - logger.info(f"BrowserEnv.step() called with action {type(action)}") + def step(self, action: ToolCallAction) -> tuple[Observation, float, bool, bool, dict]: + logger.info(f"BrowserEnv.step() called with action {action.function.name}") action_exec_start = time.time() finished = isinstance(action, StopStep) @@ -65,8 +70,8 @@ def step(self, action: BrowserAction) -> tuple[Observation, float, bool, bool, d logger.info(f"Action result in observation: {obs_view}") return observation, reward, finished, truncated, env_info - def _step(self, action: Action) -> PageObservation: - tool_result = self.backend.call_tool(action.name, action.arguments) + def _step(self, action: ToolCallAction) -> PageObservation: + tool_result = self.backend.step(action) return PageObservation(content=tool_result) def calculate_reward(self, action: Action, observation: PageObservation) -> float: @@ -80,3 +85,28 @@ def close(self): if teardown_js: js_result_str = self.backend.run_js(teardown_js) logger.info(f"Task teardown result: {js_result_str}") + + def actions(self) -> list[ToolSpec]: + all_actions = self.backend.actions() + filtered_actions = self.task.filter_actions(all_actions) + logger.info(f"Filtered {len(filtered_actions)} actions out of {len(all_actions)} for task {self.task.dataset}") + return filtered_actions + + +@dataclass +class BrowserEnvArgs(AbstractEnvArgs): + task: AbstractWebTask + task_seed: int + task_name: str + backend: BrowserBackend + + def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, task_seed: int = 0): + self.task_name = task_name + self.task = task + self.task_seed = task_seed + self.backend = backend + + def make_env(self, exp_dir: Path) -> BrowserEnv: + env = BrowserEnv(task_name=self.task_name, task=self.task, backend=self.backend, seed=self.task_seed) + return env + diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index cced4b95..bb4371b4 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -1,10 +1,16 @@ +import logging + +from tapeagents.tool_calling import ToolCallAction + from agentlab.backends.browser.base import MCPBrowserBackend +logger = logging.getLogger(__name__) + DEFAULT_CONFIG_PATH = "src/agentlab/backends/browser/mcp_playwright.json" + class MCPPlaywright(MCPBrowserBackend): - def __init__(self, config_path: str | None = None): - super().__init__(config_path or DEFAULT_CONFIG_PATH) + config_path: str = DEFAULT_CONFIG_PATH def run_js(self, js: str): raw_response = self.call_tool("browser_evaluate", {"function": js}) @@ -12,3 +18,13 @@ def run_js(self, js: str): result_str, _ = half_response.split("\n### Ran", maxsplit=1) result_str = result_str.strip() return result_str + + def step(self, action: ToolCallAction) -> str: + tool_result = self._call_mcp(action) + logger.info(f"Tool result: {tool_result}") + snapshot = self.call_tool("browser_snapshot", {}) + return snapshot + + def goto(self, url: str) -> str: + tool_result = self.call_tool("browser_navigate", {"url": url}) + return tool_result diff --git a/src/agentlab/benchmarks/miniwob/benchmark.py b/src/agentlab/benchmarks/miniwob/benchmark.py index 3808a697..2851ef29 100644 --- a/src/agentlab/benchmarks/miniwob/benchmark.py +++ b/src/agentlab/benchmarks/miniwob/benchmark.py @@ -1,42 +1,18 @@ import logging -from dataclasses import dataclass -from pathlib import Path from typing import Any -from pydantic import ConfigDict - from agentlab.backends.browser.base import BrowserBackend -from agentlab.backends.browser.env import BrowserEnv -from agentlab.benchmarks.abstract_env import AbstractBenchmark, AbstractEnvArgs +from agentlab.backends.browser.env import BrowserEnvArgs +from agentlab.benchmarks.abstract_env import AbstractBenchmark from agentlab.benchmarks.miniwob.task import MiniWobTask, get_miniwob_tasks logger = logging.getLogger(__name__) -@dataclass -class MiniwobArgs(AbstractEnvArgs): - task: MiniWobTask - task_seed: int - task_name: str - backend: BrowserBackend - - def __init__(self, task_name: str, task: MiniWobTask, backend: BrowserBackend, task_seed: int = 0): - self.task_name = task_name - self.task = task - self.task_seed = task_seed - self.backend = backend - - def make_env(self, exp_dir: Path, action_mapping=None) -> BrowserEnv: - env = BrowserEnv(task_name=self.task_name, task=self.task, backend=self.backend, seed=self.task_seed) - return env - - class MiniWobBenchmark(AbstractBenchmark): - model_config = ConfigDict(arbitrary_types_allowed=True) - backend: BrowserBackend name: str = "miniwob" - env_args_list: list[MiniwobArgs] = None # type: ignore + env_args_list: list[BrowserEnvArgs] = None # type: ignore dataset: list[MiniWobTask] = None # type: ignore def model_post_init(self, __context: Any) -> None: @@ -45,6 +21,6 @@ def model_post_init(self, __context: Any) -> None: self.dataset = get_miniwob_tasks() for task in self.dataset: name = f"miniwob.{task.task_id}" - env_args = MiniwobArgs(task_name=name, task=task, backend=self.backend) + env_args = BrowserEnvArgs(task_name=name, task=task, backend=self.backend) self.env_args_list.append(env_args) logger.info(f"Loaded {len(self.env_args_list)} miniwob tasks") diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index 6d17cffe..e0d0b3e2 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -1,6 +1,6 @@ import os -from typing import Any +from typing import Any, ClassVar from browsergym.miniwob import ALL_MINIWOB_TASKS @@ -16,6 +16,15 @@ class MiniWobTask(AbstractWebTask): url: str = None remove_human_display: bool = True episode_max_time: int = 1000000 + actions_whitelist: ClassVar[list[str]] = [ + "browser_press_key", + "browser_type", + "browser_navigate", + "browser_click", + "browser_drag", + "browser_hover", + "browser_select_option", + ] def model_post_init(self, __context: Any): self.url = f"{self.base_url}/{self.subdomain}.html" diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py index 220c99e9..e8588d54 100644 --- a/src/agentlab/benchmarks/web_task.py +++ b/src/agentlab/benchmarks/web_task.py @@ -1,11 +1,19 @@ +from typing import ClassVar from pydantic import BaseModel +from tapeagents.tool_calling import ToolSpec class AbstractWebTask(BaseModel): - name: str + dataset: str + url: str validate_per_step: bool = False - + actions_whitelist: ClassVar[list[str]] = [] + + @classmethod + def filter_actions(cls, actions: list[ToolSpec]) -> list[str]: + return [action for action in actions if action.function.name in cls.actions_whitelist] + def get_setup_js(self) -> str: raise NotImplementedError diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index de4b976a..82bbc8ab 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -25,6 +25,8 @@ from PIL import Image from tqdm import tqdm +from agentlab.backends.browser.env import BrowserEnvArgs + try: from agentlab.agents.tapeagent import TapeAgent, save_tape except ImportError: @@ -414,19 +416,23 @@ def run(self): env, step_info, err_msg, stack_trace = None, None, None, None try: logger.info(f"Running experiment {self.exp_name} in:\n {self.exp_dir}") - agent = self.agent_args.make_agent() - if hasattr(agent, "set_task_name"): - agent.set_task_name(self.env_args.task_name) - - logger.debug("Agent created.") - - env = self.env_args.make_env( - action_mapping=agent.action_set.to_python_code, - exp_dir=self.exp_dir, - use_raw_page_output=getattr(self.agent_args, "use_raw_page_output", False), - ) + if isinstance(self.env_args, BrowserEnvArgs): + env = self.env_args.make_env(exp_dir=self.exp_dir) + logger.debug("Environment created.") + agent = self.agent_args.make_agent(known_actions=env.actions()) + logger.debug(f"Agent created with actions: {env.actions()}") + else: + agent = self.agent_args.make_agent() + if hasattr(agent, "set_task_name"): + agent.set_task_name(self.env_args.task_name) + logger.debug("Agent created.") + env = self.env_args.make_env( + action_mapping=agent.action_set.to_python_code, + exp_dir=self.exp_dir, + use_raw_page_output=getattr(self.agent_args, "use_raw_page_output", False), + ) + logger.debug("Environment created.") - logger.debug("Environment created.") step_info = StepInfo(step=0) episode_info = [step_info] step_info.from_reset( From 2e2b8a674c05b25f252fe89126ce3165217651ea Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 31 Oct 2025 19:18:36 +0000 Subject: [PATCH 03/61] miniwob config --- experiments/run_miniwob.py | 2 +- src/agentlab/agents/tapeagent/conf/miniwob.yaml | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) create mode 100644 src/agentlab/agents/tapeagent/conf/miniwob.yaml diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index 0f5bd0cf..f860ae9f 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) if __name__ == "__main__": - config = load_config("gaia_l1") + config = load_config("miniwob") study = make_study( benchmark=MiniWobBenchmark(backend=MCPPlaywright()), agent_args=TapeAgentArgs(agent_name=config.name, config=config), diff --git a/src/agentlab/agents/tapeagent/conf/miniwob.yaml b/src/agentlab/agents/tapeagent/conf/miniwob.yaml new file mode 100644 index 00000000..acc2c655 --- /dev/null +++ b/src/agentlab/agents/tapeagent/conf/miniwob.yaml @@ -0,0 +1,9 @@ +defaults: + - llm: gpt5-mini + - agent: plan_act + - _self_ + +name: miniwob_tapeagent +comment: MiniWob TapeAgent +parallel_backend: ray +n_jobs: 32 \ No newline at end of file From 630569a6172d6b50e049ce46c6a697aef4a4b62b Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 31 Oct 2025 19:21:05 +0000 Subject: [PATCH 04/61] llm config --- src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml diff --git a/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml b/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml new file mode 100644 index 00000000..84dbe3b3 --- /dev/null +++ b/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml @@ -0,0 +1,6 @@ +_target_: tapeagents.llms.LiteLLM +model_name: gpt-5-mini-2025-08-07 +use_cache: true +context_size: 128000 +parameters: + temperature: 1.0 \ No newline at end of file From 8be56ce86e388f2bdda0f9263698cdc44f4baa9a Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 3 Nov 2025 16:12:18 +0000 Subject: [PATCH 05/61] fixes, use firefox --- src/agentlab/actions.py | 47 +++++++++++++++++++ src/agentlab/agents/tapeagent/agent.py | 8 ++-- src/agentlab/backends/browser/env.py | 5 +- .../backends/browser/mcp_playwright.json | 2 +- .../backends/browser/mcp_playwright.py | 10 ++-- src/agentlab/experiments/loop.py | 2 +- 6 files changed, 64 insertions(+), 10 deletions(-) create mode 100644 src/agentlab/actions.py diff --git a/src/agentlab/actions.py b/src/agentlab/actions.py new file mode 100644 index 00000000..9aa3fa01 --- /dev/null +++ b/src/agentlab/actions.py @@ -0,0 +1,47 @@ +from bgym import AbstractActionSet +from tapeagents.tool_calling import FunctionCall, ToolCallAction, ToolSpec + +from agentlab.llm.llm_utils import parse_html_tags_raise + + +class ToolsActionSet(AbstractActionSet): + def __init__(self, actions:list[ToolSpec]): + self.actions = actions + + def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str: + tools_description = "\n".join([action.description() for action in self.actions]) + return tools_description + + def example_action(self, abstract: bool) -> str: + if abstract: + return """ +{ + "name": "", + "arguments": { + "": "", + "": "", + ... + } +} + +""" + else: + return """ +{ + "name": "browser_navigate", + "arguments": { + "url": "https://www.google.com" + } +} + +""" + @classmethod + def parse_action(cls, llm_output: str) -> ToolCallAction: + content_dict, valid, retry_message = parse_html_tags_raise(llm_output, keys=["action"]) + if not valid or "action" not in content_dict: + raise ValueError(f"Invalid action: llm_output: {llm_output}, retry_message: {retry_message}") + action_str = content_dict["action"] + return ToolCallAction(function=FunctionCall(name=action_str["name"], arguments=action_str["arguments"])) + + def to_python_code(self, action) -> str: + return action.model_dump_json(indent=2) \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py index 4844d46b..a0062801 100644 --- a/src/agentlab/agents/tapeagent/agent.py +++ b/src/agentlab/agents/tapeagent/agent.py @@ -41,12 +41,12 @@ def load_config(config_name: str) -> DictConfig: class TapeAgentArgs(AgentArgs): config: DictConfig = None # type: ignore - def make_agent(self, known_actions: tuple[ToolSpec, ...] | None) -> bgym.Agent: - if known_actions is None: + def make_agent(self, actions: tuple[ToolSpec, ...] | None) -> bgym.Agent: + if actions is None: agent = hydra.utils.instantiate(self.config.agent) else: - tools_description = "\n".join([action.description() for action in known_actions]) - agent = hydra.utils.instantiate(self.config.agent, known_actions=known_actions, tools_description=tools_description) + tools_description = "\n".join([action.description() for action in actions]) + agent = hydra.utils.instantiate(self.config.agent, known_actions=actions, tools_description=tools_description) return TapeAgent(agent=agent) diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 9661fde5..e0f911e2 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -7,6 +7,7 @@ from tapeagents.core import Action, Observation, StopStep from tapeagents.tool_calling import ToolCallAction, ToolSpec +from agentlab.actions import ToolsActionSet from agentlab.backends.browser.base import BrowserBackend from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs from agentlab.benchmarks.miniwob.task import AbstractWebTask @@ -41,7 +42,9 @@ def reset(self, seed: int): logger.info(f"Task reset result: {js_result_str}") return [GoalObservation(goal=js_result_str), PageObservation(content=page_content)], {} - def step(self, action: ToolCallAction) -> tuple[Observation, float, bool, bool, dict]: + def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, bool, dict]: + if isinstance(action, str): + action = ToolsActionSet.parse_action(action) logger.info(f"BrowserEnv.step() called with action {action.function.name}") action_exec_start = time.time() diff --git a/src/agentlab/backends/browser/mcp_playwright.json b/src/agentlab/backends/browser/mcp_playwright.json index f184712b..ad30c794 100644 --- a/src/agentlab/backends/browser/mcp_playwright.json +++ b/src/agentlab/backends/browser/mcp_playwright.json @@ -5,7 +5,7 @@ "args": [ "@playwright/mcp@latest", "--browser", - "chromium", + "firefox", "--headless", "--isolated" ], diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index bb4371b4..7c17a4c7 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -14,9 +14,13 @@ class MCPPlaywright(MCPBrowserBackend): def run_js(self, js: str): raw_response = self.call_tool("browser_evaluate", {"function": js}) - _, half_response = raw_response.split("### Result", maxsplit=1) - result_str, _ = half_response.split("\n### Ran", maxsplit=1) - result_str = result_str.strip() + try: + _, half_response = raw_response.split("### Result", maxsplit=1) + result_str, _ = half_response.split("\n### Ran", maxsplit=1) + result_str = result_str.strip() + except Exception as e: + logger.error(f"Error parsing JS result: {e}. Raw result: {raw_response}") + raise e return result_str def step(self, action: ToolCallAction) -> str: diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index 82bbc8ab..15a37048 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -419,7 +419,7 @@ def run(self): if isinstance(self.env_args, BrowserEnvArgs): env = self.env_args.make_env(exp_dir=self.exp_dir) logger.debug("Environment created.") - agent = self.agent_args.make_agent(known_actions=env.actions()) + agent = self.agent_args.make_agent(actions=env.actions()) logger.debug(f"Agent created with actions: {env.actions()}") else: agent = self.agent_args.make_agent() From 9acd97d00dce9815f1670cfdff0c77469cd9d764 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 3 Nov 2025 16:40:41 +0000 Subject: [PATCH 06/61] plan_react agent with function calling and sonnet llm --- .../conf/agent/plan_react_fcall.yaml | 57 +++++++++++++++++++ .../agents/tapeagent/conf/llm/sonnet.yaml | 6 ++ .../agents/tapeagent/conf/miniwob.yaml | 4 +- 3 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 src/agentlab/agents/tapeagent/conf/agent/plan_react_fcall.yaml create mode 100644 src/agentlab/agents/tapeagent/conf/llm/sonnet.yaml diff --git a/src/agentlab/agents/tapeagent/conf/agent/plan_react_fcall.yaml b/src/agentlab/agents/tapeagent/conf/agent/plan_react_fcall.yaml new file mode 100644 index 00000000..69788ed2 --- /dev/null +++ b/src/agentlab/agents/tapeagent/conf/agent/plan_react_fcall.yaml @@ -0,0 +1,57 @@ +_target_: tapeagents.agent.Agent +name : web_agent +max_iterations: 2 +llms: + default: ${llm} +templates: + system_prompt: | + You are an expert AI Agent trained to assist users with complex web tasks. + Your role is to understand user queries, perform actions and respond in a helpful and accurate manner. + Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. + Do not express emotions or opinions about user questions. + allowed_tools: | + You have access to the following tools: + {tools_description} + thought_format: | + Important! Respond with the plain text, do not include any JSON or code. + Do not output anything besides what I asked in this message. + +nodes: + - _target_: tapeagents.nodes.StandardNode + name: plan + system_prompt: ${agent.templates.system_prompt} + guidance: | + Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task. + Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet. + Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed. + Start with the title "Plan". + ${agent.templates.thought_format} + steps_prompt: ${agent.templates.allowed_tools} + + - _target_: tapeagents.nodes.StandardNode + name: reflect + system_prompt: ${agent.templates.system_prompt} + guidance: | + Produce the reasoning with a bullet-point list of thoughts strictly following the rules: + 1. Summarize the last observation and describe any webpage interactions/effects. + 2. Evaluate action success, explain impact on task/plan, and describe any errors with solutions. + 3. If the last action was not successful, ask yourself about the reasons for failure. + 4. List next steps to accomplish current plan step and propose next immediate action. + + Additional notes for web page observations: + - Accept cookie consents first + - Quote relevant observation parts verbatim + - Close popups before interacting + - If last action was not successful, check if the target element is visible, use scrolling if its not. + ${agent.templates.thought_format} + steps_prompt: ${agent.templates.allowed_tools} + + - _target_: tapeagents.nodes.StandardNode + name: act + system_prompt: ${agent.templates.system_prompt} + guidance: Produce an function call that performs the proposed step, if the task is complete, produce the final step. + steps: + - tapeagents.core.FinalStep + use_known_actions: true + use_function_calls: true + next_node: reflect \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/conf/llm/sonnet.yaml b/src/agentlab/agents/tapeagent/conf/llm/sonnet.yaml new file mode 100644 index 00000000..01120ec9 --- /dev/null +++ b/src/agentlab/agents/tapeagent/conf/llm/sonnet.yaml @@ -0,0 +1,6 @@ +_target_: tapeagents.llms.Claude +model_name: claude-sonnet-4-5-20250929 +use_cache: false +context_size: 128000 +parameters: + temperature: 0.1 \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/conf/miniwob.yaml b/src/agentlab/agents/tapeagent/conf/miniwob.yaml index acc2c655..1867cf20 100644 --- a/src/agentlab/agents/tapeagent/conf/miniwob.yaml +++ b/src/agentlab/agents/tapeagent/conf/miniwob.yaml @@ -1,6 +1,6 @@ defaults: - - llm: gpt5-mini - - agent: plan_act + - llm: sonnet + - agent: plan_react_fcall - _self_ name: miniwob_tapeagent From cfc85c6db489284bcbd8d7788229d2d406e97af7 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 3 Nov 2025 16:42:13 +0000 Subject: [PATCH 07/61] fixes --- experiments/run_miniwob.py | 5 ++++- experiments/test_mcp.py | 6 +++--- src/agentlab/backends/browser/env.py | 5 +++-- src/agentlab/benchmarks/miniwob/task.py | 1 + src/agentlab/benchmarks/web_task.py | 1 + 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index f860ae9f..12495960 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -1,6 +1,8 @@ import logging import os +from dotenv import load_dotenv + from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config from agentlab.backends.browser.mcp_playwright import MCPPlaywright from agentlab.benchmarks.miniwob import MiniWobBenchmark @@ -9,6 +11,7 @@ fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s" logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()]) logger = logging.getLogger(__name__) +load_dotenv() if __name__ == "__main__": config = load_config("miniwob") @@ -20,7 +23,7 @@ logging_level_stdout=logging.INFO, ) if os.environ.get("AGENTLAB_DEBUG"): - study.exp_args_list = study.exp_args_list[:3] + study.exp_args_list = study.exp_args_list[:1] study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential") else: study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend) diff --git a/experiments/test_mcp.py b/experiments/test_mcp.py index 34b9734a..09eb7469 100644 --- a/experiments/test_mcp.py +++ b/experiments/test_mcp.py @@ -1,6 +1,3 @@ -from tapeagents.environment import FunctionCall -from tapeagents.mcp import ToolCallAction - from agentlab.backends.browser.mcp_playwright import MCPPlaywright from agentlab.benchmarks.miniwob.task import get_miniwob_tasks @@ -11,6 +8,9 @@ def main(): setup_js = task.get_setup_js() backend = MCPPlaywright() + backend.initialize() + print(backend.actions()) + print("="*100) # 1. goto task url print("URL: ", task.url) diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index e0f911e2..784fd973 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -29,6 +29,7 @@ def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBacken self.task = task self.seed = seed self._turns = 0 + self.max_turns = task.max_turns self.backend = backend self.backend.initialize() @@ -45,7 +46,7 @@ def reset(self, seed: int): def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, bool, dict]: if isinstance(action, str): action = ToolsActionSet.parse_action(action) - logger.info(f"BrowserEnv.step() called with action {action.function.name}") + logger.info(f"BrowserEnv.step() called with action {action}") action_exec_start = time.time() finished = isinstance(action, StopStep) @@ -61,7 +62,7 @@ def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, if self.task.validate_per_step or finished or truncated: reward = self.calculate_reward(action, observation) else: - reward = None + reward = 0.0 env_info = { "step_metadata": observation.metadata, diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index e0d0b3e2..be925f11 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -16,6 +16,7 @@ class MiniWobTask(AbstractWebTask): url: str = None remove_human_display: bool = True episode_max_time: int = 1000000 + max_turns: int = 10 actions_whitelist: ClassVar[list[str]] = [ "browser_press_key", "browser_type", diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py index e8588d54..e8627519 100644 --- a/src/agentlab/benchmarks/web_task.py +++ b/src/agentlab/benchmarks/web_task.py @@ -9,6 +9,7 @@ class AbstractWebTask(BaseModel): url: str validate_per_step: bool = False actions_whitelist: ClassVar[list[str]] = [] + max_turns: int = 100 @classmethod def filter_actions(cls, actions: list[ToolSpec]) -> list[str]: From f278c0f1be055383bcb5d0873114189471f11625 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 13:14:27 +0000 Subject: [PATCH 08/61] fix done state parsing --- .../backends/browser/mcp_playwright.py | 6 ++- src/agentlab/benchmarks/miniwob/task.py | 44 ++++++++++++------- 2 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 7c17a4c7..212e9c7b 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -26,8 +26,10 @@ def run_js(self, js: str): def step(self, action: ToolCallAction) -> str: tool_result = self._call_mcp(action) logger.info(f"Tool result: {tool_result}") - snapshot = self.call_tool("browser_snapshot", {}) - return snapshot + return tool_result + + def page_snapshot(self) -> str: + return self.call_tool("browser_snapshot", {}) def goto(self, url: str) -> str: tool_result = self.call_tool("browser_navigate", {"url": url}) diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index be925f11..5ff528f1 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -1,4 +1,4 @@ - +import logging import os from typing import Any, ClassVar @@ -6,6 +6,8 @@ from agentlab.benchmarks.web_task import AbstractWebTask +logger = logging.getLogger(__name__) + class MiniWobTask(AbstractWebTask): dataset: str = "miniwob" @@ -17,10 +19,10 @@ class MiniWobTask(AbstractWebTask): remove_human_display: bool = True episode_max_time: int = 1000000 max_turns: int = 10 + validate_per_step: bool = True actions_whitelist: ClassVar[list[str]] = [ "browser_press_key", "browser_type", - "browser_navigate", "browser_click", "browser_drag", "browser_hover", @@ -29,9 +31,10 @@ class MiniWobTask(AbstractWebTask): def model_post_init(self, __context: Any): self.url = f"{self.base_url}/{self.subdomain}.html" - + def get_setup_js(self) -> str: if self.remove_human_display: + logger.info("Remove human display") js = r""" let __display_ids = ['reward-display', 'click-canvas', 'sync-task-cover']; let __display_divs = {}; @@ -93,10 +96,12 @@ def get_setup_js(self) -> str: Math.seedrandom(42); core.EPISODE_MAX_TIME = {self.episode_max_time}; core.startEpisodeReal(); +start_time = Date.now(); while (!WOB_TASK_READY) {{ await new Promise(resolve => setTimeout(resolve, 100)); }} -return core.getUtterance(); +ready_time = Date.now(); +return {{'goal': core.getUtterance(), 'done': WOB_DONE_GLOBAL, 'task_start_time': ready_time - start_time}}; """ return f"async () => {{{js}}}" @@ -113,29 +118,34 @@ def get_task_validate_js(self) -> str: return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY]; }""" - def parse_validation_result(self, validation_result: str) -> tuple[float, dict]: + logger.info(f"Validation result: {validation_result}") chunks = [c.strip() for c in validation_result.split(",")] raw_reward = float(chunks[1]) - done = bool(chunks[3]) + done = chunks[3].strip().lower() == "true" reward = float(raw_reward > 0) return reward, { - "raw_reward": raw_reward, - "reward_reason": chunks[2], - "done": done, + "raw_reward": raw_reward, + "reward_reason": chunks[2], + "done": done, } -def get_miniwob_tasks(base_url: str | None = None, remove_human_display: bool = True, episode_max_time: int = 1000000) -> list[MiniWobTask]: + +def get_miniwob_tasks( + base_url: str | None = None, remove_human_display: bool = True, episode_max_time: int = 1000000 +) -> list[MiniWobTask]: if base_url is None: base_url = os.environ.get("MINIWOB_URL") if base_url is None: raise ValueError("MINIWOB_URL environment variable is not set") return [ MiniWobTask( - task_id=task.subdomain, - desc=task.desc, - subdomain=task.subdomain, - base_url=base_url, - remove_human_display=remove_human_display, - episode_max_time=episode_max_time) for task in ALL_MINIWOB_TASKS - ] \ No newline at end of file + task_id=task.subdomain, + desc=task.desc, + subdomain=task.subdomain, + base_url=base_url, + remove_human_display=remove_human_display, + episode_max_time=episode_max_time, + ) + for task in ALL_MINIWOB_TASKS + ] From 4e27c3a0e829a331dd0de8e8cb2011272b3080a9 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 13:17:26 +0000 Subject: [PATCH 09/61] fixes --- experiments/run_miniwob.py | 21 ++++- src/agentlab/actions.py | 41 ++++++++-- .../agents/generic_agent/generic_agent.py | 19 ++++- src/agentlab/backends/browser/base.py | 18 ++++- src/agentlab/backends/browser/env.py | 80 ++++++++++++++----- src/agentlab/benchmarks/miniwob/__init__.py | 2 +- src/agentlab/benchmarks/miniwob/benchmark.py | 7 ++ 7 files changed, 150 insertions(+), 38 deletions(-) diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index 12495960..36aa5212 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -1,12 +1,16 @@ import logging import os +from bgym import DEFAULT_BENCHMARKS from dotenv import load_dotenv +from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_4o +from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config from agentlab.backends.browser.mcp_playwright import MCPPlaywright from agentlab.benchmarks.miniwob import MiniWobBenchmark from agentlab.experiments.study import make_study +from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s" logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()]) @@ -15,15 +19,26 @@ if __name__ == "__main__": config = load_config("miniwob") + + # benchmark = DEFAULT_BENCHMARKS["miniwob"]() + agent_args = GenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"], + flags=FLAGS_GPT_4o, + ) + + benchmark = MiniWobBenchmark(backend=MCPPlaywright()) + # agent_args =TapeAgentArgs(agent_name=config.name, config=config) + + study = make_study( - benchmark=MiniWobBenchmark(backend=MCPPlaywright()), - agent_args=TapeAgentArgs(agent_name=config.name, config=config), + benchmark=benchmark, + agent_args=agent_args, comment=config.comment, logging_level=logging.INFO, logging_level_stdout=logging.INFO, ) if os.environ.get("AGENTLAB_DEBUG"): - study.exp_args_list = study.exp_args_list[:1] + study.exp_args_list = study.exp_args_list[1:2] study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential") else: study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend) diff --git a/src/agentlab/actions.py b/src/agentlab/actions.py index 9aa3fa01..b009f27d 100644 --- a/src/agentlab/actions.py +++ b/src/agentlab/actions.py @@ -1,15 +1,27 @@ +import json +import logging + from bgym import AbstractActionSet from tapeagents.tool_calling import FunctionCall, ToolCallAction, ToolSpec from agentlab.llm.llm_utils import parse_html_tags_raise +logger = logging.getLogger(__name__) + class ToolsActionSet(AbstractActionSet): - def __init__(self, actions:list[ToolSpec]): + multiaction: bool = False + strict: bool = False + + def __init__(self, actions: list[ToolSpec]): self.actions = actions def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str: - tools_description = "\n".join([action.description() for action in self.actions]) + descs = [] + for action in self.actions: + desc = f"## {action.description()}.\n Schema: {action.model_dump_json(indent=2)}" + descs.append(desc) + tools_description = "\n".join(descs) return tools_description def example_action(self, abstract: bool) -> str: @@ -35,13 +47,26 @@ def example_action(self, abstract: bool) -> str: } """ + @classmethod def parse_action(cls, llm_output: str) -> ToolCallAction: - content_dict, valid, retry_message = parse_html_tags_raise(llm_output, keys=["action"]) - if not valid or "action" not in content_dict: - raise ValueError(f"Invalid action: llm_output: {llm_output}, retry_message: {retry_message}") - action_str = content_dict["action"] - return ToolCallAction(function=FunctionCall(name=action_str["name"], arguments=action_str["arguments"])) + logger.info(f"Parsing action: {llm_output}") + if "" in llm_output: + content_dict, valid, retry_message = parse_html_tags_raise(llm_output, keys=["action"]) + if not valid or "action" not in content_dict: + raise ValueError( + f"Invalid action: llm_output: {llm_output}, retry_message: {retry_message}" + ) + action_str = content_dict["action"] + else: + action_str = llm_output + try: + action_dict = json.loads(action_str) + except json.JSONDecodeError: + raise ValueError(f"Failed to parse action: {action_str}") + return ToolCallAction( + function=FunctionCall(name=action_dict["name"], arguments=action_dict["arguments"]) + ) def to_python_code(self, action) -> str: - return action.model_dump_json(indent=2) \ No newline at end of file + return action diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index d1f48f76..74a1a3f7 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -16,7 +16,9 @@ import bgym from bgym import Benchmark from browsergym.experiments.agent import Agent, AgentInfo +from tapeagents.tool_calling import ToolSpec +from agentlab.actions import ToolsActionSet from agentlab.agents import dynamic_prompting as dp from agentlab.agents.agent_args import AgentArgs from agentlab.llm.chat_api import BaseModelArgs @@ -65,9 +67,12 @@ def prepare(self): def close(self): return self.chat_model_args.close_server() - def make_agent(self): + def make_agent(self, actions: list[ToolSpec] | None = None): return GenericAgent( - chat_model_args=self.chat_model_args, flags=self.flags, max_retry=self.max_retry + chat_model_args=self.chat_model_args, + flags=self.flags, + max_retry=self.max_retry, + actions=actions, ) @@ -78,6 +83,7 @@ def __init__( chat_model_args: BaseModelArgs, flags: GenericPromptFlags, max_retry: int = 4, + actions: list[ToolSpec] | None = None, ): self.chat_llm = chat_model_args.make_model() @@ -85,8 +91,13 @@ def __init__( self.max_retry = max_retry self.flags = flags - self.action_set = self.flags.action.action_set.make_action_set() - self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs) + if actions is not None: + self.action_set = ToolsActionSet(actions=actions) + self.flags.action.action_set = self.action_set + self._obs_preprocessor = lambda obs: obs + else: + self.action_set = self.flags.action.action_set.make_action_set() + self._obs_preprocessor = dp.make_obs_preprocessor(flags.obs) self._check_flag_constancy() self.reset(seed=None) diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index 636eb3fe..619a5a84 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -13,18 +13,23 @@ def run_js(self, js: str): def goto(self, url: str) -> str: raise NotImplementedError + def page_snapshot(self) -> str: + raise NotImplementedError + def step(self, action: ToolCallAction) -> str: raise NotImplementedError def actions(self) -> tuple[ToolSpec]: raise NotImplementedError + def close(self) -> None: + raise NotImplementedError class MCPBrowserBackend(BrowserBackend): config_path: str _mcp = None - + def initialize(self) -> None: self._mcp = MCPEnvironment(config_path=self.config_path) self._mcp.initialize() @@ -33,12 +38,17 @@ def step(self, action: ToolCallAction) -> str: return self._call_mcp(action) def call_tool(self, tool_name: str, arguments: dict) -> str: - return self._call_mcp(ToolCallAction(function=FunctionCall(name=tool_name, arguments=arguments))) - + return self._call_mcp( + ToolCallAction(function=FunctionCall(name=tool_name, arguments=arguments)) + ) + def _call_mcp(self, action: ToolCallAction) -> str: tool_result = self._mcp.step(action) texts = [c.text for c in tool_result.content.content] return "\n\n".join(texts) def actions(self) -> tuple[ToolSpec]: - return self._mcp.actions() \ No newline at end of file + return self._mcp.actions() + + def close(self) -> None: + self._mcp.close() diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 784fd973..2ff7b045 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -1,3 +1,4 @@ +import json import logging import time from dataclasses import dataclass @@ -10,21 +11,25 @@ from agentlab.actions import ToolsActionSet from agentlab.backends.browser.base import BrowserBackend from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs -from agentlab.benchmarks.miniwob.task import AbstractWebTask +from agentlab.benchmarks.web_task import AbstractWebTask logger = logging.getLogger(__name__) + class GoalObservation(Observation): kind: Literal["goal_observation"] = "goal_observation" goal: str + class PageObservation(Observation): kind: Literal["page_observation"] = "page_observation" content: str class BrowserEnv(AbstractEnv): - def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0): + def __init__( + self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0 + ): self.task_name = task_name self.task = task self.seed = seed @@ -32,16 +37,34 @@ def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBacken self.max_turns = task.max_turns self.backend = backend self.backend.initialize() + self.goal = "" def reset(self, seed: int): self.seed = seed logger.info(f"Open task URL: {self.task.url}") - page_content = self.backend.goto(self.task.url) + self.backend.goto(self.task.url) setup_js = self.task.get_setup_js() if setup_js: - js_result_str = self.backend.run_js(setup_js) - logger.info(f"Task reset result: {js_result_str}") - return [GoalObservation(goal=js_result_str), PageObservation(content=page_content)], {} + js_out = self.backend.run_js(setup_js) + out_dict = json.loads(js_out) + logger.info(f"Task setup result: {out_dict}") + goal = out_dict["goal"] + done = out_dict["done"] + task_start_time = out_dict["task_start_time"] + logger.info(f"Task start time: {task_start_time}") + if done: + raise ValueError("Task is already done") + self.goal = goal + logger.info(f"Task goal: {self.goal}") + page_content = self.backend.page_snapshot() + logger.info(f"Initial obs: {page_content}") + return { + "goal_object": [{"type": "text", "text": self.goal}], + "pruned_html": page_content, + "axtree_txt": "", + "last_action_error": "", + "focused_element_bid": "none", + }, {} def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, bool, dict]: if isinstance(action, str): @@ -51,49 +74,67 @@ def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, action_exec_start = time.time() finished = isinstance(action, StopStep) if finished: - observation = Observation() # empty observation + observation = { + "goal_object": [{"type": "text", "text": self.goal}], + "pruned_html": "Task finished", + "axtree_txt": "", + "last_action_error": "", + "focused_element_bid": "none", + } else: observation = self._step(action) action_exec_stop = time.time() self._turns += 1 + logger.info(f"Obs:\n{observation['pruned_html']}") truncated = self._turns >= self.max_turns if self.task.validate_per_step or finished or truncated: - reward = self.calculate_reward(action, observation) + reward, other = self.calculate_reward(action, observation) + if other.get("done", False): + finished = True else: reward = 0.0 + other = {} env_info = { - "step_metadata": observation.metadata, "action_exec_start": action_exec_start, "action_exec_stop": action_exec_stop, "action_exec_timeout": 0.0, - } + } | other obs_view = observation.short_view() if isinstance(observation, Observation) else observation logger.info(f"Action result in observation: {obs_view}") return observation, reward, finished, truncated, env_info - def _step(self, action: ToolCallAction) -> PageObservation: + def _step(self, action: ToolCallAction) -> dict: tool_result = self.backend.step(action) - return PageObservation(content=tool_result) + return { + "goal_object": [{"type": "text", "text": self.goal}], + "pruned_html": tool_result, + "axtree_txt": "", + "last_action_error": "", + "focused_element_bid": "none", + } - def calculate_reward(self, action: Action, observation: PageObservation) -> float: + def calculate_reward(self, action: Action, observation: PageObservation) -> tuple[float, dict]: validate_js = self.task.get_step_validate_js() validate_result = self.backend.run_js(validate_js) reward, other = self.task.parse_validation_result(validate_result) - return reward + return reward, other def close(self): teardown_js = self.task.get_teardown_js() if teardown_js: js_result_str = self.backend.run_js(teardown_js) logger.info(f"Task teardown result: {js_result_str}") + self.backend.close() def actions(self) -> list[ToolSpec]: all_actions = self.backend.actions() filtered_actions = self.task.filter_actions(all_actions) - logger.info(f"Filtered {len(filtered_actions)} actions out of {len(all_actions)} for task {self.task.dataset}") + logger.info( + f"Filtered {len(filtered_actions)} actions out of {len(all_actions)} for task {self.task.dataset}" + ) return filtered_actions @@ -104,13 +145,16 @@ class BrowserEnvArgs(AbstractEnvArgs): task_name: str backend: BrowserBackend - def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, task_seed: int = 0): + def __init__( + self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, task_seed: int = 0 + ): self.task_name = task_name self.task = task self.task_seed = task_seed self.backend = backend def make_env(self, exp_dir: Path) -> BrowserEnv: - env = BrowserEnv(task_name=self.task_name, task=self.task, backend=self.backend, seed=self.task_seed) + env = BrowserEnv( + task_name=self.task_name, task=self.task, backend=self.backend, seed=self.task_seed + ) return env - diff --git a/src/agentlab/benchmarks/miniwob/__init__.py b/src/agentlab/benchmarks/miniwob/__init__.py index 558ed21e..7b2add6f 100644 --- a/src/agentlab/benchmarks/miniwob/__init__.py +++ b/src/agentlab/benchmarks/miniwob/__init__.py @@ -1,4 +1,4 @@ from .benchmark import MiniWobBenchmark from .task import MiniWobTask -__all__ = ["MiniWobBenchmark", "MiniWobTask"] \ No newline at end of file +__all__ = ["MiniWobBenchmark", "MiniWobTask"] diff --git a/src/agentlab/benchmarks/miniwob/benchmark.py b/src/agentlab/benchmarks/miniwob/benchmark.py index 2851ef29..bda95d66 100644 --- a/src/agentlab/benchmarks/miniwob/benchmark.py +++ b/src/agentlab/benchmarks/miniwob/benchmark.py @@ -1,6 +1,9 @@ import logging from typing import Any +from pydantic import ConfigDict, Field + +from agentlab.actions import ToolsActionSet from agentlab.backends.browser.base import BrowserBackend from agentlab.backends.browser.env import BrowserEnvArgs from agentlab.benchmarks.abstract_env import AbstractBenchmark @@ -10,10 +13,14 @@ class MiniWobBenchmark(AbstractBenchmark): + model_config = ConfigDict(arbitrary_types_allowed=True) + backend: BrowserBackend name: str = "miniwob" env_args_list: list[BrowserEnvArgs] = None # type: ignore dataset: list[MiniWobTask] = None # type: ignore + is_multi_tab: bool = False + high_level_action_set_args: ToolsActionSet = None def model_post_init(self, __context: Any) -> None: self.env_args_list = [] From d1953d2122e50edb2a0cbd53de842ae8726c07ac Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 15:24:50 +0000 Subject: [PATCH 10/61] refactor loop step_info --- src/agentlab/experiments/loop.py | 136 ++++++++++++++----------------- 1 file changed, 61 insertions(+), 75 deletions(-) diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index 15a37048..ad527eb2 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -197,51 +197,20 @@ class StepInfo: profiling: StepTimestamps = field(default_factory=StepTimestamps) task_info: dict = None - def from_step(self, env: gym.Env, action: str, obs_preprocessor: callable): - t = self.profiling - t.env_start = time.time() - self.obs, self.reward, self.terminated, self.truncated, env_info = env.step(action) - t.env_stop = time.time() - + def add_action_result(self, action_result: tuple[dict, float, bool, bool, dict]): + self.obs, self.reward, self.terminated, self.truncated, env_info = action_result self.task_info = env_info.get("task_info", None) - self.raw_reward = env_info.get("RAW_REWARD_GLOBAL", None) - t.action_exec_start = env_info["action_exec_start"] # start - t.action_exect_after_timeout = env_info["action_exec_stop"] - t.action_exec_stop = env_info["action_exec_stop"] - env_info["action_exec_timeout"] - t.wait_for_page_loading_start = env_info.get("wait_for_page_loading_start", None) - t.wait_for_page_loading_stop = env_info.get("wait_for_page_loading_stop", None) - t.validation_start = env_info.get("validation_start", None) - t.validation_stop = env_info.get("validation_stop", None) - t.get_observation_start = env_info.get("get_observation_start", None) - t.get_observation_stop = env_info.get("get_observation_stop", None) - - if obs_preprocessor: - self.obs = obs_preprocessor(self.obs) - - def from_action(self, agent: Agent): - self.profiling.agent_start = time.time() - self.action, self.agent_info = agent.get_action(self.obs.copy()) - self.profiling.agent_stop = time.time() - - self.make_stats() - - return self.action - - def from_reset(self, env: gym.Env, seed: int, obs_preprocessor: callable): - t = self.profiling - t.env_start = time.time() - self.obs, env_info = env.reset(seed=seed) - self.reward, self.terminated, self.truncated = 0, False, False - t.env_stop = time.time() - - t.action_exec_start = env_info.get("recording_start_time", t.env_start) - t.action_exect_after_timeout = t.env_stop - t.action_exec_stop = t.env_stop - - if obs_preprocessor: - self.obs = obs_preprocessor(self.obs) + self.profiling.action_exec_start = env_info.get("action_exec_start", None) + self.profiling.action_exect_after_timeout = env_info["action_exec_stop"] + self.profiling.action_exec_stop = env_info["action_exec_stop"] - env_info["action_exec_timeout"] + self.profiling.wait_for_page_loading_start = env_info.get("wait_for_page_loading_start", None) + self.profiling.wait_for_page_loading_stop = env_info.get("wait_for_page_loading_stop", None) + self.profiling.validation_start = env_info.get("validation_start", None) + self.profiling.validation_stop = env_info.get("validation_stop", None) + self.profiling.get_observation_start = env_info.get("get_observation_start", None) + self.profiling.get_observation_stop = env_info.get("get_observation_stop", None) @property def is_done(self): @@ -264,7 +233,7 @@ def make_stats(self): self.stats = stats - def save_step_info(self, exp_dir, save_json=False, save_screenshot=True, save_som=False): + def save(self, exp_dir, save_json=False, save_screenshot=True, save_som=False): # special treatment for some of the observation fields if isinstance(self.obs, dict): # save screenshots to separate files @@ -291,14 +260,15 @@ def save_step_info(self, exp_dir, save_json=False, save_screenshot=True, save_so with gzip.open(exp_dir / f"step_{self.step}.pkl.gz", "wb") as f: pickle.dump(self, f) + logger.debug("Step info saved.") if save_json: with open(exp_dir / "steps_info.json", "w") as f: json.dump(self, f, indent=4, cls=DataclassJSONEncoder) + logger.debug("Step info saved to JSON.") if isinstance(self.obs, dict): # add the screenshots back to the obs - # why do we need this? if screenshot is not None: self.obs["screenshot"] = screenshot if screenshot_som is not None: @@ -416,57 +386,48 @@ def run(self): env, step_info, err_msg, stack_trace = None, None, None, None try: logger.info(f"Running experiment {self.exp_name} in:\n {self.exp_dir}") - if isinstance(self.env_args, BrowserEnvArgs): - env = self.env_args.make_env(exp_dir=self.exp_dir) - logger.debug("Environment created.") - agent = self.agent_args.make_agent(actions=env.actions()) - logger.debug(f"Agent created with actions: {env.actions()}") - else: - agent = self.agent_args.make_agent() - if hasattr(agent, "set_task_name"): - agent.set_task_name(self.env_args.task_name) - logger.debug("Agent created.") - env = self.env_args.make_env( - action_mapping=agent.action_set.to_python_code, - exp_dir=self.exp_dir, - use_raw_page_output=getattr(self.agent_args, "use_raw_page_output", False), - ) - logger.debug("Environment created.") + env, agent = self.create_env_and_agent() step_info = StepInfo(step=0) - episode_info = [step_info] - step_info.from_reset( - env, seed=self.env_args.task_seed or 0, obs_preprocessor=agent.obs_preprocessor - ) + step_info.profiling.env_start = time.time() + self.obs, env_info = env.reset(seed=self.env_args.task_seed or 0) + step_info.profiling.env_stop = time.time() + step_info.task_info = env_info.get("task_info", None) + if agent.obs_preprocessor: + step_info.obs = agent.obs_preprocessor(step_info.obs) logger.debug("Environment reset.") while not step_info.is_done: # set a limit logger.debug(f"Starting step {step_info.step}.") - action = step_info.from_action(agent) + step_info.profiling.agent_start = time.time() + action, step_info.agent_info = agent.get_action(step_info.obs.copy()) + step_info.profiling.agent_stop = time.time() logger.debug(f"Agent chose action:\n {action}") if action is None: # will end the episode after saving the step info. step_info.truncated = True - step_info.save_step_info( - self.exp_dir, save_screenshot=self.save_screenshot, save_som=self.save_som - ) - logger.debug("Step info saved.") + step_info.save(self.exp_dir, self.save_screenshot, self.save_som) - if hasattr(env.unwrapped, "chat") and isinstance(env.unwrapped.chat, Chat): - _send_chat_info(env.unwrapped.chat, action, step_info.agent_info) - logger.debug("Chat info sent.") + self.maybe_send_chat(env, action, step_info) if action is None: logger.debug("Agent returned None action. Ending episode.") break - step_info = StepInfo(step=step_info.step + 1) episode_info.append(step_info) + # --- End of (obs, action, reward) step, start a new one --- + + step_info = StepInfo(step=step_info.step + 1) logger.debug("Sending action to environment.") - step_info.from_step(env, action, obs_preprocessor=agent.obs_preprocessor) + step_info.profiling.env_start = time.time() + action_result = env.step(action) + step_info.profiling.env_stop = time.time() + step_info.add_action_result(action_result) + if agent.obs_preprocessor: + step_info.obs = agent.obs_preprocessor(step_info.obs) logger.debug("Environment stepped.") if step_info.is_done: logger.debug( @@ -488,7 +449,7 @@ def run(self): finally: try: if step_info is not None: - step_info.save_step_info( + step_info.save( self.exp_dir, save_screenshot=self.save_screenshot, save_som=self.save_som ) except Exception as e: @@ -518,6 +479,30 @@ def run(self): except Exception as e: logger.exception(f"Error while unsetting the logger: {e}") + def create_env_and_agent(self) -> tuple[gym.Env, Agent]: + if isinstance(self.env_args, BrowserEnvArgs): + env = self.env_args.make_env(exp_dir=self.exp_dir) + logger.debug("Environment created.") + agent = self.agent_args.make_agent(actions=env.actions()) + logger.debug(f"Agent created with actions: {env.actions()}") + else: + agent = self.agent_args.make_agent() + if hasattr(agent, "set_task_name"): + agent.set_task_name(self.env_args.task_name) + logger.debug("Agent created.") + env = self.env_args.make_env( + action_mapping=agent.action_set.to_python_code, + exp_dir=self.exp_dir, + use_raw_page_output=getattr(self.agent_args, "use_raw_page_output", False), + ) + logger.debug("Environment created.") + return env, agent + + def maybe_send_chat(self, env: gym.Env, action: str, step_info: StepInfo): + if hasattr(env.unwrapped, "chat") and isinstance(env.unwrapped.chat, Chat): + _send_chat_info(env.unwrapped.chat, action, step_info.agent_info) + logger.debug("Chat info sent.") + def _set_logger(self): # output logging traces to a log file file_handler = logging.FileHandler(self.exp_dir / "experiment.log") @@ -618,6 +603,7 @@ def _aggregate_episode_stats(episode_info: list[StepInfo]): stats = defaultdict(list) for step_info in episode_info: + step_info.make_stats() if step_info.stats is not None: for key, val in step_info.stats.items(): if val is None: From 5656d0be62d7727040ce2057c2f89e3a351889fa Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 15:27:03 +0000 Subject: [PATCH 11/61] return page snapshot to mcp playwright results --- src/agentlab/backends/browser/mcp_playwright.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 212e9c7b..15969805 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -25,8 +25,8 @@ def run_js(self, js: str): def step(self, action: ToolCallAction) -> str: tool_result = self._call_mcp(action) - logger.info(f"Tool result: {tool_result}") - return tool_result + snapshot = self.page_snapshot() + return f"{tool_result}\n{snapshot}" def page_snapshot(self) -> str: return self.call_tool("browser_snapshot", {}) From b06c4e2b091d378f86984ce5b63dea668ff9875f Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 16:45:04 +0000 Subject: [PATCH 12/61] fix loop --- src/agentlab/experiments/loop.py | 41 +++++++++++++++++++------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index ad527eb2..865fb178 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -204,8 +204,12 @@ def add_action_result(self, action_result: tuple[dict, float, bool, bool, dict]) self.profiling.action_exec_start = env_info.get("action_exec_start", None) self.profiling.action_exect_after_timeout = env_info["action_exec_stop"] - self.profiling.action_exec_stop = env_info["action_exec_stop"] - env_info["action_exec_timeout"] - self.profiling.wait_for_page_loading_start = env_info.get("wait_for_page_loading_start", None) + self.profiling.action_exec_stop = ( + env_info["action_exec_stop"] - env_info["action_exec_timeout"] + ) + self.profiling.wait_for_page_loading_start = env_info.get( + "wait_for_page_loading_start", None + ) self.profiling.wait_for_page_loading_stop = env_info.get("wait_for_page_loading_stop", None) self.profiling.validation_start = env_info.get("validation_start", None) self.profiling.validation_stop = env_info.get("validation_stop", None) @@ -233,7 +237,7 @@ def make_stats(self): self.stats = stats - def save(self, exp_dir, save_json=False, save_screenshot=True, save_som=False): + def save(self, exp_dir, save_screenshot=True, save_som=False, save_json=False): # special treatment for some of the observation fields if isinstance(self.obs, dict): # save screenshots to separate files @@ -241,11 +245,17 @@ def save(self, exp_dir, save_json=False, save_screenshot=True, save_som=False): screenshot_som = self.obs.pop("screenshot_som", None) if save_screenshot and screenshot is not None: - img = Image.fromarray(screenshot) + if isinstance(screenshot, Image.Image): + img = screenshot + else: + img = Image.fromarray(screenshot) img.save(exp_dir / f"screenshot_step_{self.step}.png") if save_som and screenshot_som is not None: - img = Image.fromarray(screenshot_som) + if isinstance(screenshot_som, Image.Image): + img = screenshot_som + else: + img = Image.fromarray(screenshot_som) img.save(exp_dir / f"screenshot_som_step_{self.step}.png") # save goal object (which might contain images) to a separate file to save space @@ -390,7 +400,7 @@ def run(self): step_info = StepInfo(step=0) step_info.profiling.env_start = time.time() - self.obs, env_info = env.reset(seed=self.env_args.task_seed or 0) + step_info.obs, env_info = env.reset(seed=self.env_args.task_seed or 0) step_info.profiling.env_stop = time.time() step_info.task_info = env_info.get("task_info", None) if agent.obs_preprocessor: @@ -402,7 +412,9 @@ def run(self): step_info.profiling.agent_start = time.time() action, step_info.agent_info = agent.get_action(step_info.obs.copy()) step_info.profiling.agent_stop = time.time() - logger.debug(f"Agent chose action:\n {action}") + if step_info.agent_info.get("think", None): + logger.info(f"Agent thought: {step_info.agent_info['think']}") + logger.debug(f"Agent action:\n {action}") if action is None: # will end the episode after saving the step info. @@ -412,10 +424,6 @@ def run(self): self.maybe_send_chat(env, action, step_info) - if action is None: - logger.debug("Agent returned None action. Ending episode.") - break - episode_info.append(step_info) # --- End of (obs, action, reward) step, start a new one --- @@ -433,6 +441,8 @@ def run(self): logger.debug( f"Episode done: terminated: {step_info.terminated}, truncated: {step_info.truncated}." ) + episode_info.append(step_info) + break except Exception as e: err_msg = f"Exception uncaught by agent or environment in task {self.env_args.task_name}.\n{type(e).__name__}:\n{e}" @@ -449,9 +459,7 @@ def run(self): finally: try: if step_info is not None: - step_info.save( - self.exp_dir, save_screenshot=self.save_screenshot, save_som=self.save_som - ) + step_info.save(self.exp_dir, self.save_screenshot, self.save_som) except Exception as e: logger.error(f"Error while saving step info in the finally block: {e}") try: @@ -460,8 +468,7 @@ def run(self): and len(episode_info) > 0 and not (episode_info[-1].terminated or episode_info[-1].truncated) ): - e = KeyboardInterrupt("Early termination??") - err_msg = f"Exception uncaught by agent or environment in task {self.env_args.task_name}.\n{type(e).__name__}:\n{e}" + err_msg = "Last step in episode was not terminated or truncated." logger.info("Saving experiment info.") self.save_summary_info(episode_info, Path(self.exp_dir), err_msg, stack_trace) if TapeAgent is not None and isinstance(agent, TapeAgent): @@ -508,7 +515,7 @@ def _set_logger(self): file_handler = logging.FileHandler(self.exp_dir / "experiment.log") file_handler.setLevel(self.logging_level) # same level as console outputs formatter = logging.Formatter( - "%(asctime)s - %(process)d - %(name)s - %(levelname)s - %(message)s" + "%(asctime)s - %(process)d - %(name)s:%(lineno)d - %(levelname)s - %(message)s" ) file_handler.setFormatter(formatter) # output handler From f5ad036e65c5c648185faa43b3950c282dbd92a4 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 16:46:24 +0000 Subject: [PATCH 13/61] vision support --- src/agentlab/backends/browser/base.py | 22 ++++++++---- src/agentlab/backends/browser/env.py | 26 +++++--------- .../backends/browser/mcp_playwright.json | 4 ++- .../backends/browser/mcp_playwright.py | 36 +++++++++++++++---- src/agentlab/benchmarks/miniwob/task.py | 6 ++-- 5 files changed, 59 insertions(+), 35 deletions(-) diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index 619a5a84..774011a8 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -1,7 +1,13 @@ +import logging + +from mcp.types import ImageContent, TextContent +from PIL import Image from pydantic import BaseModel from tapeagents.mcp import MCPEnvironment from tapeagents.tool_calling import FunctionCall, ToolCallAction, ToolSpec +logger = logging.getLogger(__name__) + class BrowserBackend(BaseModel): def initialize(self) -> None: @@ -16,6 +22,9 @@ def goto(self, url: str) -> str: def page_snapshot(self) -> str: raise NotImplementedError + def page_screenshot(self) -> Image: + raise NotImplementedError + def step(self, action: ToolCallAction) -> str: raise NotImplementedError @@ -34,18 +43,19 @@ def initialize(self) -> None: self._mcp = MCPEnvironment(config_path=self.config_path) self._mcp.initialize() - def step(self, action: ToolCallAction) -> str: - return self._call_mcp(action) + def step(self, action: ToolCallAction) -> dict: + contents = self._call_mcp(action) + text = "\n".join([c.text for c in contents if c.type == "text"]) + return {"pruned_html": text, "axtree_txt": text} - def call_tool(self, tool_name: str, arguments: dict) -> str: + def call_tool(self, tool_name: str, arguments: dict) -> list[TextContent | ImageContent]: return self._call_mcp( ToolCallAction(function=FunctionCall(name=tool_name, arguments=arguments)) ) - def _call_mcp(self, action: ToolCallAction) -> str: + def _call_mcp(self, action: ToolCallAction) -> list[TextContent | ImageContent]: tool_result = self._mcp.step(action) - texts = [c.text for c in tool_result.content.content] - return "\n\n".join(texts) + return tool_result.content.content def actions(self) -> tuple[ToolSpec]: return self._mcp.actions() diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 2ff7b045..44dfd90f 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -45,23 +45,16 @@ def reset(self, seed: int): self.backend.goto(self.task.url) setup_js = self.task.get_setup_js() if setup_js: - js_out = self.backend.run_js(setup_js) - out_dict = json.loads(js_out) - logger.info(f"Task setup result: {out_dict}") - goal = out_dict["goal"] - done = out_dict["done"] - task_start_time = out_dict["task_start_time"] - logger.info(f"Task start time: {task_start_time}") - if done: - raise ValueError("Task is already done") - self.goal = goal + self.goal = self.backend.run_js(setup_js) logger.info(f"Task goal: {self.goal}") page_content = self.backend.page_snapshot() - logger.info(f"Initial obs: {page_content}") + screenshot = self.backend.page_screenshot() + logger.info(f"Initial obs: {page_content}\n{screenshot}") return { "goal_object": [{"type": "text", "text": self.goal}], "pruned_html": page_content, - "axtree_txt": "", + "axtree_txt": page_content, + "screenshot": screenshot, "last_action_error": "", "focused_element_bid": "none", }, {} @@ -90,7 +83,7 @@ def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, truncated = self._turns >= self.max_turns if self.task.validate_per_step or finished or truncated: - reward, other = self.calculate_reward(action, observation) + reward, other = self.validate_task(action, observation) if other.get("done", False): finished = True else: @@ -107,16 +100,15 @@ def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, return observation, reward, finished, truncated, env_info def _step(self, action: ToolCallAction) -> dict: - tool_result = self.backend.step(action) + obs_dict = self.backend.step(action) return { "goal_object": [{"type": "text", "text": self.goal}], - "pruned_html": tool_result, - "axtree_txt": "", + **obs_dict, "last_action_error": "", "focused_element_bid": "none", } - def calculate_reward(self, action: Action, observation: PageObservation) -> tuple[float, dict]: + def validate_task(self, action: Action, observation: PageObservation) -> tuple[float, dict]: validate_js = self.task.get_step_validate_js() validate_result = self.backend.run_js(validate_js) reward, other = self.task.parse_validation_result(validate_result) diff --git a/src/agentlab/backends/browser/mcp_playwright.json b/src/agentlab/backends/browser/mcp_playwright.json index ad30c794..b79e4f77 100644 --- a/src/agentlab/backends/browser/mcp_playwright.json +++ b/src/agentlab/backends/browser/mcp_playwright.json @@ -7,7 +7,9 @@ "--browser", "firefox", "--headless", - "--isolated" + "--isolated", + "--caps", + "vision" ], "env": { "PLAYWRIGHT_BROWSERS_PATH": "" diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 15969805..92748068 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -1,5 +1,9 @@ +import base64 import logging +from io import BytesIO +from mcp.types import ImageContent, TextContent +from PIL import Image from tapeagents.tool_calling import ToolCallAction from agentlab.backends.browser.base import MCPBrowserBackend @@ -13,7 +17,8 @@ class MCPPlaywright(MCPBrowserBackend): config_path: str = DEFAULT_CONFIG_PATH def run_js(self, js: str): - raw_response = self.call_tool("browser_evaluate", {"function": js}) + contents = self.call_tool("browser_evaluate", {"function": js}) + raw_response = "\n".join([c.text for c in contents if c.type == "text"]) try: _, half_response = raw_response.split("### Result", maxsplit=1) result_str, _ = half_response.split("\n### Ran", maxsplit=1) @@ -23,14 +28,31 @@ def run_js(self, js: str): raise e return result_str - def step(self, action: ToolCallAction) -> str: - tool_result = self._call_mcp(action) + def step(self, action: ToolCallAction) -> dict: + contents = self._call_mcp(action) + logger.info(f"Step result has {len(contents)} contents") + tool_result = "\n".join( + [c.text for c in contents if c.type == "text" and "# Ran Playwright code" not in c.text] + ) snapshot = self.page_snapshot() - return f"{tool_result}\n{snapshot}" + screenshot = self.page_screenshot() + return { + "pruned_html": f"{tool_result}\n{snapshot}", + "axtree_txt": snapshot, + "screenshot": screenshot, + } def page_snapshot(self) -> str: - return self.call_tool("browser_snapshot", {}) + contents = self.call_tool("browser_snapshot", {}) + return "\n".join([c.text for c in contents if c.type == "text"]) + + def page_screenshot(self) -> Image: + contents = self.call_tool("browser_take_screenshot", {}) + content = [c for c in contents if c.type == "image"][0] + image_base64 = content.data + image = Image.open(BytesIO(base64.b64decode(image_base64))) + return image def goto(self, url: str) -> str: - tool_result = self.call_tool("browser_navigate", {"url": url}) - return tool_result + contents = self.call_tool("browser_navigate", {"url": url}) + return "\n".join([c.text for c in contents if c.type == "text"]) diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index 5ff528f1..80d44e90 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -27,6 +27,7 @@ class MiniWobTask(AbstractWebTask): "browser_drag", "browser_hover", "browser_select_option", + "browser_mouse_click_xy", ] def model_post_init(self, __context: Any): @@ -96,12 +97,10 @@ def get_setup_js(self) -> str: Math.seedrandom(42); core.EPISODE_MAX_TIME = {self.episode_max_time}; core.startEpisodeReal(); -start_time = Date.now(); while (!WOB_TASK_READY) {{ await new Promise(resolve => setTimeout(resolve, 100)); }} -ready_time = Date.now(); -return {{'goal': core.getUtterance(), 'done': WOB_DONE_GLOBAL, 'task_start_time': ready_time - start_time}}; +return core.getUtterance(); """ return f"async () => {{{js}}}" @@ -119,7 +118,6 @@ def get_task_validate_js(self) -> str: }""" def parse_validation_result(self, validation_result: str) -> tuple[float, dict]: - logger.info(f"Validation result: {validation_result}") chunks = [c.strip() for c in validation_result.split(",")] raw_reward = float(chunks[1]) done = chunks[3].strip().lower() == "true" From a8273441a197a2bde512025df2f6ea20c88f26e9 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 16:46:37 +0000 Subject: [PATCH 14/61] fix agent_info as dict --- src/agentlab/agents/generic_agent/generic_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 74a1a3f7..1bcd2b67 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -168,7 +168,7 @@ def get_action(self, obs): stats=stats, extra_info={"chat_model_args": asdict(self.chat_model_args)}, ) - return ans_dict["action"], agent_info + return ans_dict["action"], asdict(agent_info) def reset(self, seed=None): self.seed = seed From 4117e0a2049e2280332f5a78cd2e0715491b7986 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 17:40:00 +0000 Subject: [PATCH 15/61] remove tapeagents dep from backends core, fixes --- src/agentlab/backends/browser/base.py | 96 +++++++---- src/agentlab/backends/browser/mcp.py | 169 +++++++++++++++++++ src/agentlab/benchmarks/miniwob/benchmark.py | 1 + src/agentlab/benchmarks/miniwob/task.py | 2 + 4 files changed, 236 insertions(+), 32 deletions(-) create mode 100644 src/agentlab/backends/browser/mcp.py diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index 774011a8..1f5cacf8 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -1,14 +1,75 @@ import logging +from typing import Any, Callable, Literal -from mcp.types import ImageContent, TextContent +from langchain_core.utils.function_calling import convert_to_openai_tool from PIL import Image from pydantic import BaseModel -from tapeagents.mcp import MCPEnvironment -from tapeagents.tool_calling import FunctionCall, ToolCallAction, ToolSpec logger = logging.getLogger(__name__) +class FunctionCall(BaseModel): + """ + A class representing a function call. + + Attributes: + name (str): The name of the function being called. + arguments (Any): The arguments to be passed to the function. + """ + + name: str + arguments: Any + + +class FunctionSpec(BaseModel): + """ + A class representing the specification of a function. + + Attributes: + name (str): The name of the function. + description (str): A brief description of the function. + parameters (dict): A dictionary containing the parameters of the function. + """ + + name: str + description: str + parameters: dict + + +class ToolCallAction(BaseModel): + id: str = "" + function: FunctionCall + + +class ToolSpec(BaseModel): + """ + ToolSpec is a model that represents a tool specification with a type and a function. + + Attributes: + type (Literal["function"]): The type of the tool, which is always "function". + function (FunctionSpec): The specification of the function. + """ + + type: Literal["function"] = "function" + function: FunctionSpec + + def description(self) -> str: + return f"{self.function.name} - {self.function.description}" + + @classmethod + def from_function(cls, function: Callable): + """ + Creates an instance of the class by validating the model from a given function. + + Args: + function (Callable): The function to be converted and validated. + + Returns: + (ToolSpec): An instance of the class with the validated model. + """ + return cls.model_validate(convert_to_openai_tool(function)) + + class BrowserBackend(BaseModel): def initialize(self) -> None: raise NotImplementedError @@ -33,32 +94,3 @@ def actions(self) -> tuple[ToolSpec]: def close(self) -> None: raise NotImplementedError - - -class MCPBrowserBackend(BrowserBackend): - config_path: str - _mcp = None - - def initialize(self) -> None: - self._mcp = MCPEnvironment(config_path=self.config_path) - self._mcp.initialize() - - def step(self, action: ToolCallAction) -> dict: - contents = self._call_mcp(action) - text = "\n".join([c.text for c in contents if c.type == "text"]) - return {"pruned_html": text, "axtree_txt": text} - - def call_tool(self, tool_name: str, arguments: dict) -> list[TextContent | ImageContent]: - return self._call_mcp( - ToolCallAction(function=FunctionCall(name=tool_name, arguments=arguments)) - ) - - def _call_mcp(self, action: ToolCallAction) -> list[TextContent | ImageContent]: - tool_result = self._mcp.step(action) - return tool_result.content.content - - def actions(self) -> tuple[ToolSpec]: - return self._mcp.actions() - - def close(self) -> None: - self._mcp.close() diff --git a/src/agentlab/backends/browser/mcp.py b/src/agentlab/backends/browser/mcp.py new file mode 100644 index 00000000..732011ed --- /dev/null +++ b/src/agentlab/backends/browser/mcp.py @@ -0,0 +1,169 @@ +import asyncio +import json +import logging +import os +from contextlib import AsyncExitStack +from datetime import timedelta +from typing import Any + +from mcp import ClientSession, StdioServerParameters, stdio_client +from mcp import Tool as MCPTool +from mcp.types import CallToolResult, ImageContent, TextContent + +from agentlab.backends.browser.base import BrowserBackend, FunctionSpec, ToolCallAction, ToolSpec + +logger = logging.getLogger(__name__) + + +class MCPClient: + def __init__(self, config_path: str, read_timeout_seconds: int = 10) -> None: + self.servers = self.load_config(config_path) + self.sessions: dict[str, ClientSession] = {} + self.tools: dict[str, MCPTool] = {} + self.tool_to_server: dict[str, str] = {} + self.read_timeout_seconds = read_timeout_seconds + self.exit_stack = AsyncExitStack() + self.loop = None + + def initialize(self): + try: + self.loop = asyncio.get_event_loop() + except RuntimeError: + self.loop = asyncio.new_event_loop() + asyncio.set_event_loop(self.loop) + self.loop.run_until_complete(self.start_servers()) + + async def ainitialize(self) -> None: + await self.start_servers() + + async def start_servers(self): + for server_name, server_params in self.servers.items(): + stdio_transport = await self.exit_stack.enter_async_context(stdio_client(server_params)) + stdio, write = stdio_transport + session = await self.exit_stack.enter_async_context( + ClientSession( + stdio, write, read_timeout_seconds=timedelta(seconds=self.read_timeout_seconds) + ) + ) + await session.initialize() + self.sessions[server_name] = session + response = await session.list_tools() + for tool in response.tools: + if tool.name in self.tools: + raise Exception( + f"Tools conflict! Tool {tool.name} already provided by server '{self.tool_to_server[tool.name]}'" + ) + self.tools[tool.name] = tool + self.tool_to_server[tool.name] = server_name + logger.info( + f"Connected to MCP server '{server_name}' with tools: {[tool.name for tool in response.tools]}" + ) + logger.info(f"Started {len(self.servers)} MCP servers") + + def load_config(self, config_path) -> dict[str, StdioServerParameters]: + assert os.path.exists(config_path), f"Config path {config_path} does not exist" + self.config_path = config_path + + try: + with open(config_path, "r") as f: + self.config = json.load(f) + except json.JSONDecodeError as e: + raise ValueError(f"Failed to parse {config_path}, invalid json: {e}") + try: + server_configs: dict[str, dict] = self.config["mcpServers"] + assert isinstance(server_configs, dict), "mcpServers must be a dict" + assert len(server_configs) > 0, "mcpServers dict is empty" + except Exception as e: + raise ValueError(f"Failed to get MCP server configs from {config_path}: {e}") + + servers: dict[str, StdioServerParameters] = {} + for server_name, server_config_dict in server_configs.items(): + try: + server_config_dict = self.prepare_env_vars(server_config_dict) + server_params = StdioServerParameters.model_validate(server_config_dict) + except Exception as e: + raise ValueError(f"Failed to parse server config {server_config_dict}: {e}") + servers[server_name] = server_params + logger.info(f"Loaded {len(servers)} MCP server configs from {config_path}") + return servers + + def prepare_env_vars(self, server_config_dict: dict) -> dict: + if server_env := server_config_dict.get("env"): + for env_var, env_value in server_env.items(): + if ( + env_var in os.environ and not env_value + ): # reuse existing env var value if not set in config + logger.info(f"Set mcp server env var {env_var} from current environment") + server_config_dict["env"][env_var] = os.environ[env_var] + return server_config_dict + + def call_tool(self, tool_name: str, tool_args: dict[str, Any]) -> CallToolResult: + result = self.loop.run_until_complete(self.acall_tool(tool_name, tool_args)) + return result + + async def acall_tool(self, tool_name: str, tool_args: dict[str, Any]) -> CallToolResult: + server_name = self.check_tool_exists(tool_name) + result = await self._call_tool(server_name, tool_name, tool_args) + return result + + async def _call_tool( + self, server_name: str, tool_name: str, tool_args: dict[str, Any] + ) -> CallToolResult: + try: + session = self.sessions[server_name] + result = await session.call_tool(tool_name, tool_args) + except Exception as e: + logger.exception(f"Error calling tool {tool_name}: {e}") + raise e + return result + + def check_tool_exists(self, tool_name): + try: + server_name = self.tool_to_server[tool_name] + except KeyError: + raise Exception(f"Tool {tool_name} not found in any of the MCP servers") + return server_name + + def actions(self) -> tuple[ToolSpec]: + return ( + ToolSpec( + function=FunctionSpec( + name=tool.name, description=tool.description or "", parameters=tool.inputSchema + ) + ) + for tool in self.tools.values() + ) + + async def close(self) -> None: + await self.exit_stack.aclose() + + +class MCPBrowserBackend(BrowserBackend): + config_path: str + _mcp = None + + def initialize(self) -> None: + self._mcp = MCPClient(config_path=self.config_path) + self._mcp.initialize() + + def step(self, action: ToolCallAction) -> dict: + contents = self.call_tool(action.function.name, action.function.arguments) + text = "\n".join([c.text for c in contents if c.type == "text"]) + images = [c for c in contents if c.type == "image"] + return { + "pruned_html": text, + "axtree_txt": text, + "screenshot": images[-1] if images else None, + } + + def call_tool(self, tool_name: str, arguments: dict) -> list[TextContent | ImageContent]: + tool_result = self._mcp.call_tool(tool_name, arguments) + if tool_result.isError: + return [TextContent(text=f"Error calling tool {tool_name}: {tool_result.error}")] + return tool_result.content + + def actions(self) -> tuple[ToolSpec]: + return self._mcp.actions() + + def close(self) -> None: + self._mcp.close() diff --git a/src/agentlab/benchmarks/miniwob/benchmark.py b/src/agentlab/benchmarks/miniwob/benchmark.py index bda95d66..7cacc309 100644 --- a/src/agentlab/benchmarks/miniwob/benchmark.py +++ b/src/agentlab/benchmarks/miniwob/benchmark.py @@ -23,6 +23,7 @@ class MiniWobBenchmark(AbstractBenchmark): high_level_action_set_args: ToolsActionSet = None def model_post_init(self, __context: Any) -> None: + self.name = f"miniwob_{self.backend.__class__.__name__.lower()}" self.env_args_list = [] if self.dataset is None: self.dataset = get_miniwob_tasks() diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index 80d44e90..5a46cb2a 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -31,6 +31,8 @@ class MiniWobTask(AbstractWebTask): ] def model_post_init(self, __context: Any): + if self.base_url.endswith("/"): + self.base_url = self.base_url[:-1] self.url = f"{self.base_url}/{self.subdomain}.html" def get_setup_js(self) -> str: From a3fa1c9ff28fc732e0da4f4650de821da0d3d161 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 7 Nov 2025 17:40:50 +0000 Subject: [PATCH 16/61] python playwright backend draft --- src/agentlab/backends/browser/playwright.py | 53 +++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 src/agentlab/backends/browser/playwright.py diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py new file mode 100644 index 00000000..12824201 --- /dev/null +++ b/src/agentlab/backends/browser/playwright.py @@ -0,0 +1,53 @@ +from playwright.sync_api import sync_playwright + +from agentlab.backends.browser.base import BrowserBackend, ToolCallAction + + +class PlaywrightSyncBackend(BrowserBackend): + def __init__(self): + self.actions = { + "browser_press_key": lambda key: self.page.keyboard.press(key), + "browser_type": lambda text: self.page.type(text), + "browser_click": lambda selector: self.page.click(selector), + "browser_drag": lambda from_selector, to_selector: self.drag_and_drop( + from_selector, to_selector + ), + "browser_hover": lambda selector: self.page.hover(selector), + "browser_select_option": lambda selector: self.page.select_option(selector), + "browser_mouse_click_xy": lambda x, y: self.page.mouse.click(x, y), + } + + def drag_and_drop(self, from_selector: str, to_selector: str): + from_elem = self.page.locator(from_selector) + from_elem.hover(timeout=500) + self.page.mouse.down() + + to_elem = self.page.locator(to_selector) + to_elem.hover(timeout=500) + self.page.mouse.up() + + def initialize(self): + self.browser = sync_playwright().start().chromium.launch(headless=True) + self.page = self.browser.new_page() + + def run_js(self, js: str): + return self.page.evaluate(js) + + def goto(self, url: str): + self.page.goto(url) + + def page_snapshot(self): + return self.page.content() + + def page_screenshot(self): + return self.page.screenshot() + + def step(self, action: ToolCallAction): + fn = self.actions[action.function.name] + return fn(**action.function.arguments) + + def actions(self): + return self.page.actions() + + def close(self): + self.browser.close() From 955e0d3edf681bd0a72adb71a15b8bed6fcf2d46 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 10 Nov 2025 13:32:28 +0000 Subject: [PATCH 17/61] fixes --- src/agentlab/backends/browser/mcp.py | 2 +- src/agentlab/backends/browser/mcp_playwright.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentlab/backends/browser/mcp.py b/src/agentlab/backends/browser/mcp.py index 732011ed..99cd685c 100644 --- a/src/agentlab/backends/browser/mcp.py +++ b/src/agentlab/backends/browser/mcp.py @@ -163,7 +163,7 @@ def call_tool(self, tool_name: str, arguments: dict) -> list[TextContent | Image return tool_result.content def actions(self) -> tuple[ToolSpec]: - return self._mcp.actions() + return list(self._mcp.actions()) def close(self) -> None: self._mcp.close() diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 92748068..5e5e68a5 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -6,7 +6,7 @@ from PIL import Image from tapeagents.tool_calling import ToolCallAction -from agentlab.backends.browser.base import MCPBrowserBackend +from agentlab.backends.browser.mcp import MCPBrowserBackend logger = logging.getLogger(__name__) @@ -29,7 +29,7 @@ def run_js(self, js: str): return result_str def step(self, action: ToolCallAction) -> dict: - contents = self._call_mcp(action) + contents = self.call_tool(action.function.name, action.function.arguments) logger.info(f"Step result has {len(contents)} contents") tool_result = "\n".join( [c.text for c in contents if c.type == "text" and "# Ran Playwright code" not in c.text] From 61a537fbd9a816a7e9a5f2f503a6681ef3f045fa Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 11 Nov 2025 13:01:09 +0000 Subject: [PATCH 18/61] remove tapeagents dep, add task-level obs postprocess --- src/agentlab/actions.py | 4 ++- .../agents/generic_agent/generic_agent.py | 7 ++-- src/agentlab/backends/browser/env.py | 33 +++++-------------- .../backends/browser/mcp_playwright.py | 4 +-- src/agentlab/benchmarks/miniwob/task.py | 7 ++++ src/agentlab/benchmarks/web_task.py | 6 +++- 6 files changed, 27 insertions(+), 34 deletions(-) diff --git a/src/agentlab/actions.py b/src/agentlab/actions.py index b009f27d..bc1b444d 100644 --- a/src/agentlab/actions.py +++ b/src/agentlab/actions.py @@ -1,9 +1,11 @@ import json import logging +from typing import Literal from bgym import AbstractActionSet -from tapeagents.tool_calling import FunctionCall, ToolCallAction, ToolSpec +from pydantic import BaseModel, Field +from agentlab.backends.browser.base import FunctionCall, ToolCallAction, ToolSpec from agentlab.llm.llm_utils import parse_html_tags_raise logger = logging.getLogger(__name__) diff --git a/src/agentlab/agents/generic_agent/generic_agent.py b/src/agentlab/agents/generic_agent/generic_agent.py index 1bcd2b67..f65b2132 100644 --- a/src/agentlab/agents/generic_agent/generic_agent.py +++ b/src/agentlab/agents/generic_agent/generic_agent.py @@ -10,13 +10,10 @@ from copy import deepcopy from dataclasses import asdict, dataclass -from functools import partial from warnings import warn -import bgym from bgym import Benchmark from browsergym.experiments.agent import Agent, AgentInfo -from tapeagents.tool_calling import ToolSpec from agentlab.actions import ToolsActionSet from agentlab.agents import dynamic_prompting as dp @@ -67,7 +64,7 @@ def prepare(self): def close(self): return self.chat_model_args.close_server() - def make_agent(self, actions: list[ToolSpec] | None = None): + def make_agent(self, actions: list | None = None): return GenericAgent( chat_model_args=self.chat_model_args, flags=self.flags, @@ -83,7 +80,7 @@ def __init__( chat_model_args: BaseModelArgs, flags: GenericPromptFlags, max_retry: int = 4, - actions: list[ToolSpec] | None = None, + actions: list | None = None, ): self.chat_llm = chat_model_args.make_model() diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 44dfd90f..c7d3294b 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -1,31 +1,15 @@ -import json import logging import time from dataclasses import dataclass from pathlib import Path -from typing import Literal - -from tapeagents.core import Action, Observation, StopStep -from tapeagents.tool_calling import ToolCallAction, ToolSpec from agentlab.actions import ToolsActionSet -from agentlab.backends.browser.base import BrowserBackend +from agentlab.backends.browser.base import BrowserBackend, ToolCallAction, ToolSpec from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs from agentlab.benchmarks.web_task import AbstractWebTask logger = logging.getLogger(__name__) - -class GoalObservation(Observation): - kind: Literal["goal_observation"] = "goal_observation" - goal: str - - -class PageObservation(Observation): - kind: Literal["page_observation"] = "page_observation" - content: str - - class BrowserEnv(AbstractEnv): def __init__( self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0 @@ -50,22 +34,23 @@ def reset(self, seed: int): page_content = self.backend.page_snapshot() screenshot = self.backend.page_screenshot() logger.info(f"Initial obs: {page_content}\n{screenshot}") - return { + obs = { "goal_object": [{"type": "text", "text": self.goal}], "pruned_html": page_content, "axtree_txt": page_content, "screenshot": screenshot, "last_action_error": "", "focused_element_bid": "none", - }, {} + } + return self.task.obs_postprocess(obs), {} - def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, bool, dict]: + def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, dict]: if isinstance(action, str): action = ToolsActionSet.parse_action(action) logger.info(f"BrowserEnv.step() called with action {action}") action_exec_start = time.time() - finished = isinstance(action, StopStep) + finished = action.function.name == "final_step" if finished: observation = { "goal_object": [{"type": "text", "text": self.goal}], @@ -76,6 +61,7 @@ def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, } else: observation = self._step(action) + observation = self.task.obs_postprocess(observation) action_exec_stop = time.time() self._turns += 1 logger.info(f"Obs:\n{observation['pruned_html']}") @@ -95,8 +81,7 @@ def step(self, action: ToolCallAction | str) -> tuple[Observation, float, bool, "action_exec_stop": action_exec_stop, "action_exec_timeout": 0.0, } | other - obs_view = observation.short_view() if isinstance(observation, Observation) else observation - logger.info(f"Action result in observation: {obs_view}") + logger.info(f"Action result in observation: {observation}") return observation, reward, finished, truncated, env_info def _step(self, action: ToolCallAction) -> dict: @@ -108,7 +93,7 @@ def _step(self, action: ToolCallAction) -> dict: "focused_element_bid": "none", } - def validate_task(self, action: Action, observation: PageObservation) -> tuple[float, dict]: + def validate_task(self, action: ToolCallAction, observation: dict) -> tuple[float, dict]: validate_js = self.task.get_step_validate_js() validate_result = self.backend.run_js(validate_js) reward, other = self.task.parse_validation_result(validate_result) diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 5e5e68a5..033158d3 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -2,11 +2,9 @@ import logging from io import BytesIO -from mcp.types import ImageContent, TextContent from PIL import Image -from tapeagents.tool_calling import ToolCallAction -from agentlab.backends.browser.mcp import MCPBrowserBackend +from agentlab.backends.browser.mcp import MCPBrowserBackend, ToolCallAction logger = logging.getLogger(__name__) diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index 5a46cb2a..5e2306a0 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -3,6 +3,7 @@ from typing import Any, ClassVar from browsergym.miniwob import ALL_MINIWOB_TASKS +from PIL import Image from agentlab.benchmarks.web_task import AbstractWebTask @@ -130,6 +131,12 @@ def parse_validation_result(self, validation_result: str) -> tuple[float, dict]: "done": done, } + def obs_postprocess(self, obs: dict) -> dict: + screenshot: Image.Image | None = obs.get("screenshot", None) + if screenshot is not None: + obs["screenshot"] = screenshot.crop((0, 0, 332, 214)) # crop to 332x214 because this is the viewport size for MiniWob + return obs + def get_miniwob_tasks( base_url: str | None = None, remove_human_display: bool = True, episode_max_time: int = 1000000 diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py index e8627519..930c1450 100644 --- a/src/agentlab/benchmarks/web_task.py +++ b/src/agentlab/benchmarks/web_task.py @@ -1,7 +1,8 @@ from typing import ClassVar from pydantic import BaseModel -from tapeagents.tool_calling import ToolSpec + +from agentlab.backends.browser.base import ToolSpec class AbstractWebTask(BaseModel): @@ -29,3 +30,6 @@ def get_step_validate_js(self) -> str: def parse_validation_result(self, validate_result: str) -> tuple[float, dict]: raise NotImplementedError + + def obs_postprocess(self, obs: dict) -> dict: + return obs From b82aef04fc1e106b1523027a243959b13fcd33e1 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 13 Nov 2025 14:10:12 +0000 Subject: [PATCH 19/61] fix --- src/agentlab/backends/browser/mcp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/backends/browser/mcp.py b/src/agentlab/backends/browser/mcp.py index 99cd685c..d0d02b2a 100644 --- a/src/agentlab/backends/browser/mcp.py +++ b/src/agentlab/backends/browser/mcp.py @@ -159,7 +159,7 @@ def step(self, action: ToolCallAction) -> dict: def call_tool(self, tool_name: str, arguments: dict) -> list[TextContent | ImageContent]: tool_result = self._mcp.call_tool(tool_name, arguments) if tool_result.isError: - return [TextContent(text=f"Error calling tool {tool_name}: {tool_result.error}")] + return [TextContent(text=f"Error calling tool {tool_name}")] + tool_result.content return tool_result.content def actions(self) -> tuple[ToolSpec]: From 645ee2de8c09099b16127447dc5fd435e79fc1ae Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 13 Nov 2025 14:11:22 +0000 Subject: [PATCH 20/61] fix action space --- src/agentlab/actions.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/src/agentlab/actions.py b/src/agentlab/actions.py index bc1b444d..50301d61 100644 --- a/src/agentlab/actions.py +++ b/src/agentlab/actions.py @@ -1,9 +1,7 @@ import json import logging -from typing import Literal from bgym import AbstractActionSet -from pydantic import BaseModel, Field from agentlab.backends.browser.base import FunctionCall, ToolCallAction, ToolSpec from agentlab.llm.llm_utils import parse_html_tags_raise @@ -28,27 +26,22 @@ def describe(self, with_long_description: bool = True, with_examples: bool = Tru def example_action(self, abstract: bool) -> str: if abstract: - return """ -{ + return """{ "name": "", "arguments": { "": "", "": "", ... } -} - -""" +}""" else: - return """ -{ - "name": "browser_navigate", + return """{ + "name": "browser_click", "arguments": { - "url": "https://www.google.com" + "element": "buttom with year 2022", + "ref": "e26" } -} - -""" +}""" @classmethod def parse_action(cls, llm_output: str) -> ToolCallAction: From 02dee092f1869bf9cd6d21b721de35f230f6c636 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Mon, 17 Nov 2025 17:37:43 +0000 Subject: [PATCH 21/61] playwright backend --- experiments/run_miniwob.py | 12 +- src/agentlab/backends/browser/playwright.py | 118 +++++++++++++++----- src/agentlab/benchmarks/miniwob/task.py | 10 +- 3 files changed, 103 insertions(+), 37 deletions(-) diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index 36aa5212..dc7ea95c 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -8,6 +8,7 @@ from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config from agentlab.backends.browser.mcp_playwright import MCPPlaywright +from agentlab.backends.browser.playwright import PlaywrightSyncBackend from agentlab.benchmarks.miniwob import MiniWobBenchmark from agentlab.experiments.study import make_study from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT @@ -20,13 +21,16 @@ if __name__ == "__main__": config = load_config("miniwob") - # benchmark = DEFAULT_BENCHMARKS["miniwob"]() + # benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) + benchmark = MiniWobBenchmark(backend=PlaywrightSyncBackend()) + agent_args = GenericAgentArgs( chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"], flags=FLAGS_GPT_4o, ) - - benchmark = MiniWobBenchmark(backend=MCPPlaywright()) + # agent_args.flags.obs.use_ax_tree = False + # agent_args.flags.obs.use_html = True + # agent_args.flags.obs.use_focused_element = False # agent_args =TapeAgentArgs(agent_name=config.name, config=config) @@ -38,7 +42,7 @@ logging_level_stdout=logging.INFO, ) if os.environ.get("AGENTLAB_DEBUG"): - study.exp_args_list = study.exp_args_list[1:2] + study.exp_args_list = study.exp_args_list[23:24] study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential") else: study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend) diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index 12824201..c5be2495 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -1,53 +1,111 @@ -from playwright.sync_api import sync_playwright +import logging +from io import BytesIO +from typing import Any, Callable -from agentlab.backends.browser.base import BrowserBackend, ToolCallAction +from PIL import Image +from playwright.sync_api import Page, sync_playwright + +from agentlab.backends.browser.base import BrowserBackend, ToolCallAction, ToolSpec + +logger = logging.getLogger(__name__) class PlaywrightSyncBackend(BrowserBackend): - def __init__(self): - self.actions = { - "browser_press_key": lambda key: self.page.keyboard.press(key), - "browser_type": lambda text: self.page.type(text), - "browser_click": lambda selector: self.page.click(selector), - "browser_drag": lambda from_selector, to_selector: self.drag_and_drop( - from_selector, to_selector - ), - "browser_hover": lambda selector: self.page.hover(selector), - "browser_select_option": lambda selector: self.page.select_option(selector), - "browser_mouse_click_xy": lambda x, y: self.page.mouse.click(x, y), + _actions: dict[str, Callable] + _browser: Any + _page: Page + + def model_post_init(self, __context: Any): + self._actions = { + "browser_press_key": self.browser_press_key, + "browser_type": self.browser_type, + "browser_click": self.browser_click, + "browser_drag": self.browser_drag, + "browser_hover": self.browser_hover, + "browser_select_option": self.browser_select_option, + "browser_mouse_click_xy": self.browser_mouse_click_xy, } - def drag_and_drop(self, from_selector: str, to_selector: str): - from_elem = self.page.locator(from_selector) + def browser_press_key(self, key: str): + """ + Press a key on the keyboard. + """ + self._page.keyboard.press(key) + + def browser_type(self, text: str): + """ + Type text into the focused element. + """ + self._page.type(text) + + def browser_click(self, selector: str): + """ + Click on a selector. + """ + self._page.click(selector) + + def browser_drag(self, from_selector: str, to_selector: str): + """ + Drag and drop from one selector to another. + """ + from_elem = self._page.locator(from_selector) from_elem.hover(timeout=500) - self.page.mouse.down() + self._page.mouse.down() - to_elem = self.page.locator(to_selector) + to_elem = self._page.locator(to_selector) to_elem.hover(timeout=500) - self.page.mouse.up() + self._page.mouse.up() + + def browser_hover(self, selector: str): + """ + Hover over a given element. + """ + self._page.hover(selector) + + def browser_select_option(self, selector: str): + """ + Select an option from a given element. + """ + self._page.select_option(selector) + + def browser_mouse_click_xy(self, x: int, y: int): + """ + Click at a given x, y coordinate using the mouse. + """ + self._page.mouse.click(x, y) def initialize(self): - self.browser = sync_playwright().start().chromium.launch(headless=True) - self.page = self.browser.new_page() + self._browser = sync_playwright().start().chromium.launch(headless=True, chromium_sandbox=True) + self._page = self._browser.new_page() def run_js(self, js: str): - return self.page.evaluate(js) + js_result = self._page.evaluate(js) + logger.info(f"JS result: {js_result}") + return js_result def goto(self, url: str): - self.page.goto(url) + self._page.goto(url) def page_snapshot(self): - return self.page.content() + return self._page.content() def page_screenshot(self): - return self.page.screenshot() + scr_bytes = self._page.screenshot() + return Image.open(BytesIO(scr_bytes)) def step(self, action: ToolCallAction): - fn = self.actions[action.function.name] - return fn(**action.function.arguments) - - def actions(self): - return self.page.actions() + fn = self._actions[action.function.name] + action_result = fn(**action.function.arguments) + snapshot = self.page_snapshot() + screenshot = self.page_screenshot() + return { + "pruned_html": f"{action_result or ''}\n{snapshot}", + "axtree_txt": snapshot, + "screenshot": screenshot, + } + def actions(self) -> tuple[ToolSpec]: + specs = [ToolSpec.from_function(fn) for fn in self._actions.values()] + return tuple(specs) def close(self): - self.browser.close() + self._browser.close() diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index 5e2306a0..b2e42ade 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -120,10 +120,14 @@ def get_task_validate_js(self) -> str: return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY]; }""" - def parse_validation_result(self, validation_result: str) -> tuple[float, dict]: - chunks = [c.strip() for c in validation_result.split(",")] + def parse_validation_result(self, validation_result: str | list) -> tuple[float, dict]: + if isinstance(validation_result, list): + chunks = validation_result + done = chunks[3] + else: + chunks = [c.strip() for c in validation_result.split(",")] + done = chunks[3].strip().lower() == "true" raw_reward = float(chunks[1]) - done = chunks[3].strip().lower() == "true" reward = float(raw_reward > 0) return reward, { "raw_reward": raw_reward, From f591f36e8ffb33bbc8bca9ac494065a05a813876 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 13:37:53 +0000 Subject: [PATCH 22/61] fix obs format --- src/agentlab/backends/browser/env.py | 11 +++++++---- src/agentlab/backends/browser/mcp_playwright.py | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index c7d3294b..732366c8 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -33,16 +33,17 @@ def reset(self, seed: int): logger.info(f"Task goal: {self.goal}") page_content = self.backend.page_snapshot() screenshot = self.backend.page_screenshot() - logger.info(f"Initial obs: {page_content}\n{screenshot}") obs = { "goal_object": [{"type": "text", "text": self.goal}], - "pruned_html": page_content, + "pruned_html": "", "axtree_txt": page_content, "screenshot": screenshot, "last_action_error": "", "focused_element_bid": "none", } - return self.task.obs_postprocess(obs), {} + obs = self.task.obs_postprocess(obs) + logger.info(f"Initial obs: {obs}") + return obs, {} def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, dict]: if isinstance(action, str): @@ -62,9 +63,11 @@ def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, d else: observation = self._step(action) observation = self.task.obs_postprocess(observation) + + action_exec_stop = time.time() self._turns += 1 - logger.info(f"Obs:\n{observation['pruned_html']}") + logger.info(f"Obs: {observation}") truncated = self._turns >= self.max_turns diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 033158d3..51f88d6a 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -35,7 +35,8 @@ def step(self, action: ToolCallAction) -> dict: snapshot = self.page_snapshot() screenshot = self.page_screenshot() return { - "pruned_html": f"{tool_result}\n{snapshot}", + "tool_result": tool_result, + "pruned_html": "", "axtree_txt": snapshot, "screenshot": screenshot, } From 01e0719672c858119e1deee652c60ea36c9a7e32 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 13:40:02 +0000 Subject: [PATCH 23/61] simplest react agent with markdown observations, images and tool calls --- experiments/run_miniwob.py | 14 +- src/agentlab/agents/tapeagent/agent.py | 159 +++++++++++++++++- .../agents/tapeagent/conf/agent/react.yaml | 24 +++ .../agents/tapeagent/conf/llm/gpt5-mini.yaml | 4 +- .../agents/tapeagent/conf/miniwob.yaml | 8 +- 5 files changed, 192 insertions(+), 17 deletions(-) create mode 100644 src/agentlab/agents/tapeagent/conf/agent/react.yaml diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index dc7ea95c..c9f02f55 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -22,16 +22,16 @@ config = load_config("miniwob") # benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) - benchmark = MiniWobBenchmark(backend=PlaywrightSyncBackend()) + benchmark = MiniWobBenchmark(backend=MCPPlaywright()) - agent_args = GenericAgentArgs( - chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"], - flags=FLAGS_GPT_4o, - ) + # agent_args = GenericAgentArgs( + # chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/openai/gpt-5-mini"], + # flags=FLAGS_GPT_4o, + # ) # agent_args.flags.obs.use_ax_tree = False # agent_args.flags.obs.use_html = True # agent_args.flags.obs.use_focused_element = False - # agent_args =TapeAgentArgs(agent_name=config.name, config=config) + agent_args = TapeAgentArgs(agent_name=config.name, config=config) study = make_study( @@ -42,7 +42,7 @@ logging_level_stdout=logging.INFO, ) if os.environ.get("AGENTLAB_DEBUG"): - study.exp_args_list = study.exp_args_list[23:24] + study.exp_args_list = study.exp_args_list[23:27] study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential") else: study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend) diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py index a0062801..b636f26e 100644 --- a/src/agentlab/agents/tapeagent/agent.py +++ b/src/agentlab/agents/tapeagent/agent.py @@ -1,17 +1,33 @@ import logging +import tempfile from dataclasses import dataclass from typing import Literal import bgym import hydra +from litellm import ChatCompletionThinkingBlock from omegaconf import DictConfig +from PIL import Image from pydantic import Field from tapeagents.agent import Agent -from tapeagents.core import Action, Observation, StopStep, TapeMetadata, Thought +from tapeagents.core import ( + Action, + LLMOutputParsingFailureAction, + Observation, + SetNextNode, + StopStep, + TapeMetadata, + Thought, +) from tapeagents.core import Tape as BaseTape +from tapeagents.llms import LLMStream +from tapeagents.nodes import FatalError, StandardNode +from tapeagents.steps import ImageObservation from tapeagents.tool_calling import ToolSpec +from termcolor import colored from agentlab.agents.agent_args import AgentArgs +from agentlab.backends.browser.base import ToolSpec as AgentlabToolSpec logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -27,10 +43,59 @@ class ExtendedMetadata(TapeMetadata): other: dict = {} +class AgentResponse(Thought): + kind: Literal["agent_response"] = "agent_response" + response: str + + +class AgentThinking(Thought): + kind: Literal["agent_thinking"] = "agent_thinking" + thinking: str + + class Tape(BaseTape): metadata: ExtendedMetadata = Field(default_factory=ExtendedMetadata) # type: ignore +class ToolCallNode(StandardNode): + use_known_actions: bool = True + use_function_calls: bool = True + + def generate_steps(self, agent: Agent, tape: Tape, llm_stream: LLMStream): + new_steps = [] + for event in llm_stream: + if event.output.get("reasoning_content"): + logger.info(colored(f"LLM reasoning:\n{event.output.reasoning_content}", "yellow")) + new_steps.append(AgentThinking(thinking=event.output.reasoning_content)) + if event.output.get("thinking_blocks"): + for block in event.output.thinking_blocks: + if isinstance(block, ChatCompletionThinkingBlock): + logger.info(colored(f"LLM thinking block:\n{block}", "yellow")) + new_steps.append(AgentThinking(thinking=block.content)) + if event.output.content: + logger.info(colored(f"LLM output:\n{event.output.content}", "cyan")) + new_steps.append(AgentResponse(response=event.output.content)) + if event.output.tool_calls: + logger.info(colored(f"LLM tool calls:\n{event.output.tool_calls}", "magenta")) + new_steps += [ + self.tool_call_to_step(agent, tool_call) + for tool_call in event.output.tool_calls + ] + for step in new_steps: + yield step + if isinstance(step, LLMOutputParsingFailureAction): + yield SetNextNode(next_node=self.name) # loop to the same node to retry + break + if not new_steps: + raise FatalError("No completions!") + if ( + self.next_node + and not isinstance(new_steps[-1], StopStep) + and not any(isinstance(step, SetNextNode) for step in new_steps) + ): + yield SetNextNode(next_node=self.next_node) + + def load_config(config_name: str) -> DictConfig: with hydra.initialize(config_path="conf", version_base="1.1"): config = hydra.compose(config_name=config_name) @@ -45,8 +110,16 @@ def make_agent(self, actions: tuple[ToolSpec, ...] | None) -> bgym.Agent: if actions is None: agent = hydra.utils.instantiate(self.config.agent) else: + tapeagents_actions = [ + ToolSpec(**tool.model_dump()) if isinstance(tool, AgentlabToolSpec) else tool + for tool in actions + ] tools_description = "\n".join([action.description() for action in actions]) - agent = hydra.utils.instantiate(self.config.agent, known_actions=actions, tools_description=tools_description) + agent = hydra.utils.instantiate( + self.config.agent, + known_actions=tapeagents_actions, + tools_description=tools_description, + ) return TapeAgent(agent=agent) @@ -64,6 +137,62 @@ class DictObservation(Observation): content: str +class MarkdownObservation(Observation): + def llm_view(self, **kwargs) -> str: + return f"## Markdown:\n{self.content}" + + def short_view(self, max_chars: int = 100) -> str: + return self.llm_view()[:max_chars] + + +class GoalObservation(MarkdownObservation): + """ + Contains task goal + """ + + kind: Literal["goal_observation"] = "goal_observation" # type: ignore + goal: str + + def llm_view(self, **kwargs) -> str: + return f"## Goal:\n{self.goal}" + + +class HTMLPage(MarkdownObservation): + """ + Contains page content + """ + + kind: Literal["html_page"] = "html_page" + html: str + + def llm_view(self, **kwargs) -> str: + return f"## Page Content:\n{self.html}" + + +class AXTreePage(MarkdownObservation): + """ + Contains accessibility tree + """ + + kind: Literal["ax_tree_page"] = "ax_tree_page" + axtree: str + + def llm_view(self, **kwargs) -> str: + return f"## Accessibility Tree:\n{self.axtree}" + + +class ActionResult(MarkdownObservation): + """ + Contains action result + """ + + kind: Literal["action_result"] = "action_result" + result: str + + def llm_view(self, **kwargs) -> str: + return f"## Action Result:\n{self.result}" + + class TapeAgent(bgym.Agent): agent: Agent tape: Tape @@ -73,11 +202,33 @@ def __init__(self, agent: Agent): self.agent = agent self.tape = Tape(steps=[]) - def obs_preprocessor(self, obs: Observation | list[Observation]) -> list[Observation]: + def obs_preprocessor(self, obs: Observation | list[Observation] | dict) -> list[Observation]: if isinstance(obs, Observation): obs = [obs] + if isinstance(obs, dict): + obs_steps = [] + if obs.get("goal_object"): + obs_steps.append(GoalObservation(goal=obs["goal_object"][0]["text"])) + if obs.get("action_result"): + obs_steps.append(ActionResult(result=obs["action_result"])) + if obs.get("pruned_html"): + obs_steps.append(HTMLPage(html=obs["pruned_html"])) + if obs.get("axtree_txt"): + obs_steps.append(AXTreePage(axtree=obs["axtree_txt"])) + if obs.get("screenshot"): + if isinstance(obs["screenshot"], Image.Image): + tmp_image_path = tempfile.mktemp(suffix=".png") + obs["screenshot"].save(tmp_image_path) + obs_steps.append(ImageObservation(image_path=tmp_image_path)) + else: + raise ValueError(f"Expected Image.Image, got {type(obs['screenshot'])}") + if obs.get("last_action_error"): + obs_steps.append(ActionResult(result=f"Action error:\n{obs['last_action_error']}")) + assert len(obs_steps) > 0, f"Unknown dict observation, keys: {obs.keys()}" + obs = obs_steps assert isinstance(obs, list), f"Expected list of Observations, got {type(obs)}" - logger.info(f"Observations: {[type(o).__name__ for o in obs]}") + obs_view = "\n".join([o.short_view() for o in obs]) + logger.info(colored(f"Observations:\n{obs_view}", "green")) return obs def get_action(self, obs: Observation | list[Observation]) -> tuple[Action, TapeAgentInfo]: diff --git a/src/agentlab/agents/tapeagent/conf/agent/react.yaml b/src/agentlab/agents/tapeagent/conf/agent/react.yaml new file mode 100644 index 00000000..0ec8689c --- /dev/null +++ b/src/agentlab/agents/tapeagent/conf/agent/react.yaml @@ -0,0 +1,24 @@ +_target_: tapeagents.agent.Agent +name : react_agent +max_iterations: 10 +llms: + default: ${llm} +templates: {} +nodes: + - _target_: agentlab.agents.tapeagent.agent.ToolCallNode + name: react + system_prompt: | + You are an expert AI Agent trained to assist users with complex web tasks. + Your role is to understand the goal, perform actions until the goal is accomplished and respond in a helpful and accurate manner. + Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. + Do not express emotions or opinions. + guidance: | + Think along the following lines: + 1. Summarize the last observation and describe the visible changes in the state. + 2. Evaluate action success, explain impact on task/plan. + 3. If there are any errors, describe them and propose alternative. + 4. List next steps to move towards the goaland propose next immediate action. + The produce the function call that performs the proposed step. If the task is complete, produce the final step. + steps: + - tapeagents.core.FinalStep + next_node: react \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml b/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml index 84dbe3b3..e45a7756 100644 --- a/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml +++ b/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml @@ -1,6 +1,6 @@ _target_: tapeagents.llms.LiteLLM -model_name: gpt-5-mini-2025-08-07 -use_cache: true +model_name: azure/gpt-5-mini +use_cache: false context_size: 128000 parameters: temperature: 1.0 \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/conf/miniwob.yaml b/src/agentlab/agents/tapeagent/conf/miniwob.yaml index 1867cf20..a9d82712 100644 --- a/src/agentlab/agents/tapeagent/conf/miniwob.yaml +++ b/src/agentlab/agents/tapeagent/conf/miniwob.yaml @@ -1,9 +1,9 @@ defaults: - - llm: sonnet - - agent: plan_react_fcall + - llm: gpt5-mini + - agent: react - _self_ -name: miniwob_tapeagent -comment: MiniWob TapeAgent +name: miniwob +comment: MiniWob Agent parallel_backend: ray n_jobs: 32 \ No newline at end of file From dba597817e68af9131ebf7396cf70492ebebf448 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 14:10:33 +0000 Subject: [PATCH 24/61] fix mcp close --- src/agentlab/backends/browser/mcp.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/agentlab/backends/browser/mcp.py b/src/agentlab/backends/browser/mcp.py index d0d02b2a..84129374 100644 --- a/src/agentlab/backends/browser/mcp.py +++ b/src/agentlab/backends/browser/mcp.py @@ -134,9 +134,12 @@ def actions(self) -> tuple[ToolSpec]: for tool in self.tools.values() ) - async def close(self) -> None: + async def aclose(self) -> None: await self.exit_stack.aclose() + def close(self) -> None: + self.loop.run_until_complete(self.aclose()) + class MCPBrowserBackend(BrowserBackend): config_path: str @@ -166,4 +169,7 @@ def actions(self) -> tuple[ToolSpec]: return list(self._mcp.actions()) def close(self) -> None: - self._mcp.close() + try: + self._mcp.close() + except Exception: + pass From ecf59d5a9eb4598d0e75de95c785fb6722cc3b87 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 15:52:02 +0000 Subject: [PATCH 25/61] async playwright backend --- src/agentlab/backends/browser/base.py | 5 +- src/agentlab/backends/browser/env.py | 20 ++--- .../backends/browser/mcp_playwright.py | 12 ++- src/agentlab/backends/browser/playwright.py | 81 +++++++++++-------- 4 files changed, 70 insertions(+), 48 deletions(-) diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index 1f5cacf8..d5cc0151 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -80,12 +80,15 @@ def run_js(self, js: str): def goto(self, url: str) -> str: raise NotImplementedError - def page_snapshot(self) -> str: + def page_html(self) -> str: raise NotImplementedError def page_screenshot(self) -> Image: raise NotImplementedError + def page_axtree(self) -> str: + raise NotImplementedError + def step(self, action: ToolCallAction) -> str: raise NotImplementedError diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 732366c8..3c403b86 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -31,12 +31,13 @@ def reset(self, seed: int): if setup_js: self.goal = self.backend.run_js(setup_js) logger.info(f"Task goal: {self.goal}") - page_content = self.backend.page_snapshot() + html = self.backend.page_html() screenshot = self.backend.page_screenshot() + axtree = self.backend.page_axtree() obs = { "goal_object": [{"type": "text", "text": self.goal}], - "pruned_html": "", - "axtree_txt": page_content, + "pruned_html": html, + "axtree_txt": axtree, "screenshot": screenshot, "last_action_error": "", "focused_element_bid": "none", @@ -89,12 +90,13 @@ def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, d def _step(self, action: ToolCallAction) -> dict: obs_dict = self.backend.step(action) - return { - "goal_object": [{"type": "text", "text": self.goal}], - **obs_dict, - "last_action_error": "", - "focused_element_bid": "none", - } + if "goal_object" not in obs_dict: + obs_dict["goal_object"] = [{"type": "text", "text": self.goal}] + if "last_action_error" not in obs_dict: + obs_dict["last_action_error"] = "" + if "focused_element_bid" not in obs_dict: + obs_dict["focused_element_bid"] = "none" + return obs_dict def validate_task(self, action: ToolCallAction, observation: dict) -> tuple[float, dict]: validate_js = self.task.get_step_validate_js() diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 51f88d6a..1705bfc1 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -32,16 +32,20 @@ def step(self, action: ToolCallAction) -> dict: tool_result = "\n".join( [c.text for c in contents if c.type == "text" and "# Ran Playwright code" not in c.text] ) - snapshot = self.page_snapshot() + html = self.page_html() screenshot = self.page_screenshot() + axtree = self.page_axtree() return { "tool_result": tool_result, - "pruned_html": "", - "axtree_txt": snapshot, + "pruned_html": html, + "axtree_txt": axtree, "screenshot": screenshot, } - def page_snapshot(self) -> str: + def page_html(self) -> str: + return "" + + def page_axtree(self) -> str: contents = self.call_tool("browser_snapshot", {}) return "\n".join([c.text for c in contents if c.type == "text"]) diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index c5be2495..f38ac9ce 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -1,18 +1,20 @@ +import asyncio import logging from io import BytesIO from typing import Any, Callable from PIL import Image -from playwright.sync_api import Page, sync_playwright +from playwright.async_api import Browser, Page, async_playwright from agentlab.backends.browser.base import BrowserBackend, ToolCallAction, ToolSpec logger = logging.getLogger(__name__) -class PlaywrightSyncBackend(BrowserBackend): +class AsyncPlaywright(BrowserBackend): _actions: dict[str, Callable] - _browser: Any + _loop: asyncio.AbstractEventLoop + _browser: Browser _page: Page def model_post_init(self, __context: Any): @@ -26,86 +28,97 @@ def model_post_init(self, __context: Any): "browser_mouse_click_xy": self.browser_mouse_click_xy, } - def browser_press_key(self, key: str): + def initialize(self): + self._loop = asyncio.get_event_loop() + self._loop.run_until_complete(self.ainitialize()) + + async def ainitialize(self): + pw = await async_playwright().start() + self._browser = await pw.chromium.launch(headless=True, chromium_sandbox=True) + self._page = await self._browser.new_page() + + async def browser_press_key(self, key: str): """ Press a key on the keyboard. """ - self._page.keyboard.press(key) + await self._page.keyboard.press(key) - def browser_type(self, text: str): + async def browser_type(self, text: str): """ Type text into the focused element. """ - self._page.type(text) + await self._page.type(text) - def browser_click(self, selector: str): + async def browser_click(self, selector: str): """ Click on a selector. """ - self._page.click(selector) + await self._page.click(selector) - def browser_drag(self, from_selector: str, to_selector: str): + async def browser_drag(self, from_selector: str, to_selector: str): """ Drag and drop from one selector to another. """ from_elem = self._page.locator(from_selector) - from_elem.hover(timeout=500) - self._page.mouse.down() + await from_elem.hover(timeout=500) + await self._page.mouse.down() to_elem = self._page.locator(to_selector) - to_elem.hover(timeout=500) - self._page.mouse.up() + await to_elem.hover(timeout=500) + await self._page.mouse.up() - def browser_hover(self, selector: str): + async def browser_hover(self, selector: str): """ Hover over a given element. """ - self._page.hover(selector) + await self._page.hover(selector) - def browser_select_option(self, selector: str): + async def browser_select_option(self, selector: str): """ Select an option from a given element. """ - self._page.select_option(selector) + await self._page.select_option(selector) - def browser_mouse_click_xy(self, x: int, y: int): + async def browser_mouse_click_xy(self, x: int, y: int): """ Click at a given x, y coordinate using the mouse. """ - self._page.mouse.click(x, y) - - def initialize(self): - self._browser = sync_playwright().start().chromium.launch(headless=True, chromium_sandbox=True) - self._page = self._browser.new_page() + await self._page.mouse.click(x, y) def run_js(self, js: str): - js_result = self._page.evaluate(js) + js_result = self._loop.run_until_complete(self._page.evaluate(js)) logger.info(f"JS result: {js_result}") return js_result def goto(self, url: str): - self._page.goto(url) + self._loop.run_until_complete(self._page.goto(url)) - def page_snapshot(self): - return self._page.content() + def page_html(self): + return self._loop.run_until_complete(self._page.content()) def page_screenshot(self): - scr_bytes = self._page.screenshot() + scr_bytes = self._loop.run_until_complete(self._page.screenshot()) return Image.open(BytesIO(scr_bytes)) + def page_axtree(self): + return "" + def step(self, action: ToolCallAction): fn = self._actions[action.function.name] - action_result = fn(**action.function.arguments) - snapshot = self.page_snapshot() + action_result = self._loop.run_until_complete(fn(**action.function.arguments)) + html = self.page_html() screenshot = self.page_screenshot() + axtree = self.page_axtree() return { - "pruned_html": f"{action_result or ''}\n{snapshot}", - "axtree_txt": snapshot, + "tool_result": action_result, + "pruned_html": html, + "axtree_txt": axtree, "screenshot": screenshot, } + def actions(self) -> tuple[ToolSpec]: specs = [ToolSpec.from_function(fn) for fn in self._actions.values()] return tuple(specs) def close(self): - self._browser.close() + self._loop.run_until_complete(self._browser.close()) From d42dfd7a42ac730979d9d0ec1af53f0bc8fb87e6 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 16:55:12 +0000 Subject: [PATCH 26/61] fixes --- src/agentlab/backends/browser/env.py | 2 +- src/agentlab/benchmarks/miniwob/task.py | 6 ++++++ src/agentlab/benchmarks/web_task.py | 3 +++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 3c403b86..333bbeca 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -29,7 +29,7 @@ def reset(self, seed: int): self.backend.goto(self.task.url) setup_js = self.task.get_setup_js() if setup_js: - self.goal = self.backend.run_js(setup_js) + self.goal = self.task.parse_setup_result(self.backend.run_js(setup_js)) logger.info(f"Task goal: {self.goal}") html = self.backend.page_html() screenshot = self.backend.page_screenshot() diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index b2e42ade..9de260dd 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -107,6 +107,12 @@ def get_setup_js(self) -> str: """ return f"async () => {{{js}}}" + def parse_setup_result(self, setup_result: str | dict | list) -> str: + if isinstance(setup_result, dict): + return setup_result["utterance"] + else: + return setup_result + def get_teardown_js(self) -> str: return "" diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py index 930c1450..67ff07ec 100644 --- a/src/agentlab/benchmarks/web_task.py +++ b/src/agentlab/benchmarks/web_task.py @@ -19,6 +19,9 @@ def filter_actions(cls, actions: list[ToolSpec]) -> list[str]: def get_setup_js(self) -> str: raise NotImplementedError + def parse_setup_result(self, setup_result: str | dict | list) -> str: + raise NotImplementedError + def get_teardown_js(self) -> str: raise NotImplementedError From 55da7cff96d6eb379d090f1eef568a553d345b3d Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 17:10:28 +0000 Subject: [PATCH 27/61] format --- src/agentlab/agents/tapeagent/conf/miniwob.yaml | 2 +- src/agentlab/backends/browser/env.py | 2 +- src/agentlab/backends/browser/mcp_playwright.py | 2 +- src/agentlab/benchmarks/miniwob/task.py | 4 +++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/agentlab/agents/tapeagent/conf/miniwob.yaml b/src/agentlab/agents/tapeagent/conf/miniwob.yaml index a9d82712..e7e00ecd 100644 --- a/src/agentlab/agents/tapeagent/conf/miniwob.yaml +++ b/src/agentlab/agents/tapeagent/conf/miniwob.yaml @@ -6,4 +6,4 @@ defaults: name: miniwob comment: MiniWob Agent parallel_backend: ray -n_jobs: 32 \ No newline at end of file +n_jobs: 16 \ No newline at end of file diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 333bbeca..818e8fd2 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -10,6 +10,7 @@ logger = logging.getLogger(__name__) + class BrowserEnv(AbstractEnv): def __init__( self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0 @@ -65,7 +66,6 @@ def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, d observation = self._step(action) observation = self.task.obs_postprocess(observation) - action_exec_stop = time.time() self._turns += 1 logger.info(f"Obs: {observation}") diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 1705bfc1..19bfe7c5 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -43,7 +43,7 @@ def step(self, action: ToolCallAction) -> dict: } def page_html(self) -> str: - return "" + return "" def page_axtree(self) -> str: contents = self.call_tool("browser_snapshot", {}) diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index 9de260dd..36d5e34e 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -144,7 +144,9 @@ def parse_validation_result(self, validation_result: str | list) -> tuple[float, def obs_postprocess(self, obs: dict) -> dict: screenshot: Image.Image | None = obs.get("screenshot", None) if screenshot is not None: - obs["screenshot"] = screenshot.crop((0, 0, 332, 214)) # crop to 332x214 because this is the viewport size for MiniWob + obs["screenshot"] = screenshot.crop( + (0, 0, 332, 214) + ) # crop to 332x214 because this is the viewport size for MiniWob return obs From 1f090c2c9c1cc9f978ecca0f7e0513f1e2211448 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 17:56:30 +0000 Subject: [PATCH 28/61] fix pw actions --- src/agentlab/backends/browser/playwright.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index f38ac9ce..b05b5c7a 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -43,11 +43,11 @@ async def browser_press_key(self, key: str): """ await self._page.keyboard.press(key) - async def browser_type(self, text: str): + async def browser_type(self, selector: str, text: str): """ Type text into the focused element. """ - await self._page.type(text) + await self._page.type(selector, text) async def browser_click(self, selector: str): """ @@ -73,17 +73,17 @@ async def browser_hover(self, selector: str): """ await self._page.hover(selector) - async def browser_select_option(self, selector: str): + async def browser_select_option(self, selector: str, value: str): """ Select an option from a given element. """ - await self._page.select_option(selector) + await self._page.select_option(selector, value) async def browser_mouse_click_xy(self, x: int, y: int): """ Click at a given x, y coordinate using the mouse. """ - await self._page.mouse.click(x, y) + await self._page.mouse.click(x, y, delay=100) def run_js(self, js: str): js_result = self._loop.run_until_complete(self._page.evaluate(js)) From f2c480a8bb47fcddba83c835644d2cc8e9814123 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 18:18:22 +0000 Subject: [PATCH 29/61] fix tapeagent --- src/agentlab/agents/tapeagent/agent.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py index b636f26e..97017085 100644 --- a/src/agentlab/agents/tapeagent/agent.py +++ b/src/agentlab/agents/tapeagent/agent.py @@ -1,7 +1,7 @@ import logging import tempfile from dataclasses import dataclass -from typing import Literal +from typing import Any, Literal import bgym import hydra @@ -47,11 +47,17 @@ class AgentResponse(Thought): kind: Literal["agent_response"] = "agent_response" response: str + def llm_view(self, **kwargs) -> str: + return self.response + class AgentThinking(Thought): kind: Literal["agent_thinking"] = "agent_thinking" thinking: str + def llm_view(self, **kwargs) -> str: + return self.thinking + class Tape(BaseTape): metadata: ExtendedMetadata = Field(default_factory=ExtendedMetadata) # type: ignore @@ -202,7 +208,10 @@ def __init__(self, agent: Agent): self.agent = agent self.tape = Tape(steps=[]) - def obs_preprocessor(self, obs: Observation | list[Observation] | dict) -> list[Observation]: + def obs_preprocessor(self, obs: Any) -> list[Observation]: + return obs + + def obs_to_steps(self, obs: Observation | list[Observation] | dict) -> list[Observation]: if isinstance(obs, Observation): obs = [obs] if isinstance(obs, dict): @@ -231,8 +240,10 @@ def obs_preprocessor(self, obs: Observation | list[Observation] | dict) -> list[ logger.info(colored(f"Observations:\n{obs_view}", "green")) return obs - def get_action(self, obs: Observation | list[Observation]) -> tuple[Action, TapeAgentInfo]: - self.tape += obs # type: ignore + def get_action( + self, obs: Observation | list[Observation] | dict + ) -> tuple[Action, TapeAgentInfo]: + self.tape += self.obs_to_steps(obs) thoughts: list[Thought] = [] action = None while not action: @@ -250,7 +261,8 @@ def get_action(self, obs: Observation | list[Observation]) -> tuple[Action, Tape # there could be control flow steps for switching nodes and if clauses logger.info(f"Other step: {type(event.step)}") logger.info(f"Tape after run: ({len(self.tape)}) {[type(s).__name__ for s in self.tape]}") - return (action, TapeAgentInfo(thoughts=thoughts)) + think_str = "\n".join([t.llm_view() for t in thoughts]) + return (action, {"thoughts": thoughts, "think": think_str}) @property def final_tape(self) -> Tape: From 8be928a2731239118c2385605db1bd5f3c89f509 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 18 Nov 2025 19:49:02 +0000 Subject: [PATCH 30/61] fixes --- experiments/run_miniwob.py | 11 ++++++----- src/agentlab/agents/tapeagent/agent.py | 3 ++- src/agentlab/agents/tapeagent/conf/agent/react.yaml | 10 +++++----- src/agentlab/backends/browser/base.py | 3 +++ src/agentlab/experiments/loop.py | 2 ++ 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index c9f02f55..424bb100 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -4,11 +4,11 @@ from bgym import DEFAULT_BENCHMARKS from dotenv import load_dotenv -from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_4o +from agentlab.agents.generic_agent.agent_configs import GPT5_MINI_FLAGS from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config from agentlab.backends.browser.mcp_playwright import MCPPlaywright -from agentlab.backends.browser.playwright import PlaywrightSyncBackend +from agentlab.backends.browser.playwright import AsyncPlaywright from agentlab.benchmarks.miniwob import MiniWobBenchmark from agentlab.experiments.study import make_study from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT @@ -22,11 +22,12 @@ config = load_config("miniwob") # benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) - benchmark = MiniWobBenchmark(backend=MCPPlaywright()) + # benchmark = MiniWobBenchmark(backend=MCPPlaywright()) + benchmark = MiniWobBenchmark(backend=AsyncPlaywright()) # agent_args = GenericAgentArgs( - # chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/openai/gpt-5-mini"], - # flags=FLAGS_GPT_4o, + # chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"], + # flags=GPT5_MINI_FLAGS, # ) # agent_args.flags.obs.use_ax_tree = False # agent_args.flags.obs.use_html = True diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py index 97017085..ec4f4a2d 100644 --- a/src/agentlab/agents/tapeagent/agent.py +++ b/src/agentlab/agents/tapeagent/agent.py @@ -12,6 +12,7 @@ from tapeagents.agent import Agent from tapeagents.core import ( Action, + ControlFlow, LLMOutputParsingFailureAction, Observation, SetNextNode, @@ -251,7 +252,7 @@ def get_action( if not event.step: continue self.tape = self.tape.append(event.step) - if isinstance(event.step, Thought): + if isinstance(event.step, Thought) and not isinstance(event.step, ControlFlow): thoughts.append(event.step) logger.info(f"Thought: {event.step.llm_view()}") elif isinstance(event.step, Action) and not action: # we use first action only diff --git a/src/agentlab/agents/tapeagent/conf/agent/react.yaml b/src/agentlab/agents/tapeagent/conf/agent/react.yaml index 0ec8689c..2f5b576e 100644 --- a/src/agentlab/agents/tapeagent/conf/agent/react.yaml +++ b/src/agentlab/agents/tapeagent/conf/agent/react.yaml @@ -10,15 +10,15 @@ nodes: system_prompt: | You are an expert AI Agent trained to assist users with complex web tasks. Your role is to understand the goal, perform actions until the goal is accomplished and respond in a helpful and accurate manner. - Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. + Keep your replies brief, concise, direct and on topic. Prioritize clarity and avoid over-elaboration. Do not express emotions or opinions. guidance: | Think along the following lines: 1. Summarize the last observation and describe the visible changes in the state. - 2. Evaluate action success, explain impact on task/plan. - 3. If there are any errors, describe them and propose alternative. - 4. List next steps to move towards the goaland propose next immediate action. - The produce the function call that performs the proposed step. If the task is complete, produce the final step. + 2. Evaluate action success, explain impact on task and next steps. + 3. If you see any errors in the last observation, think about it. If there is no error, just move on. + 4. List next steps to move towards the goal and propose next immediate action. + Then produce the function call that performs the proposed action. If the task is complete, produce the final step. steps: - tapeagents.core.FinalStep next_node: react \ No newline at end of file diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index d5cc0151..0c0ce20a 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -40,6 +40,9 @@ class ToolCallAction(BaseModel): id: str = "" function: FunctionCall + def llm_view(self, **kwargs) -> str: + return self.model_dump_json(indent=2) + class ToolSpec(BaseModel): """ diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index 865fb178..8fb32005 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -23,6 +23,7 @@ from browsergym.experiments.utils import count_tokens from dataclasses_json import DataClassJsonMixin from PIL import Image +from pydantic import BaseModel from tqdm import tqdm from agentlab.backends.browser.env import BrowserEnvArgs @@ -411,6 +412,7 @@ def run(self): logger.debug(f"Starting step {step_info.step}.") step_info.profiling.agent_start = time.time() action, step_info.agent_info = agent.get_action(step_info.obs.copy()) + step_info.action = action.model_dump_json(indent=2) if isinstance(action, BaseModel) else str(action) step_info.profiling.agent_stop = time.time() if step_info.agent_info.get("think", None): logger.info(f"Agent thought: {step_info.agent_info['think']}") From 963c999fbd509ff6c0bdc0629b4210e4e219f137 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 19 Nov 2025 17:24:55 +0000 Subject: [PATCH 31/61] better final step, pass loop backend init if needed, click timeouts --- src/agentlab/backends/browser/env.py | 8 ++++++++ src/agentlab/backends/browser/playwright.py | 14 +++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 818e8fd2..241bba98 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -11,6 +11,12 @@ logger = logging.getLogger(__name__) +def final_step(): + """ + Finish the task execution. + """ + pass + class BrowserEnv(AbstractEnv): def __init__( self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0 @@ -117,6 +123,8 @@ def actions(self) -> list[ToolSpec]: logger.info( f"Filtered {len(filtered_actions)} actions out of {len(all_actions)} for task {self.task.dataset}" ) + final_step_action = ToolSpec.from_function(final_step) + filtered_actions.append(final_step_action) return filtered_actions diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index b05b5c7a..3d075e9e 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -28,8 +28,8 @@ def model_post_init(self, __context: Any): "browser_mouse_click_xy": self.browser_mouse_click_xy, } - def initialize(self): - self._loop = asyncio.get_event_loop() + def initialize(self, loop: asyncio.AbstractEventLoop | None = None): + self._loop = loop or asyncio.get_event_loop() self._loop.run_until_complete(self.ainitialize()) async def ainitialize(self): @@ -53,7 +53,7 @@ async def browser_click(self, selector: str): """ Click on a selector. """ - await self._page.click(selector) + await self._page.click(selector, timeout=3000, strict=True) async def browser_drag(self, from_selector: str, to_selector: str): """ @@ -71,7 +71,7 @@ async def browser_hover(self, selector: str): """ Hover over a given element. """ - await self._page.hover(selector) + await self._page.hover(selector, timeout=3000, strict=True) async def browser_select_option(self, selector: str, value: str): """ @@ -105,7 +105,11 @@ def page_axtree(self): def step(self, action: ToolCallAction): fn = self._actions[action.function.name] - action_result = self._loop.run_until_complete(fn(**action.function.arguments)) + try: + action_result = self._loop.run_until_complete(fn(**action.function.arguments)) + except Exception as e: + logger.error(f"Error executing action {action.function.name}: {e}") + action_result = f"Error executing action {action.function.name}: {e}" html = self.page_html() screenshot = self.page_screenshot() axtree = self.page_axtree() From d1d5c6d45d96ff6577ac1e9695f85d5b2c003d72 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 09:57:39 +0000 Subject: [PATCH 32/61] return flatten axtree from playwright backend --- src/agentlab/backends/browser/playwright.py | 55 ++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index 3d075e9e..71244806 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -101,7 +101,9 @@ def page_screenshot(self): return Image.open(BytesIO(scr_bytes)) def page_axtree(self): - return "" + axtree = self._loop.run_until_complete(self._page.accessibility.snapshot()) + flat_axtree = flatten_axtree(axtree) + return flat_axtree def step(self, action: ToolCallAction): fn = self._actions[action.function.name] @@ -126,3 +128,54 @@ def actions(self) -> tuple[ToolSpec]: def close(self): self._loop.run_until_complete(self._browser.close()) + + +def flatten_axtree(axtree_dict: dict | None) -> str: + """ + Traverses accessibility tree dictionary and returns its markdown view. + + Args: + axtree_dict: Accessibility tree from playwright page.accessibility.snapshot() + Structure: dict with 'role', 'name', 'value', 'children' keys + + Returns: + String representation of the accessibility tree in markdown format + """ + if axtree_dict is None: + return "" + + def traverse_node(node: dict, depth: int = 0) -> list[str]: + """Recursively traverse the accessibility tree and build markdown lines.""" + lines = [] + indent = " " * depth # 2 spaces per indent level + + # Extract node information + role = node.get("role", "") + name = node.get("name", "") + value = node.get("value", "") + + # Build the node representation + parts = [] + if role: + parts.append(f"{role}:") + if name.strip(): + parts.append(f"{name}") + if value: + parts.append(f"[value: {value}]") + + # Only add line if there's meaningful content + if parts: + line = f"{indent}{' '.join(parts)}" + lines.append(line) + + # Recursively process children + children = node.get("children", []) + for child in children: + child_lines = traverse_node(child, depth + 1) + lines.extend(child_lines) + + return lines + + # Start traversal from root + all_lines = traverse_node(axtree_dict, depth=0) + return "\n".join(all_lines) From 6664b691bca2104fdb3b04e690203b052a444e59 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 13:48:30 +0000 Subject: [PATCH 33/61] pass backend cls, instantiate backend in task --- src/agentlab/backends/browser/env.py | 13 +++++++++---- src/agentlab/benchmarks/miniwob/benchmark.py | 10 +++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 241bba98..ed33bc17 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -133,18 +133,23 @@ class BrowserEnvArgs(AbstractEnvArgs): task: AbstractWebTask task_seed: int task_name: str - backend: BrowserBackend + backend_cls: type[BrowserBackend] def __init__( - self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, task_seed: int = 0 + self, + task_name: str, + task: AbstractWebTask, + backend_cls: type[BrowserBackend], + task_seed: int = 0, ): self.task_name = task_name self.task = task self.task_seed = task_seed - self.backend = backend + self.backend_cls = backend_cls def make_env(self, exp_dir: Path) -> BrowserEnv: + backend = self.backend_cls() env = BrowserEnv( - task_name=self.task_name, task=self.task, backend=self.backend, seed=self.task_seed + task_name=self.task_name, task=self.task, backend=backend, seed=self.task_seed ) return env diff --git a/src/agentlab/benchmarks/miniwob/benchmark.py b/src/agentlab/benchmarks/miniwob/benchmark.py index 7cacc309..1c38bbcd 100644 --- a/src/agentlab/benchmarks/miniwob/benchmark.py +++ b/src/agentlab/benchmarks/miniwob/benchmark.py @@ -1,7 +1,7 @@ import logging from typing import Any -from pydantic import ConfigDict, Field +from pydantic import ConfigDict from agentlab.actions import ToolsActionSet from agentlab.backends.browser.base import BrowserBackend @@ -15,20 +15,20 @@ class MiniWobBenchmark(AbstractBenchmark): model_config = ConfigDict(arbitrary_types_allowed=True) - backend: BrowserBackend + backend_cls: type[BrowserBackend] name: str = "miniwob" env_args_list: list[BrowserEnvArgs] = None # type: ignore dataset: list[MiniWobTask] = None # type: ignore is_multi_tab: bool = False - high_level_action_set_args: ToolsActionSet = None + high_level_action_set_args: ToolsActionSet = None # type: ignore def model_post_init(self, __context: Any) -> None: - self.name = f"miniwob_{self.backend.__class__.__name__.lower()}" + self.name = f"miniwob_{self.backend_cls.__name__.lower()}" self.env_args_list = [] if self.dataset is None: self.dataset = get_miniwob_tasks() for task in self.dataset: name = f"miniwob.{task.task_id}" - env_args = BrowserEnvArgs(task_name=name, task=task, backend=self.backend) + env_args = BrowserEnvArgs(task_name=name, task=task, backend_cls=self.backend_cls) self.env_args_list.append(env_args) logger.info(f"Loaded {len(self.env_args_list)} miniwob tasks") From ffebf6b9621e69880b2f423716efb4127cb0c02f Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 13:48:49 +0000 Subject: [PATCH 34/61] get html from playwright mcp --- src/agentlab/backends/browser/mcp_playwright.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 19bfe7c5..eebdb995 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -43,7 +43,15 @@ def step(self, action: ToolCallAction) -> dict: } def page_html(self) -> str: - return "" + contents = self.call_tool("browser_evaluate", {"function": "document.documentElement.outerHTML"}) + raw_response = "\n".join([c.text for c in contents if c.type == "text"]) + try: + _, half_response = raw_response.split("### Result", maxsplit=1) + result_str, _ = half_response.split("\n### Ran", maxsplit=1) + return result_str.strip() + except Exception as e: + logger.error(f"Error parsing page_html result: {e}. Raw result: {raw_response}") + return "" def page_axtree(self) -> str: contents = self.call_tool("browser_snapshot", {}) From 3378b56ebf94031259556ebc97b4664bb5b79a75 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 13:49:12 +0000 Subject: [PATCH 35/61] better abstract class --- src/agentlab/backends/browser/base.py | 30 ++++++++++++++++++--------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index 0c0ce20a..65c23875 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -1,4 +1,5 @@ import logging +from abc import ABC, abstractmethod from typing import Any, Callable, Literal from langchain_core.utils.function_calling import convert_to_openai_tool @@ -73,30 +74,39 @@ def from_function(cls, function: Callable): return cls.model_validate(convert_to_openai_tool(function)) -class BrowserBackend(BaseModel): +class BrowserBackend(BaseModel, ABC): + @abstractmethod def initialize(self) -> None: - raise NotImplementedError + pass + @abstractmethod def run_js(self, js: str): - raise NotImplementedError + pass + @abstractmethod def goto(self, url: str) -> str: - raise NotImplementedError + pass + @abstractmethod def page_html(self) -> str: - raise NotImplementedError + pass + @abstractmethod def page_screenshot(self) -> Image: - raise NotImplementedError + pass + @abstractmethod def page_axtree(self) -> str: - raise NotImplementedError + pass + @abstractmethod def step(self, action: ToolCallAction) -> str: - raise NotImplementedError + pass + @abstractmethod def actions(self) -> tuple[ToolSpec]: - raise NotImplementedError + pass + @abstractmethod def close(self) -> None: - raise NotImplementedError + pass From 323978d21a116e61e568fd80363ea45f8ad72183 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 13:49:41 +0000 Subject: [PATCH 36/61] init files --- src/agentlab/backends/__init__.py | 0 src/agentlab/backends/browser/__init__.py | 18 ++++++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 src/agentlab/backends/__init__.py create mode 100644 src/agentlab/backends/browser/__init__.py diff --git a/src/agentlab/backends/__init__.py b/src/agentlab/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/agentlab/backends/browser/__init__.py b/src/agentlab/backends/browser/__init__.py new file mode 100644 index 00000000..7687462f --- /dev/null +++ b/src/agentlab/backends/browser/__init__.py @@ -0,0 +1,18 @@ +from agentlab.backends.browser.base import BrowserBackend, FunctionCall, ToolCallAction, ToolSpec +from agentlab.backends.browser.env import BrowserEnv, BrowserEnvArgs +from agentlab.backends.browser.mcp import MCPBrowserBackend, MCPClient +from agentlab.backends.browser.mcp_playwright import MCPPlaywright +from agentlab.backends.browser.playwright import AsyncPlaywright + +__all__ = [ + "BrowserBackend", + "FunctionCall", + "ToolCallAction", + "ToolSpec", + "BrowserEnv", + "BrowserEnvArgs", + "MCPBrowserBackend", + "MCPClient", + "MCPPlaywright", + "AsyncPlaywright", +] From e2cd4b9038d710d15301816b8fa76c11699faad0 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 13:56:32 +0000 Subject: [PATCH 37/61] add base benchmark class to study --- src/agentlab/experiments/study.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agentlab/experiments/study.py b/src/agentlab/experiments/study.py index 391f419c..abcc3d6c 100644 --- a/src/agentlab/experiments/study.py +++ b/src/agentlab/experiments/study.py @@ -17,7 +17,7 @@ from agentlab.agents.agent_args import AgentArgs from agentlab.analyze import inspect_results -from agentlab.benchmarks.abstract_env import AbstractEnvArgs +from agentlab.benchmarks.abstract_env import AbstractBenchmark, AbstractEnvArgs from agentlab.experiments import reproducibility_util as repro from agentlab.experiments.exp_utils import RESULTS_DIR, add_dependencies from agentlab.experiments.launch_exp import ( @@ -33,7 +33,7 @@ def make_study( agent_args: list[AgentArgs] | AgentArgs, - benchmark: Benchmark | str, + benchmark: Benchmark | AbstractBenchmark | str, logging_level=logging.WARNING, logging_level_stdout=logging.WARNING, suffix="", From 20502a8bb4f6321c00b88a89162fdde9ea2ec67a Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 13:58:29 +0000 Subject: [PATCH 38/61] move action and tool classes to actions module --- src/agentlab/actions.py | 77 +++++++++++++++++-- src/agentlab/agents/tapeagent/agent.py | 2 +- src/agentlab/backends/browser/__init__.py | 3 +- src/agentlab/backends/browser/base.py | 69 +---------------- src/agentlab/backends/browser/env.py | 4 +- src/agentlab/backends/browser/mcp.py | 3 +- .../backends/browser/mcp_playwright.py | 3 +- src/agentlab/backends/browser/playwright.py | 3 +- src/agentlab/benchmarks/web_task.py | 2 +- 9 files changed, 84 insertions(+), 82 deletions(-) diff --git a/src/agentlab/actions.py b/src/agentlab/actions.py index 50301d61..60d21e11 100644 --- a/src/agentlab/actions.py +++ b/src/agentlab/actions.py @@ -1,14 +1,81 @@ import json import logging +from typing import Any, Callable, Literal from bgym import AbstractActionSet +from langchain_core.utils.function_calling import convert_to_openai_tool +from pydantic import BaseModel -from agentlab.backends.browser.base import FunctionCall, ToolCallAction, ToolSpec from agentlab.llm.llm_utils import parse_html_tags_raise logger = logging.getLogger(__name__) +class FunctionSpec(BaseModel): + """ + A class representing the specification of a function. + + Attributes: + name (str): The name of the function. + description (str): A brief description of the function. + parameters (dict): A dictionary containing the parameters of the function. + """ + + name: str + description: str + parameters: dict + + +class FunctionCall(BaseModel): + """ + A class representing a function call. + + Attributes: + name (str): The name of the function being called. + arguments (Any): The arguments to be passed to the function. + """ + + name: str + arguments: Any + + +class ToolCallAction(BaseModel): + id: str = "" + function: FunctionCall + + def llm_view(self, **kwargs) -> str: + return self.model_dump_json(indent=2) + + +class ToolSpec(BaseModel): + """ + ToolSpec is a model that represents a tool specification with a type and a function. + + Attributes: + type (Literal["function"]): The type of the tool, which is always "function". + function (FunctionSpec): The specification of the function. + """ + + type: Literal["function"] = "function" + function: FunctionSpec + + def description(self) -> str: + return f"{self.function.name} - {self.function.description}" + + @classmethod + def from_function(cls, function: Callable): + """ + Creates an instance of the class by validating the model from a given function. + + Args: + function (Callable): The function to be converted and validated. + + Returns: + (ToolSpec): An instance of the class with the validated model. + """ + return cls.model_validate(convert_to_openai_tool(function)) + + class ToolsActionSet(AbstractActionSet): multiaction: bool = False strict: bool = False @@ -49,9 +116,7 @@ def parse_action(cls, llm_output: str) -> ToolCallAction: if "" in llm_output: content_dict, valid, retry_message = parse_html_tags_raise(llm_output, keys=["action"]) if not valid or "action" not in content_dict: - raise ValueError( - f"Invalid action: llm_output: {llm_output}, retry_message: {retry_message}" - ) + raise ValueError(f"Invalid action: llm_output: {llm_output}, retry_message: {retry_message}") action_str = content_dict["action"] else: action_str = llm_output @@ -59,9 +124,7 @@ def parse_action(cls, llm_output: str) -> ToolCallAction: action_dict = json.loads(action_str) except json.JSONDecodeError: raise ValueError(f"Failed to parse action: {action_str}") - return ToolCallAction( - function=FunctionCall(name=action_dict["name"], arguments=action_dict["arguments"]) - ) + return ToolCallAction(function=FunctionCall(name=action_dict["name"], arguments=action_dict["arguments"])) def to_python_code(self, action) -> str: return action diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py index ec4f4a2d..3627682c 100644 --- a/src/agentlab/agents/tapeagent/agent.py +++ b/src/agentlab/agents/tapeagent/agent.py @@ -27,8 +27,8 @@ from tapeagents.tool_calling import ToolSpec from termcolor import colored +from agentlab.actions import ToolSpec as AgentlabToolSpec from agentlab.agents.agent_args import AgentArgs -from agentlab.backends.browser.base import ToolSpec as AgentlabToolSpec logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) diff --git a/src/agentlab/backends/browser/__init__.py b/src/agentlab/backends/browser/__init__.py index 7687462f..bed8c2b4 100644 --- a/src/agentlab/backends/browser/__init__.py +++ b/src/agentlab/backends/browser/__init__.py @@ -1,4 +1,5 @@ -from agentlab.backends.browser.base import BrowserBackend, FunctionCall, ToolCallAction, ToolSpec +from agentlab.actions import FunctionCall, ToolCallAction, ToolSpec +from agentlab.backends.browser.base import BrowserBackend from agentlab.backends.browser.env import BrowserEnv, BrowserEnvArgs from agentlab.backends.browser.mcp import MCPBrowserBackend, MCPClient from agentlab.backends.browser.mcp_playwright import MCPPlaywright diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index 65c23875..33d5da21 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -1,77 +1,12 @@ import logging from abc import ABC, abstractmethod -from typing import Any, Callable, Literal -from langchain_core.utils.function_calling import convert_to_openai_tool from PIL import Image from pydantic import BaseModel -logger = logging.getLogger(__name__) - - -class FunctionCall(BaseModel): - """ - A class representing a function call. - - Attributes: - name (str): The name of the function being called. - arguments (Any): The arguments to be passed to the function. - """ - - name: str - arguments: Any - - -class FunctionSpec(BaseModel): - """ - A class representing the specification of a function. - - Attributes: - name (str): The name of the function. - description (str): A brief description of the function. - parameters (dict): A dictionary containing the parameters of the function. - """ - - name: str - description: str - parameters: dict - +from agentlab.actions import ToolCallAction, ToolSpec -class ToolCallAction(BaseModel): - id: str = "" - function: FunctionCall - - def llm_view(self, **kwargs) -> str: - return self.model_dump_json(indent=2) - - -class ToolSpec(BaseModel): - """ - ToolSpec is a model that represents a tool specification with a type and a function. - - Attributes: - type (Literal["function"]): The type of the tool, which is always "function". - function (FunctionSpec): The specification of the function. - """ - - type: Literal["function"] = "function" - function: FunctionSpec - - def description(self) -> str: - return f"{self.function.name} - {self.function.description}" - - @classmethod - def from_function(cls, function: Callable): - """ - Creates an instance of the class by validating the model from a given function. - - Args: - function (Callable): The function to be converted and validated. - - Returns: - (ToolSpec): An instance of the class with the validated model. - """ - return cls.model_validate(convert_to_openai_tool(function)) +logger = logging.getLogger(__name__) class BrowserBackend(BaseModel, ABC): diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index ed33bc17..b3fe12b6 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -3,8 +3,8 @@ from dataclasses import dataclass from pathlib import Path -from agentlab.actions import ToolsActionSet -from agentlab.backends.browser.base import BrowserBackend, ToolCallAction, ToolSpec +from agentlab.actions import ToolCallAction, ToolsActionSet, ToolSpec +from agentlab.backends.browser.base import BrowserBackend from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs from agentlab.benchmarks.web_task import AbstractWebTask diff --git a/src/agentlab/backends/browser/mcp.py b/src/agentlab/backends/browser/mcp.py index 84129374..57cffbb7 100644 --- a/src/agentlab/backends/browser/mcp.py +++ b/src/agentlab/backends/browser/mcp.py @@ -10,7 +10,8 @@ from mcp import Tool as MCPTool from mcp.types import CallToolResult, ImageContent, TextContent -from agentlab.backends.browser.base import BrowserBackend, FunctionSpec, ToolCallAction, ToolSpec +from agentlab.actions import FunctionSpec, ToolCallAction, ToolSpec +from agentlab.backends.browser.base import BrowserBackend logger = logging.getLogger(__name__) diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index eebdb995..18a96daa 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -4,7 +4,8 @@ from PIL import Image -from agentlab.backends.browser.mcp import MCPBrowserBackend, ToolCallAction +from agentlab.actions import ToolCallAction +from agentlab.backends.browser.mcp import MCPBrowserBackend logger = logging.getLogger(__name__) diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index 71244806..f834b502 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -6,7 +6,8 @@ from PIL import Image from playwright.async_api import Browser, Page, async_playwright -from agentlab.backends.browser.base import BrowserBackend, ToolCallAction, ToolSpec +from agentlab.actions import ToolCallAction, ToolSpec +from agentlab.backends.browser.base import BrowserBackend logger = logging.getLogger(__name__) diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py index 67ff07ec..56d0d9b1 100644 --- a/src/agentlab/benchmarks/web_task.py +++ b/src/agentlab/benchmarks/web_task.py @@ -2,7 +2,7 @@ from pydantic import BaseModel -from agentlab.backends.browser.base import ToolSpec +from agentlab.actions import ToolSpec class AbstractWebTask(BaseModel): From dfbc0058c31945920a379bcc5f4c6d32cbd43f53 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 13:58:47 +0000 Subject: [PATCH 39/61] improve entrypoint --- experiments/run_miniwob.py | 57 +++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index 424bb100..cfbd9fd0 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -1,5 +1,7 @@ +import argparse import logging import os +import sys from bgym import DEFAULT_BENCHMARKS from dotenv import load_dotenv @@ -18,22 +20,51 @@ logger = logging.getLogger(__name__) load_dotenv() -if __name__ == "__main__": - config = load_config("miniwob") - # benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) - # benchmark = MiniWobBenchmark(backend=MCPPlaywright()) - benchmark = MiniWobBenchmark(backend=AsyncPlaywright()) - # agent_args = GenericAgentArgs( - # chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"], - # flags=GPT5_MINI_FLAGS, - # ) - # agent_args.flags.obs.use_ax_tree = False - # agent_args.flags.obs.use_html = True - # agent_args.flags.obs.use_focused_element = False - agent_args = TapeAgentArgs(agent_name=config.name, config=config) +def parse_args(): + parser = argparse.ArgumentParser(description="Run MiniWob benchmark experiments") + parser.add_argument( + "--backend", + choices=["playwright", "mcp", "bgym"], + default="playwright", + help="Browser backend to use (default: playwright)", + ) + parser.add_argument( + "--agent", + choices=["tape", "generic"], + default="tape", + help="Agent type to use (default: tape)", + ) + parser.add_argument( + "--config", + type=str, + default="miniwob", + help="Hydra config name to load (default: miniwob)", + ) + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + config = load_config(args.config) + if args.backend == "bgym": + benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) + elif args.backend == "playwright": + benchmark = MiniWobBenchmark(backend_cls=AsyncPlaywright) + elif args.backend == "mcp": + benchmark = MiniWobBenchmark(backend_cls=MCPPlaywright) + else: + raise ValueError(f"Unknown backend: {args.backend}") + + if args.agent == "generic": + agent_args = GenericAgentArgs( + chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"], + flags=GPT5_MINI_FLAGS, + ) + else: + agent_args = TapeAgentArgs(agent_name=config.name, config=config) study = make_study( benchmark=benchmark, From 7a682a088f239af158ffa4caf622200f026429e6 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 16:12:16 +0000 Subject: [PATCH 40/61] new react toolcall agent, inspired by tapeagents but independent --- experiments/run_miniwob.py | 9 +- src/agentlab/agents/react_toolcall_agent.py | 218 ++++++++++++++++++++ src/agentlab/backends/browser/env.py | 3 - 3 files changed, 226 insertions(+), 4 deletions(-) create mode 100644 src/agentlab/agents/react_toolcall_agent.py diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index cfbd9fd0..ea6daf8e 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -8,11 +8,13 @@ from agentlab.agents.generic_agent.agent_configs import GPT5_MINI_FLAGS from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs +from agentlab.agents.react_toolcall_agent import AgentConfig, LLMArgs, ReactToolCallAgentArgs from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config from agentlab.backends.browser.mcp_playwright import MCPPlaywright from agentlab.backends.browser.playwright import AsyncPlaywright from agentlab.benchmarks.miniwob import MiniWobBenchmark from agentlab.experiments.study import make_study +from agentlab.llm.chat_api import BaseModelArgs from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s" @@ -32,7 +34,7 @@ def parse_args(): ) parser.add_argument( "--agent", - choices=["tape", "generic"], + choices=["tape", "generic", "react"], default="tape", help="Agent type to use (default: tape)", ) @@ -63,6 +65,11 @@ def parse_args(): chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"], flags=GPT5_MINI_FLAGS, ) + elif args.agent == "react": + agent_args = ReactToolCallAgentArgs( + llm_args=LLMArgs(model_name="azure/gpt-5-mini", temperature=1.0, max_total_tokens=128000), + config=AgentConfig(), + ) else: agent_args = TapeAgentArgs(agent_name=config.name, config=config) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py new file mode 100644 index 00000000..13bfd1a0 --- /dev/null +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -0,0 +1,218 @@ +import json +import logging +import pprint +from dataclasses import dataclass +from functools import partial +from typing import Callable + +from litellm import completion_with_retries +from litellm.types.utils import ChatCompletionMessageToolCall, Message, ModelResponse +from PIL import Image +from termcolor import colored + +from agentlab.actions import FunctionCall, ToolCallAction, ToolsActionSet, ToolSpec +from agentlab.agents.agent_args import AgentArgs +from agentlab.llm.chat_api import BaseModelArgs +from agentlab.llm.llm_utils import image_to_png_base64_url + +logger = logging.getLogger(__name__) + +@dataclass +class Observation: + data: dict + + def to_messages(self) -> list[dict]: + messages = [] + tool_call_id = self.data.get("tool_call_id") + if self.data.get("goal_object") and not tool_call_id: # its a first observation when there are no tool_call_id, so include goal + goal=self.data["goal_object"][0]["text"] + messages.append({ + "role": "user", + "content": f"## Goal:\n{goal}" + }) + text_obs = [] + if self.data.get("action_result"): + result=self.data["action_result"] + text_obs.append(f"Action Result:\n{result}") + if self.data.get("pruned_html"): + html=self.data["pruned_html"] + text_obs.append(f"Pruned HTML:\n{html}") + if self.data.get("axtree_txt"): + axtree=self.data["axtree_txt"] + text_obs.append(f"Accessibility Tree:\n{axtree}") + if self.data.get("last_action_error"): + error = self.data['last_action_error'] + text_obs.append(f"Action Error:\n{error}") + if text_obs: + if tool_call_id: + message = { + "role": "tool", + "tool_call_id": tool_call_id, + "content": "\n\n".join(text_obs), + } + else: + message = { + "role": "user", + "content": "\n\n".join(text_obs), + } + messages.append(message) + if self.data.get("screenshot"): + if isinstance(self.data["screenshot"], Image.Image): + image_content_url = image_to_png_base64_url(self.data["screenshot"]) + messages.append({ + "role": "user", + "content": [{"type": "image_url", "image_url": {"url": image_content_url}}], + }) + else: + raise ValueError(f"Expected Image.Image, got {type(self.data['screenshot'])}") + return messages + +@dataclass +class LLMOutput: + message: Message + def to_messages(self) -> list[Message]: + return [self.message] + +@dataclass +class SystemMessage: + message: str + def to_messages(self) -> list[dict]: + return [{"role": "system", "content": self.message}] + +@dataclass +class UserMessage: + message: str + def to_messages(self) -> list[dict]: + return [{"role": "user", "content": self.message}] + +Step = LLMOutput | Observation | SystemMessage | UserMessage + +@dataclass +class AgentConfig: + use_html: bool = True + use_axtree: bool = False + use_screenshot: bool = True + max_actions: int = 10 + max_retry: int = 4 + system_prompt: str = """ +You are an expert AI Agent trained to assist users with complex web tasks. +Your role is to understand the goal, perform actions until the goal is accomplished and respond in a helpful and accurate manner. +Keep your replies brief, concise, direct and on topic. Prioritize clarity and avoid over-elaboration. +Do not express emotions or opinions. +""" + guidance: str = """ +Think along the following lines: +1. Summarize the last observation and describe the visible changes in the state. +2. Evaluate action success, explain impact on task and next steps. +3. If you see any errors in the last observation, think about it. If there is no error, just move on. +4. List next steps to move towards the goal and propose next immediate action. +Then produce the function call that performs the proposed action. If the task is complete, produce the final step. +""" + +class LLMArgs(BaseModelArgs): + reasoning_effort: str = "low" + + def make_model(self) -> Callable: + return partial( + completion_with_retries, + model=self.model_name, + temperature=self.temperature, + max_tokens=self.max_total_tokens, + max_completion_tokens=self.max_new_tokens, + reasoning_effort=self.reasoning_effort, + ) + +class ReactToolCallAgent: + def __init__(self, action_set: ToolsActionSet, llm: Callable, config: AgentConfig): + self.action_set = action_set + self.history: list[Step] = [SystemMessage(message=config.system_prompt)] + self.llm = llm + self.config = config + self.last_tool_call_id: str = "" + + def obs_preprocessor(self, obs: dict) -> dict: + if not self.config.use_html: + obs.pop("pruned_html", None) + if not self.config.use_axtree: + obs.pop("axtree_txt", None) + if not self.config.use_screenshot: + obs.pop("screenshot", None) + if self.last_tool_call_id: + obs["tool_call_id"] = self.last_tool_call_id + return obs + + def get_action(self, obs: dict) -> tuple[ToolCallAction, dict]: + prev_actions = [step for step in self.history if isinstance(step, LLMOutput)] + if len(prev_actions) >= self.config.max_actions: + logger.warning("Max actions reached, stopping agent.") + stop_action = ToolCallAction(id="stop", function=FunctionCall(name="final_step", arguments={})) + return stop_action, {} + self.history.append(Observation(data=obs)) + steps = self.history + [UserMessage(message=self.config.guidance)] + messages = [m for step in steps for m in step.to_messages()] + tools = [tool.model_dump() for tool in self.action_set.actions] + try: + logger.info(colored(f"Prompt:\n{pprint.pformat(messages, width=120)}", "blue")) + response: ModelResponse = self.llm( + tools=tools, + messages=messages, + num_retries=self.config.max_retry, + ) + message = response.choices[0].message # type: ignore + except Exception as e: + logger.exception(f"Error getting LLM response: {e}. Prompt: {messages}") + raise e + logger.info(colored(f"LLM response:\n{pprint.pformat(message, width=120)}", "green")) + self.history.append(LLMOutput(message=message)) + thoughts = self.thoughts_from_message(message) + action = self.action_from_message(message) + + return action, {"think": thoughts} + + def thoughts_from_message(self, message) -> str: + thoughts = [] + if reasoning := message.get("reasoning_content"): + logger.info(colored(f"LLM reasoning:\n{reasoning}", "yellow")) + thoughts.append(reasoning) + if blocks := message.get("thinking_blocks"): + for block in blocks: + if thinking := getattr(block, "content", None) or getattr(block, "thinking", None): + logger.info(colored(f"LLM thinking block:\n{thinking}", "yellow")) + thoughts.append(thinking) + if message.content: + logger.info(colored(f"LLM output:\n{message.content}", "cyan")) + thoughts.append(message.content) + return "\n\n".join(thoughts) + + def action_from_message(self, message) -> ToolCallAction: + if message.tool_calls: + if len(message.tool_calls) > 1: + logger.warning("Multiple tool calls found in LLM response, using the first one.") + tool_call: ChatCompletionMessageToolCall = message.tool_calls[0] + assert isinstance(tool_call.function.name, str) + try: + args = json.loads(tool_call.function.arguments) + action = ToolCallAction( + id=tool_call.id, + function=FunctionCall(name=tool_call.function.name, arguments=args) + ) + except json.JSONDecodeError as e: + logger.exception(f"Error in json parsing of tool call arguments, {e}: {tool_call.function.arguments}") + raise e + + self.last_tool_call_id = action.id + else: + raise ValueError(f"No tool call found in LLM response: {message}") + return action + + +@dataclass +class ReactToolCallAgentArgs(AgentArgs): + llm_args: LLMArgs = None # type: ignore + config: AgentConfig = None # type: ignore + + def make_agent(self, actions: list[ToolSpec]) -> ReactToolCallAgent: + llm = self.llm_args.make_model() + action_set = ToolsActionSet(actions=actions) + return ReactToolCallAgent(action_set=action_set, llm=llm, config=self.config) + diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index b3fe12b6..7dbd630d 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -50,7 +50,6 @@ def reset(self, seed: int): "focused_element_bid": "none", } obs = self.task.obs_postprocess(obs) - logger.info(f"Initial obs: {obs}") return obs, {} def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, dict]: @@ -74,8 +73,6 @@ def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, d action_exec_stop = time.time() self._turns += 1 - logger.info(f"Obs: {observation}") - truncated = self._turns >= self.max_turns if self.task.validate_per_step or finished or truncated: From 29ba1c46ecaa273d3235e514fb141a67561554cc Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 16:18:41 +0000 Subject: [PATCH 41/61] few comments --- experiments/run_miniwob.py | 4 ++-- src/agentlab/agents/react_toolcall_agent.py | 9 ++++++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index ea6daf8e..8b740a48 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -35,8 +35,8 @@ def parse_args(): parser.add_argument( "--agent", choices=["tape", "generic", "react"], - default="tape", - help="Agent type to use (default: tape)", + default="react", + help="Agent type to use (default: react)", ) parser.add_argument( "--config", diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index 13bfd1a0..6e782163 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -19,9 +19,12 @@ @dataclass class Observation: - data: dict + data: dict # expected keys: goal_object, pruned_html, axtree_txt, screenshot, last_action_error, action_result def to_messages(self) -> list[dict]: + """ + Convert the observation dictionary into a list of chat messages for Lite LLM + """ messages = [] tool_call_id = self.data.get("tool_call_id") if self.data.get("goal_object") and not tool_call_id: # its a first observation when there are no tool_call_id, so include goal @@ -69,6 +72,9 @@ def to_messages(self) -> list[dict]: @dataclass class LLMOutput: + """ + LiteLLM output message containing all the llm response details, suitable for putting back into prompt to reuse KV cache + """ message: Message def to_messages(self) -> list[Message]: return [self.message] @@ -138,6 +144,7 @@ def obs_preprocessor(self, obs: dict) -> dict: if not self.config.use_screenshot: obs.pop("screenshot", None) if self.last_tool_call_id: + # add tool_call_id to obs for linking observation to the last executed action obs["tool_call_id"] = self.last_tool_call_id return obs From d9c921618f798f2a91f1b003d8af7e7b0d8a1762 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 17:27:12 +0000 Subject: [PATCH 42/61] simplify history format --- src/agentlab/agents/react_toolcall_agent.py | 212 ++++++++---------- .../agents/tapeagent/conf/miniwob.yaml | 2 +- 2 files changed, 90 insertions(+), 124 deletions(-) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index 6e782163..ab5d36d8 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -1,11 +1,12 @@ import json import logging import pprint +import time from dataclasses import dataclass from functools import partial -from typing import Callable +from typing import Callable, Literal -from litellm import completion_with_retries +from litellm import completion from litellm.types.utils import ChatCompletionMessageToolCall, Message, ModelResponse from PIL import Image from termcolor import colored @@ -17,81 +18,24 @@ logger = logging.getLogger(__name__) -@dataclass -class Observation: - data: dict # expected keys: goal_object, pruned_html, axtree_txt, screenshot, last_action_error, action_result - - def to_messages(self) -> list[dict]: - """ - Convert the observation dictionary into a list of chat messages for Lite LLM - """ - messages = [] - tool_call_id = self.data.get("tool_call_id") - if self.data.get("goal_object") and not tool_call_id: # its a first observation when there are no tool_call_id, so include goal - goal=self.data["goal_object"][0]["text"] - messages.append({ - "role": "user", - "content": f"## Goal:\n{goal}" - }) - text_obs = [] - if self.data.get("action_result"): - result=self.data["action_result"] - text_obs.append(f"Action Result:\n{result}") - if self.data.get("pruned_html"): - html=self.data["pruned_html"] - text_obs.append(f"Pruned HTML:\n{html}") - if self.data.get("axtree_txt"): - axtree=self.data["axtree_txt"] - text_obs.append(f"Accessibility Tree:\n{axtree}") - if self.data.get("last_action_error"): - error = self.data['last_action_error'] - text_obs.append(f"Action Error:\n{error}") - if text_obs: - if tool_call_id: - message = { - "role": "tool", - "tool_call_id": tool_call_id, - "content": "\n\n".join(text_obs), - } - else: - message = { - "role": "user", - "content": "\n\n".join(text_obs), - } - messages.append(message) - if self.data.get("screenshot"): - if isinstance(self.data["screenshot"], Image.Image): - image_content_url = image_to_png_base64_url(self.data["screenshot"]) - messages.append({ - "role": "user", - "content": [{"type": "image_url", "image_url": {"url": image_content_url}}], - }) - else: - raise ValueError(f"Expected Image.Image, got {type(self.data['screenshot'])}") - return messages -@dataclass -class LLMOutput: - """ - LiteLLM output message containing all the llm response details, suitable for putting back into prompt to reuse KV cache - """ - message: Message - def to_messages(self) -> list[Message]: - return [self.message] - -@dataclass -class SystemMessage: - message: str - def to_messages(self) -> list[dict]: - return [{"role": "system", "content": self.message}] +class LLMArgs(BaseModelArgs): + reasoning_effort: Literal["minimal", "low", "medium", "high"] = "low" + num_retries: int = 3 -@dataclass -class UserMessage: - message: str - def to_messages(self) -> list[dict]: - return [{"role": "user", "content": self.message}] + def make_model(self) -> Callable: + return partial( + completion, + model=self.model_name, + temperature=self.temperature, + max_tokens=self.max_total_tokens, + max_completion_tokens=self.max_new_tokens, + reasoning_effort=self.reasoning_effort, + num_retries=self.num_retries, + tool_choice="auto", + parallel_tool_calls=False, + ) -Step = LLMOutput | Observation | SystemMessage | UserMessage @dataclass class AgentConfig: @@ -112,68 +56,90 @@ class AgentConfig: 2. Evaluate action success, explain impact on task and next steps. 3. If you see any errors in the last observation, think about it. If there is no error, just move on. 4. List next steps to move towards the goal and propose next immediate action. -Then produce the function call that performs the proposed action. If the task is complete, produce the final step. +Then produce the single function call that performs the proposed action. If the task is complete, produce the final step. """ -class LLMArgs(BaseModelArgs): - reasoning_effort: str = "low" - - def make_model(self) -> Callable: - return partial( - completion_with_retries, - model=self.model_name, - temperature=self.temperature, - max_tokens=self.max_total_tokens, - max_completion_tokens=self.max_new_tokens, - reasoning_effort=self.reasoning_effort, - ) class ReactToolCallAgent: - def __init__(self, action_set: ToolsActionSet, llm: Callable, config: AgentConfig): + def __init__( + self, action_set: ToolsActionSet, llm: Callable[..., ModelResponse], config: AgentConfig + ): self.action_set = action_set - self.history: list[Step] = [SystemMessage(message=config.system_prompt)] + self.history: list[dict | Message] = [{"role": "system", "content": config.system_prompt}] self.llm = llm self.config = config self.last_tool_call_id: str = "" def obs_preprocessor(self, obs: dict) -> dict: - if not self.config.use_html: - obs.pop("pruned_html", None) - if not self.config.use_axtree: - obs.pop("axtree_txt", None) - if not self.config.use_screenshot: - obs.pop("screenshot", None) - if self.last_tool_call_id: - # add tool_call_id to obs for linking observation to the last executed action - obs["tool_call_id"] = self.last_tool_call_id return obs + def obs_to_messages(self, obs: dict) -> list[dict]: + """ + Convert the observation dictionary into a list of chat messages for Lite LLM + """ + messages = [] + if obs.get("goal_object") and not self.last_tool_call_id: + # its a first observation when there are no tool_call_id, so include goal + goal = obs["goal_object"][0]["text"] + messages.append({"role": "user", "content": f"## Goal:\n{goal}"}) + text_obs = [] + if result := obs.get("action_result"): + text_obs.append(f"## Action Result:\n{result}") + if error := obs.get("last_action_error"): + text_obs.append(f"## Action Error:\n{error}") + if self.config.use_html and (html := obs.get("pruned_html")): + text_obs.append(f"## HTML:\n{html}") + if self.config.use_axtree and (axtree := obs.get("axtree_txt")): + text_obs.append(f"## Accessibility Tree:\n{axtree}") + content = "\n\n".join(text_obs) + if content: + if self.last_tool_call_id: + message = { + "role": "tool", + "tool_call_id": self.last_tool_call_id, + "content": content, + } + else: + message = {"role": "user", "content": content} + messages.append(message) + if self.config.use_screenshot and (scr := obs.get("screenshot")): + if isinstance(scr, Image.Image): + image_content = [ + {"type": "image_url", "image_url": {"url": image_to_png_base64_url(scr)}} + ] + messages.append({"role": "user", "content": image_content}) + else: + raise ValueError( + f"Expected Image.Image in screenshot obs, got {type(obs['screenshot'])}" + ) + return messages + def get_action(self, obs: dict) -> tuple[ToolCallAction, dict]: - prev_actions = [step for step in self.history if isinstance(step, LLMOutput)] - if len(prev_actions) >= self.config.max_actions: + actions_count = len( + [msg for msg in self.history if isinstance(msg, Message) and msg.tool_calls] + ) + if actions_count >= self.config.max_actions: logger.warning("Max actions reached, stopping agent.") - stop_action = ToolCallAction(id="stop", function=FunctionCall(name="final_step", arguments={})) + stop_action = ToolCallAction( + id="stop", function=FunctionCall(name="final_step", arguments={}) + ) return stop_action, {} - self.history.append(Observation(data=obs)) - steps = self.history + [UserMessage(message=self.config.guidance)] - messages = [m for step in steps for m in step.to_messages()] + self.history += self.obs_to_messages(self.obs_preprocessor(obs)) tools = [tool.model_dump() for tool in self.action_set.actions] + messages = self.history + [{"role": "user", "content": self.config.guidance}] + try: logger.info(colored(f"Prompt:\n{pprint.pformat(messages, width=120)}", "blue")) - response: ModelResponse = self.llm( - tools=tools, - messages=messages, - num_retries=self.config.max_retry, - ) - message = response.choices[0].message # type: ignore + response = self.llm(tools=tools, messages=messages) + message = response.choices[0].message # type: ignore except Exception as e: logger.exception(f"Error getting LLM response: {e}. Prompt: {messages}") raise e logger.info(colored(f"LLM response:\n{pprint.pformat(message, width=120)}", "green")) - self.history.append(LLMOutput(message=message)) + + self.history.append(message) thoughts = self.thoughts_from_message(message) action = self.action_from_message(message) - return action, {"think": thoughts} def thoughts_from_message(self, message) -> str: @@ -187,7 +153,7 @@ def thoughts_from_message(self, message) -> str: logger.info(colored(f"LLM thinking block:\n{thinking}", "yellow")) thoughts.append(thinking) if message.content: - logger.info(colored(f"LLM output:\n{message.content}", "cyan")) + logger.info(colored(f"LLM text output:\n{message.content}", "cyan")) thoughts.append(message.content) return "\n\n".join(thoughts) @@ -199,27 +165,27 @@ def action_from_message(self, message) -> ToolCallAction: assert isinstance(tool_call.function.name, str) try: args = json.loads(tool_call.function.arguments) - action = ToolCallAction( - id=tool_call.id, - function=FunctionCall(name=tool_call.function.name, arguments=args) - ) except json.JSONDecodeError as e: - logger.exception(f"Error in json parsing of tool call arguments, {e}: {tool_call.function.arguments}") + logger.exception( + f"Error in json parsing of tool call arguments, {e}: {tool_call.function.arguments}" + ) raise e - + action = ToolCallAction( + id=tool_call.id, function=FunctionCall(name=tool_call.function.name, arguments=args) + ) self.last_tool_call_id = action.id + logger.info(f"Parsed tool call action: {action}") else: raise ValueError(f"No tool call found in LLM response: {message}") return action - + @dataclass class ReactToolCallAgentArgs(AgentArgs): - llm_args: LLMArgs = None # type: ignore - config: AgentConfig = None # type: ignore + llm_args: LLMArgs | None = None + config: AgentConfig | None = None def make_agent(self, actions: list[ToolSpec]) -> ReactToolCallAgent: llm = self.llm_args.make_model() action_set = ToolsActionSet(actions=actions) return ReactToolCallAgent(action_set=action_set, llm=llm, config=self.config) - diff --git a/src/agentlab/agents/tapeagent/conf/miniwob.yaml b/src/agentlab/agents/tapeagent/conf/miniwob.yaml index e7e00ecd..de8571c6 100644 --- a/src/agentlab/agents/tapeagent/conf/miniwob.yaml +++ b/src/agentlab/agents/tapeagent/conf/miniwob.yaml @@ -6,4 +6,4 @@ defaults: name: miniwob comment: MiniWob Agent parallel_backend: ray -n_jobs: 16 \ No newline at end of file +n_jobs: 8 \ No newline at end of file From cb6d213c4caebac896cd03503480bd04981a784d Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 17:31:24 +0000 Subject: [PATCH 43/61] fix --- src/agentlab/agents/react_toolcall_agent.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index ab5d36d8..7101873f 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -43,7 +43,6 @@ class AgentConfig: use_axtree: bool = False use_screenshot: bool = True max_actions: int = 10 - max_retry: int = 4 system_prompt: str = """ You are an expert AI Agent trained to assist users with complex web tasks. Your role is to understand the goal, perform actions until the goal is accomplished and respond in a helpful and accurate manner. From cc2389356bf0c8bc7878e7de89ef103411c2edf8 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 17:32:54 +0000 Subject: [PATCH 44/61] fix --- src/agentlab/agents/react_toolcall_agent.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index 7101873f..8e45c417 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -1,13 +1,12 @@ import json import logging import pprint -import time from dataclasses import dataclass from functools import partial from typing import Callable, Literal from litellm import completion -from litellm.types.utils import ChatCompletionMessageToolCall, Message, ModelResponse +from litellm.types.utils import Message, ModelResponse from PIL import Image from termcolor import colored @@ -160,15 +159,8 @@ def action_from_message(self, message) -> ToolCallAction: if message.tool_calls: if len(message.tool_calls) > 1: logger.warning("Multiple tool calls found in LLM response, using the first one.") - tool_call: ChatCompletionMessageToolCall = message.tool_calls[0] - assert isinstance(tool_call.function.name, str) - try: - args = json.loads(tool_call.function.arguments) - except json.JSONDecodeError as e: - logger.exception( - f"Error in json parsing of tool call arguments, {e}: {tool_call.function.arguments}" - ) - raise e + tool_call = message.tool_calls[0] + args = json.loads(tool_call.function.arguments) action = ToolCallAction( id=tool_call.id, function=FunctionCall(name=tool_call.function.name, arguments=args) ) From b8e5c3a8af2efcfee74fd5c50bd33ea700823f28 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 17:52:50 +0000 Subject: [PATCH 45/61] simpler tool call object --- src/agentlab/actions.py | 26 ++++++------------- src/agentlab/agents/react_toolcall_agent.py | 15 +++++------ src/agentlab/backends/browser/__init__.py | 4 --- src/agentlab/backends/browser/base.py | 4 +-- src/agentlab/backends/browser/env.py | 10 +++---- src/agentlab/backends/browser/mcp.py | 9 +++---- .../backends/browser/mcp_playwright.py | 6 ++--- src/agentlab/backends/browser/playwright.py | 12 ++++----- 8 files changed, 34 insertions(+), 52 deletions(-) diff --git a/src/agentlab/actions.py b/src/agentlab/actions.py index 60d21e11..fe6b5b66 100644 --- a/src/agentlab/actions.py +++ b/src/agentlab/actions.py @@ -1,10 +1,11 @@ import json import logging -from typing import Any, Callable, Literal +from typing import Callable, Literal +from uuid import uuid4 from bgym import AbstractActionSet from langchain_core.utils.function_calling import convert_to_openai_tool -from pydantic import BaseModel +from pydantic import BaseModel, Field from agentlab.llm.llm_utils import parse_html_tags_raise @@ -26,22 +27,11 @@ class FunctionSpec(BaseModel): parameters: dict -class FunctionCall(BaseModel): - """ - A class representing a function call. - - Attributes: - name (str): The name of the function being called. - arguments (Any): The arguments to be passed to the function. - """ +class ToolCall(BaseModel): + id: str = Field(default_factory=lambda: uuid4().hex) name: str - arguments: Any - - -class ToolCallAction(BaseModel): - id: str = "" - function: FunctionCall + arguments: dict = Field(default_factory=dict) def llm_view(self, **kwargs) -> str: return self.model_dump_json(indent=2) @@ -111,7 +101,7 @@ def example_action(self, abstract: bool) -> str: }""" @classmethod - def parse_action(cls, llm_output: str) -> ToolCallAction: + def parse_action(cls, llm_output: str) -> ToolCall: logger.info(f"Parsing action: {llm_output}") if "" in llm_output: content_dict, valid, retry_message = parse_html_tags_raise(llm_output, keys=["action"]) @@ -124,7 +114,7 @@ def parse_action(cls, llm_output: str) -> ToolCallAction: action_dict = json.loads(action_str) except json.JSONDecodeError: raise ValueError(f"Failed to parse action: {action_str}") - return ToolCallAction(function=FunctionCall(name=action_dict["name"], arguments=action_dict["arguments"])) + return ToolCall(name=action_dict["name"], arguments=action_dict["arguments"]) def to_python_code(self, action) -> str: return action diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index 8e45c417..d540522f 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -10,7 +10,7 @@ from PIL import Image from termcolor import colored -from agentlab.actions import FunctionCall, ToolCallAction, ToolsActionSet, ToolSpec +from agentlab.actions import ToolCall, ToolsActionSet, ToolSpec from agentlab.agents.agent_args import AgentArgs from agentlab.llm.chat_api import BaseModelArgs from agentlab.llm.llm_utils import image_to_png_base64_url @@ -112,15 +112,13 @@ def obs_to_messages(self, obs: dict) -> list[dict]: ) return messages - def get_action(self, obs: dict) -> tuple[ToolCallAction, dict]: + def get_action(self, obs: dict) -> tuple[ToolCall, dict]: actions_count = len( [msg for msg in self.history if isinstance(msg, Message) and msg.tool_calls] ) if actions_count >= self.config.max_actions: logger.warning("Max actions reached, stopping agent.") - stop_action = ToolCallAction( - id="stop", function=FunctionCall(name="final_step", arguments={}) - ) + stop_action = ToolCall(name="final_step") return stop_action, {} self.history += self.obs_to_messages(self.obs_preprocessor(obs)) tools = [tool.model_dump() for tool in self.action_set.actions] @@ -155,15 +153,14 @@ def thoughts_from_message(self, message) -> str: thoughts.append(message.content) return "\n\n".join(thoughts) - def action_from_message(self, message) -> ToolCallAction: + def action_from_message(self, message) -> ToolCall: if message.tool_calls: if len(message.tool_calls) > 1: logger.warning("Multiple tool calls found in LLM response, using the first one.") tool_call = message.tool_calls[0] + name = tool_call.function.name args = json.loads(tool_call.function.arguments) - action = ToolCallAction( - id=tool_call.id, function=FunctionCall(name=tool_call.function.name, arguments=args) - ) + action = ToolCall(id=tool_call.id, name=name, arguments=args) self.last_tool_call_id = action.id logger.info(f"Parsed tool call action: {action}") else: diff --git a/src/agentlab/backends/browser/__init__.py b/src/agentlab/backends/browser/__init__.py index bed8c2b4..1e936c6c 100644 --- a/src/agentlab/backends/browser/__init__.py +++ b/src/agentlab/backends/browser/__init__.py @@ -1,4 +1,3 @@ -from agentlab.actions import FunctionCall, ToolCallAction, ToolSpec from agentlab.backends.browser.base import BrowserBackend from agentlab.backends.browser.env import BrowserEnv, BrowserEnvArgs from agentlab.backends.browser.mcp import MCPBrowserBackend, MCPClient @@ -7,9 +6,6 @@ __all__ = [ "BrowserBackend", - "FunctionCall", - "ToolCallAction", - "ToolSpec", "BrowserEnv", "BrowserEnvArgs", "MCPBrowserBackend", diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index 33d5da21..aa7c023b 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -4,7 +4,7 @@ from PIL import Image from pydantic import BaseModel -from agentlab.actions import ToolCallAction, ToolSpec +from agentlab.actions import ToolCall, ToolSpec logger = logging.getLogger(__name__) @@ -35,7 +35,7 @@ def page_axtree(self) -> str: pass @abstractmethod - def step(self, action: ToolCallAction) -> str: + def step(self, action: ToolCall) -> dict: pass @abstractmethod diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 7dbd630d..d1c7464d 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -3,7 +3,7 @@ from dataclasses import dataclass from pathlib import Path -from agentlab.actions import ToolCallAction, ToolsActionSet, ToolSpec +from agentlab.actions import ToolCall, ToolsActionSet, ToolSpec from agentlab.backends.browser.base import BrowserBackend from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs from agentlab.benchmarks.web_task import AbstractWebTask @@ -52,13 +52,13 @@ def reset(self, seed: int): obs = self.task.obs_postprocess(obs) return obs, {} - def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, dict]: + def step(self, action: ToolCall | str) -> tuple[dict, float, bool, bool, dict]: if isinstance(action, str): action = ToolsActionSet.parse_action(action) logger.info(f"BrowserEnv.step() called with action {action}") action_exec_start = time.time() - finished = action.function.name == "final_step" + finished = action.name == "final_step" if finished: observation = { "goal_object": [{"type": "text", "text": self.goal}], @@ -91,7 +91,7 @@ def step(self, action: ToolCallAction | str) -> tuple[dict, float, bool, bool, d logger.info(f"Action result in observation: {observation}") return observation, reward, finished, truncated, env_info - def _step(self, action: ToolCallAction) -> dict: + def _step(self, action: ToolCall) -> dict: obs_dict = self.backend.step(action) if "goal_object" not in obs_dict: obs_dict["goal_object"] = [{"type": "text", "text": self.goal}] @@ -101,7 +101,7 @@ def _step(self, action: ToolCallAction) -> dict: obs_dict["focused_element_bid"] = "none" return obs_dict - def validate_task(self, action: ToolCallAction, observation: dict) -> tuple[float, dict]: + def validate_task(self, action: ToolCall, observation: dict) -> tuple[float, dict]: validate_js = self.task.get_step_validate_js() validate_result = self.backend.run_js(validate_js) reward, other = self.task.parse_validation_result(validate_result) diff --git a/src/agentlab/backends/browser/mcp.py b/src/agentlab/backends/browser/mcp.py index 57cffbb7..428dd9cb 100644 --- a/src/agentlab/backends/browser/mcp.py +++ b/src/agentlab/backends/browser/mcp.py @@ -10,7 +10,7 @@ from mcp import Tool as MCPTool from mcp.types import CallToolResult, ImageContent, TextContent -from agentlab.actions import FunctionSpec, ToolCallAction, ToolSpec +from agentlab.actions import FunctionSpec, ToolCall, ToolSpec from agentlab.backends.browser.base import BrowserBackend logger = logging.getLogger(__name__) @@ -150,13 +150,12 @@ def initialize(self) -> None: self._mcp = MCPClient(config_path=self.config_path) self._mcp.initialize() - def step(self, action: ToolCallAction) -> dict: - contents = self.call_tool(action.function.name, action.function.arguments) + def step(self, action: ToolCall) -> dict: + contents = self.call_tool(action.name, action.arguments) text = "\n".join([c.text for c in contents if c.type == "text"]) images = [c for c in contents if c.type == "image"] return { - "pruned_html": text, - "axtree_txt": text, + "text": text, "screenshot": images[-1] if images else None, } diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 18a96daa..ab4ddc62 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -4,7 +4,7 @@ from PIL import Image -from agentlab.actions import ToolCallAction +from agentlab.actions import ToolCall from agentlab.backends.browser.mcp import MCPBrowserBackend logger = logging.getLogger(__name__) @@ -27,8 +27,8 @@ def run_js(self, js: str): raise e return result_str - def step(self, action: ToolCallAction) -> dict: - contents = self.call_tool(action.function.name, action.function.arguments) + def step(self, action: ToolCall) -> dict: + contents = self.call_tool(action.name, action.arguments) logger.info(f"Step result has {len(contents)} contents") tool_result = "\n".join( [c.text for c in contents if c.type == "text" and "# Ran Playwright code" not in c.text] diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index f834b502..f1485e4f 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -6,7 +6,7 @@ from PIL import Image from playwright.async_api import Browser, Page, async_playwright -from agentlab.actions import ToolCallAction, ToolSpec +from agentlab.actions import ToolCall, ToolSpec from agentlab.backends.browser.base import BrowserBackend logger = logging.getLogger(__name__) @@ -106,13 +106,13 @@ def page_axtree(self): flat_axtree = flatten_axtree(axtree) return flat_axtree - def step(self, action: ToolCallAction): - fn = self._actions[action.function.name] + def step(self, action: ToolCall): + fn = self._actions[action.name] try: - action_result = self._loop.run_until_complete(fn(**action.function.arguments)) + action_result = self._loop.run_until_complete(fn(**action.arguments)) except Exception as e: - logger.error(f"Error executing action {action.function.name}: {e}") - action_result = f"Error executing action {action.function.name}: {e}" + action_result = f"Error executing action {action.name}: {e}" + logger.error(action_result) html = self.page_html() screenshot = self.page_screenshot() axtree = self.page_axtree() From 3d88daf77f183c401b1e25afe4375b858fc5b543 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 18:06:12 +0000 Subject: [PATCH 46/61] format --- src/agentlab/actions.py | 5 +++-- src/agentlab/agents/react_toolcall_agent.py | 6 +++--- src/agentlab/backends/browser/env.py | 1 + src/agentlab/backends/browser/mcp_playwright.py | 4 +++- src/agentlab/experiments/loop.py | 6 +++++- 5 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/agentlab/actions.py b/src/agentlab/actions.py index fe6b5b66..d261bb48 100644 --- a/src/agentlab/actions.py +++ b/src/agentlab/actions.py @@ -27,7 +27,6 @@ class FunctionSpec(BaseModel): parameters: dict - class ToolCall(BaseModel): id: str = Field(default_factory=lambda: uuid4().hex) name: str @@ -106,7 +105,9 @@ def parse_action(cls, llm_output: str) -> ToolCall: if "" in llm_output: content_dict, valid, retry_message = parse_html_tags_raise(llm_output, keys=["action"]) if not valid or "action" not in content_dict: - raise ValueError(f"Invalid action: llm_output: {llm_output}, retry_message: {retry_message}") + raise ValueError( + f"Invalid action: llm_output: {llm_output}, retry_message: {retry_message}" + ) action_str = content_dict["action"] else: action_str = llm_output diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index d540522f..14c855d3 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -138,7 +138,7 @@ def get_action(self, obs: dict) -> tuple[ToolCall, dict]: action = self.action_from_message(message) return action, {"think": thoughts} - def thoughts_from_message(self, message) -> str: + def thoughts_from_message(self, message: Message) -> str: thoughts = [] if reasoning := message.get("reasoning_content"): logger.info(colored(f"LLM reasoning:\n{reasoning}", "yellow")) @@ -153,7 +153,7 @@ def thoughts_from_message(self, message) -> str: thoughts.append(message.content) return "\n\n".join(thoughts) - def action_from_message(self, message) -> ToolCall: + def action_from_message(self, message: Message) -> ToolCall: if message.tool_calls: if len(message.tool_calls) > 1: logger.warning("Multiple tool calls found in LLM response, using the first one.") @@ -162,7 +162,7 @@ def action_from_message(self, message) -> ToolCall: args = json.loads(tool_call.function.arguments) action = ToolCall(id=tool_call.id, name=name, arguments=args) self.last_tool_call_id = action.id - logger.info(f"Parsed tool call action: {action}") + logger.info(colored(f"Parsed tool call: {action}", "magenta")) else: raise ValueError(f"No tool call found in LLM response: {message}") return action diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index d1c7464d..6336410b 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -17,6 +17,7 @@ def final_step(): """ pass + class BrowserEnv(AbstractEnv): def __init__( self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0 diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index ab4ddc62..69eddc7c 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -44,7 +44,9 @@ def step(self, action: ToolCall) -> dict: } def page_html(self) -> str: - contents = self.call_tool("browser_evaluate", {"function": "document.documentElement.outerHTML"}) + contents = self.call_tool( + "browser_evaluate", {"function": "document.documentElement.outerHTML"} + ) raw_response = "\n".join([c.text for c in contents if c.type == "text"]) try: _, half_response = raw_response.split("### Result", maxsplit=1) diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py index 8fb32005..4a8597dc 100644 --- a/src/agentlab/experiments/loop.py +++ b/src/agentlab/experiments/loop.py @@ -412,7 +412,11 @@ def run(self): logger.debug(f"Starting step {step_info.step}.") step_info.profiling.agent_start = time.time() action, step_info.agent_info = agent.get_action(step_info.obs.copy()) - step_info.action = action.model_dump_json(indent=2) if isinstance(action, BaseModel) else str(action) + step_info.action = ( + action.model_dump_json(indent=2) + if isinstance(action, BaseModel) + else str(action) + ) step_info.profiling.agent_stop = time.time() if step_info.agent_info.get("think", None): logger.info(f"Agent thought: {step_info.agent_info['think']}") From 768d37c2779db9a353e3b8b7bbf4b17e650b0b91 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 18:52:25 +0000 Subject: [PATCH 47/61] history compaction --- src/agentlab/agents/react_toolcall_agent.py | 83 ++++++++++++++++----- 1 file changed, 66 insertions(+), 17 deletions(-) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index 14c855d3..3dee7f34 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -5,7 +5,7 @@ from functools import partial from typing import Callable, Literal -from litellm import completion +from litellm import completion, token_counter from litellm.types.utils import Message, ModelResponse from PIL import Image from termcolor import colored @@ -42,29 +42,43 @@ class AgentConfig: use_axtree: bool = False use_screenshot: bool = True max_actions: int = 10 + max_history_tokens: int = 120000 system_prompt: str = """ You are an expert AI Agent trained to assist users with complex web tasks. Your role is to understand the goal, perform actions until the goal is accomplished and respond in a helpful and accurate manner. Keep your replies brief, concise, direct and on topic. Prioritize clarity and avoid over-elaboration. -Do not express emotions or opinions. -""" +Do not express emotions or opinions.""" guidance: str = """ Think along the following lines: 1. Summarize the last observation and describe the visible changes in the state. 2. Evaluate action success, explain impact on task and next steps. 3. If you see any errors in the last observation, think about it. If there is no error, just move on. 4. List next steps to move towards the goal and propose next immediate action. -Then produce the single function call that performs the proposed action. If the task is complete, produce the final step. -""" +Then produce the single function call that performs the proposed action. If the task is complete, produce the final step.""" + summarize_system_prompt: str = """ +You are a helpful assistant that summarizes conversation history. Following messages is the history to summarize:""" + summarize_prompt: str = """ +Summarize the presented agent interaction history concisely. +Focus on: +- The original goal +- Key actions taken and their outcomes +- Important errors or obstacles encountered +- Current progress toward the goal +Provide a concise summary that preserves all information needed to continue the task.""" class ReactToolCallAgent: def __init__( - self, action_set: ToolsActionSet, llm: Callable[..., ModelResponse], config: AgentConfig + self, + action_set: ToolsActionSet, + llm: Callable[..., ModelResponse], + token_counter: Callable[..., int], + config: AgentConfig, ): self.action_set = action_set self.history: list[dict | Message] = [{"role": "system", "content": config.system_prompt}] self.llm = llm + self.token_counter = token_counter self.config = config self.last_tool_call_id: str = "" @@ -113,14 +127,12 @@ def obs_to_messages(self, obs: dict) -> list[dict]: return messages def get_action(self, obs: dict) -> tuple[ToolCall, dict]: - actions_count = len( - [msg for msg in self.history if isinstance(msg, Message) and msg.tool_calls] - ) - if actions_count >= self.config.max_actions: + if self.max_actions_reached(): logger.warning("Max actions reached, stopping agent.") - stop_action = ToolCall(name="final_step") - return stop_action, {} + return ToolCall(name="final_step"), {} + self.history += self.obs_to_messages(self.obs_preprocessor(obs)) + self.maybe_compact_history() tools = [tool.model_dump() for tool in self.action_set.actions] messages = self.history + [{"role": "user", "content": self.config.guidance}] @@ -136,21 +148,23 @@ def get_action(self, obs: dict) -> tuple[ToolCall, dict]: self.history.append(message) thoughts = self.thoughts_from_message(message) action = self.action_from_message(message) - return action, {"think": thoughts} + return action, {"think": thoughts, "chat_messages": self.history} + + def max_actions_reached(self) -> bool: + prev_actions = [msg for msg in self.history if isinstance(msg, Message) and msg.tool_calls] + return len(prev_actions) >= self.config.max_actions def thoughts_from_message(self, message: Message) -> str: thoughts = [] if reasoning := message.get("reasoning_content"): - logger.info(colored(f"LLM reasoning:\n{reasoning}", "yellow")) thoughts.append(reasoning) if blocks := message.get("thinking_blocks"): for block in blocks: if thinking := getattr(block, "content", None) or getattr(block, "thinking", None): - logger.info(colored(f"LLM thinking block:\n{thinking}", "yellow")) thoughts.append(thinking) if message.content: - logger.info(colored(f"LLM text output:\n{message.content}", "cyan")) thoughts.append(message.content) + logger.info(colored(f"LLM thoughts: {thoughts}", "cyan")) return "\n\n".join(thoughts) def action_from_message(self, message: Message) -> ToolCall: @@ -167,6 +181,40 @@ def action_from_message(self, message: Message) -> ToolCall: raise ValueError(f"No tool call found in LLM response: {message}") return action + def maybe_compact_history(self): + tokens = self.token_counter(messages=self.history) + if tokens > self.config.max_history_tokens: + logger.info("Compacting history due to length.") + self.compact_history() + short_tokens = self.token_counter(messages=self.history) + logger.info(f"Compacted history from {tokens} to {short_tokens} tokens.") + + def compact_history(self): + """ + Compact the history by summarizing the first half of messages with the LLM. + Updates self.history in place by replacing the first half with the summary message. + """ + system_msg = self.history[0] + rest = self.history[1:] + midpoint = len(rest) // 2 + messages = [ + {"role": "system", "content": self.config.summarize_system_prompt}, + *rest[:midpoint], + {"role": "user", "content": self.config.summarize_prompt}, + ] + + try: + response = self.llm(messages=messages, tool_choice="none") + summary = response.choices[0].message.content # type: ignore + except Exception as e: + logger.exception(f"Error compacting history: {e}") + raise + + logger.info(colored(f"Compacted {midpoint} messages into summary:\n{summary}", "cyan")) + # Rebuild history: system + summary + remaining messages + summary_message = {"role": "user", "content": f"## Previous Interaction :\n{summary}"} + self.history = [system_msg, summary_message, *rest[midpoint:]] + @dataclass class ReactToolCallAgentArgs(AgentArgs): @@ -175,5 +223,6 @@ class ReactToolCallAgentArgs(AgentArgs): def make_agent(self, actions: list[ToolSpec]) -> ReactToolCallAgent: llm = self.llm_args.make_model() + counter = partial(token_counter, model=self.llm_args.model_name) action_set = ToolsActionSet(actions=actions) - return ReactToolCallAgent(action_set=action_set, llm=llm, config=self.config) + return ReactToolCallAgent(action_set, llm, counter, self.config) From e28eb0f20ddf63c5a430797b0edcf32caeaa07e8 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Tue, 25 Nov 2025 19:16:00 +0000 Subject: [PATCH 48/61] tool schemas in the action module --- src/agentlab/actions.py | 4 ++++ src/agentlab/agents/react_toolcall_agent.py | 10 ++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/agentlab/actions.py b/src/agentlab/actions.py index d261bb48..a0dd8d10 100644 --- a/src/agentlab/actions.py +++ b/src/agentlab/actions.py @@ -119,3 +119,7 @@ def parse_action(cls, llm_output: str) -> ToolCall: def to_python_code(self, action) -> str: return action + + def tools(self) -> list[dict]: + """Returns the list of tool spec dicts for LLM consumption.""" + return [tool.model_dump() for tool in self.actions] diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index 3dee7f34..c500fe49 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -56,7 +56,7 @@ class AgentConfig: 4. List next steps to move towards the goal and propose next immediate action. Then produce the single function call that performs the proposed action. If the task is complete, produce the final step.""" summarize_system_prompt: str = """ -You are a helpful assistant that summarizes conversation history. Following messages is the history to summarize:""" +You are a helpful assistant that summarizes agent interaction history. Following messages is the history to summarize:""" summarize_prompt: str = """ Summarize the presented agent interaction history concisely. Focus on: @@ -76,6 +76,7 @@ def __init__( config: AgentConfig, ): self.action_set = action_set + self.tools = self.action_set.tools() self.history: list[dict | Message] = [{"role": "system", "content": config.system_prompt}] self.llm = llm self.token_counter = token_counter @@ -131,14 +132,13 @@ def get_action(self, obs: dict) -> tuple[ToolCall, dict]: logger.warning("Max actions reached, stopping agent.") return ToolCall(name="final_step"), {} - self.history += self.obs_to_messages(self.obs_preprocessor(obs)) + self.history += self.obs_to_messages(obs) self.maybe_compact_history() - tools = [tool.model_dump() for tool in self.action_set.actions] messages = self.history + [{"role": "user", "content": self.config.guidance}] try: logger.info(colored(f"Prompt:\n{pprint.pformat(messages, width=120)}", "blue")) - response = self.llm(tools=tools, messages=messages) + response = self.llm(tools=self.tools, messages=messages) message = response.choices[0].message # type: ignore except Exception as e: logger.exception(f"Error getting LLM response: {e}. Prompt: {messages}") @@ -155,6 +155,7 @@ def max_actions_reached(self) -> bool: return len(prev_actions) >= self.config.max_actions def thoughts_from_message(self, message: Message) -> str: + """Extract the agent's thoughts from the LLM message.""" thoughts = [] if reasoning := message.get("reasoning_content"): thoughts.append(reasoning) @@ -168,6 +169,7 @@ def thoughts_from_message(self, message: Message) -> str: return "\n\n".join(thoughts) def action_from_message(self, message: Message) -> ToolCall: + """Parse the ToolCall from the LLM message.""" if message.tool_calls: if len(message.tool_calls) > 1: logger.warning("Multiple tool calls found in LLM response, using the first one.") From f10615faaebe6c55705246c0d63a47f61c8d0c17 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 15:53:58 +0000 Subject: [PATCH 49/61] better task interface, support old bgym tasks in the new env --- src/agentlab/backends/browser/__init__.py | 6 +- src/agentlab/backends/browser/base.py | 63 ++++++++++- src/agentlab/backends/browser/env.py | 127 +++++++++++----------- src/agentlab/benchmarks/miniwob/task.py | 77 ++++++++++--- src/agentlab/benchmarks/web_task.py | 62 ++++++++--- 5 files changed, 237 insertions(+), 98 deletions(-) diff --git a/src/agentlab/backends/browser/__init__.py b/src/agentlab/backends/browser/__init__.py index 1e936c6c..9fc3f071 100644 --- a/src/agentlab/backends/browser/__init__.py +++ b/src/agentlab/backends/browser/__init__.py @@ -1,15 +1,17 @@ -from agentlab.backends.browser.base import BrowserBackend +from agentlab.backends.browser.base import AsyncBrowserBackend, BrowserBackend from agentlab.backends.browser.env import BrowserEnv, BrowserEnvArgs from agentlab.backends.browser.mcp import MCPBrowserBackend, MCPClient from agentlab.backends.browser.mcp_playwright import MCPPlaywright -from agentlab.backends.browser.playwright import AsyncPlaywright +from agentlab.backends.browser.playwright import AsyncPlaywright, SyncPlaywright __all__ = [ "BrowserBackend", + "AsyncBrowserBackend", "BrowserEnv", "BrowserEnvArgs", "MCPBrowserBackend", "MCPClient", "MCPPlaywright", "AsyncPlaywright", + "SyncPlaywright", ] diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py index aa7c023b..d11d0d5f 100644 --- a/src/agentlab/backends/browser/base.py +++ b/src/agentlab/backends/browser/base.py @@ -10,13 +10,15 @@ class BrowserBackend(BaseModel, ABC): + has_pw_page: bool = False + @abstractmethod def initialize(self) -> None: pass @abstractmethod - def run_js(self, js: str): - pass + def evaluate_js(self, js: str) -> str | dict | list: + return "" @abstractmethod def goto(self, url: str) -> str: @@ -27,7 +29,7 @@ def page_html(self) -> str: pass @abstractmethod - def page_screenshot(self) -> Image: + def page_screenshot(self) -> Image.Image: pass @abstractmethod @@ -39,9 +41,62 @@ def step(self, action: ToolCall) -> dict: pass @abstractmethod - def actions(self) -> tuple[ToolSpec]: + def actions(self) -> list[ToolSpec]: pass @abstractmethod def close(self) -> None: pass + + @property + def page(self): + raise NotImplementedError("Direct access to the playwright page is not supported.") + + +class AsyncBrowserBackend(BaseModel): + """Abstract base class for async browser backends.""" + + has_pw_page: bool = False + + class Config: + arbitrary_types_allowed = True + + @abstractmethod + async def initialize(self) -> None: + pass + + @abstractmethod + async def evaluate_js(self, js: str) -> str | dict | list: + pass + + @abstractmethod + async def goto(self, url: str) -> None: + pass + + @abstractmethod + async def page_html(self) -> str: + pass + + @abstractmethod + async def page_screenshot(self) -> Image.Image: + pass + + @abstractmethod + async def page_axtree(self) -> str: + pass + + @abstractmethod + async def step(self, action: ToolCall) -> dict: + pass + + @abstractmethod + def actions(self) -> list[ToolSpec]: + pass + + @abstractmethod + async def close(self) -> None: + pass + + @property + def page(self): + raise NotImplementedError("Direct access to the playwright page is not supported.") diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 6336410b..8059af09 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -3,6 +3,8 @@ from dataclasses import dataclass from pathlib import Path +from browsergym.core.task import AbstractBrowserTask + from agentlab.actions import ToolCall, ToolsActionSet, ToolSpec from agentlab.backends.browser.base import BrowserBackend from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs @@ -15,30 +17,42 @@ def final_step(): """ Finish the task execution. """ - pass + return { + "pruned_html": "Task finished", + "axtree_txt": "", + "last_action_error": "", + "focused_element_bid": "none", + } class BrowserEnv(AbstractEnv): def __init__( - self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0 + self, task_name: str, task: AbstractWebTask | AbstractBrowserTask, backend: BrowserBackend, seed: int = 0 ): self.task_name = task_name self.task = task self.seed = seed self._turns = 0 - self.max_turns = task.max_turns self.backend = backend self.backend.initialize() self.goal = "" + if isinstance(self.task, AbstractBrowserTask) and not self.backend.has_pw_page: + raise ValueError( + "Legacy task requires a backend with direct playwright page access." + ) def reset(self, seed: int): self.seed = seed - logger.info(f"Open task URL: {self.task.url}") - self.backend.goto(self.task.url) - setup_js = self.task.get_setup_js() - if setup_js: - self.goal = self.task.parse_setup_result(self.backend.run_js(setup_js)) - logger.info(f"Task goal: {self.goal}") + if isinstance(self.task, AbstractBrowserTask): + self.goal, task_info = self.task.setup(page=self.backend.page) + obs = self._get_obs() + else: + self.goal, task_info = self.task.setup(backend=self.backend) + obs = self._get_obs() + obs = self.task.obs_postprocess(obs) + return obs, task_info + + def _get_obs(self) -> dict: html = self.backend.page_html() screenshot = self.backend.page_screenshot() axtree = self.backend.page_axtree() @@ -50,8 +64,7 @@ def reset(self, seed: int): "last_action_error": "", "focused_element_bid": "none", } - obs = self.task.obs_postprocess(obs) - return obs, {} + return obs def step(self, action: ToolCall | str) -> tuple[dict, float, bool, bool, dict]: if isinstance(action, str): @@ -59,71 +72,64 @@ def step(self, action: ToolCall | str) -> tuple[dict, float, bool, bool, dict]: logger.info(f"BrowserEnv.step() called with action {action}") action_exec_start = time.time() - finished = action.name == "final_step" - if finished: - observation = { - "goal_object": [{"type": "text", "text": self.goal}], - "pruned_html": "Task finished", - "axtree_txt": "", - "last_action_error": "", - "focused_element_bid": "none", - } + done = action.name == "final_step" + if done: + observation = final_step() else: - observation = self._step(action) - observation = self.task.obs_postprocess(observation) - + observation = self.backend.step(action) action_exec_stop = time.time() self._turns += 1 - truncated = self._turns >= self.max_turns + if isinstance(self.task, AbstractWebTask): + truncated = self._turns >= self.task.max_turns + else: + truncated = False - if self.task.validate_per_step or finished or truncated: - reward, other = self.validate_task(action, observation) - if other.get("done", False): - finished = True + observation = self.obs_postprocess(observation) + + if isinstance(self.task, AbstractBrowserTask): + reward, done, _, info = self.task.validate(page=self.backend.page, chat_messages=[]) + elif self.task.validate_per_step or done or truncated: + reward, info = self.task.validate() + if info.get("done", False): + done = True else: reward = 0.0 - other = {} + info = {} env_info = { + **info, "action_exec_start": action_exec_start, "action_exec_stop": action_exec_stop, - "action_exec_timeout": 0.0, - } | other + "action_exec_timeout": 0.0 + } logger.info(f"Action result in observation: {observation}") - return observation, reward, finished, truncated, env_info - - def _step(self, action: ToolCall) -> dict: - obs_dict = self.backend.step(action) - if "goal_object" not in obs_dict: - obs_dict["goal_object"] = [{"type": "text", "text": self.goal}] - if "last_action_error" not in obs_dict: - obs_dict["last_action_error"] = "" - if "focused_element_bid" not in obs_dict: - obs_dict["focused_element_bid"] = "none" - return obs_dict - - def validate_task(self, action: ToolCall, observation: dict) -> tuple[float, dict]: - validate_js = self.task.get_step_validate_js() - validate_result = self.backend.run_js(validate_js) - reward, other = self.task.parse_validation_result(validate_result) - return reward, other + return observation, reward, done, truncated, env_info + + def obs_postprocess(self, obs: dict) -> dict: + if "goal_object" not in obs: + obs["goal_object"] = [{"type": "text", "text": self.goal}] + if "last_action_error" not in obs: + obs["last_action_error"] = "" + if "focused_element_bid" not in obs: + obs["focused_element_bid"] = "none" + if isinstance(self.task, AbstractWebTask): + obs = self.task.obs_postprocess(obs) + return obs def close(self): - teardown_js = self.task.get_teardown_js() - if teardown_js: - js_result_str = self.backend.run_js(teardown_js) - logger.info(f"Task teardown result: {js_result_str}") - self.backend.close() + self.task.teardown() def actions(self) -> list[ToolSpec]: all_actions = self.backend.actions() - filtered_actions = self.task.filter_actions(all_actions) - logger.info( - f"Filtered {len(filtered_actions)} actions out of {len(all_actions)} for task {self.task.dataset}" - ) + if isinstance(self.task, AbstractWebTask): + filtered_actions = self.task.filter_actions(all_actions) + logger.info( + f"Filtered {len(filtered_actions)} actions out of {len(all_actions)} for dataset {self.task.dataset}" + ) + else: + filtered_actions = all_actions final_step_action = ToolSpec.from_function(final_step) - filtered_actions.append(final_step_action) - return filtered_actions + return filtered_actions + [final_step_action] @dataclass @@ -135,12 +141,11 @@ class BrowserEnvArgs(AbstractEnvArgs): def __init__( self, - task_name: str, task: AbstractWebTask, backend_cls: type[BrowserBackend], task_seed: int = 0, ): - self.task_name = task_name + self.task_name = f"{task.dataset}.{task.task_id}" self.task = task self.task_seed = task_seed self.backend_cls = backend_cls diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index 36d5e34e..80ffa12a 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -3,8 +3,8 @@ from typing import Any, ClassVar from browsergym.miniwob import ALL_MINIWOB_TASKS -from PIL import Image +from agentlab.backends.browser import BrowserBackend from agentlab.benchmarks.web_task import AbstractWebTask logger = logging.getLogger(__name__) @@ -12,11 +12,10 @@ class MiniWobTask(AbstractWebTask): dataset: str = "miniwob" - task_id: str desc: str subdomain: str - base_url: str = None - url: str = None + base_url: str = None # type: ignore + url: str = None # type: ignore remove_human_display: bool = True episode_max_time: int = 1000000 max_turns: int = 10 @@ -36,7 +35,54 @@ def model_post_init(self, __context: Any): self.base_url = self.base_url[:-1] self.url = f"{self.base_url}/{self.subdomain}.html" - def get_setup_js(self) -> str: + + def setup(self, backend: BrowserBackend) -> tuple[str, dict]: + """ + Set up everything needed to execute the task. + + Args: + page: the active playwright page. + + Returns: + goal: str, goal of the task. + info: dict, custom information from the task. + """ + backend.goto(self.url) + setup_js = self._get_setup_js() + setup_result = backend.evaluate_js(setup_js) + goal, info = self._parse_setup_result(setup_result) + self._backend = backend + return goal, info + + def teardown(self) -> None: + """ + Tear down the task, clean up resources if needed. + + Args: + page: the active playwright page. + """ + teardown_js = self._get_teardown_js() + if teardown_js: + self._backend.evaluate_js(teardown_js) + + def validate(self) -> tuple[float, dict]: + """ + Validate the task, either per step or at the end. + + Returns: + reward: float, the reward obtained. + info: dict, custom information from the validation. + """ + validate_js = ( + self._get_step_validate_js() + if self.validate_per_step + else self._get_task_validate_js() + ) + validate_result = self._backend.evaluate_js(validate_js) + reward, info = self._parse_validation_result(validate_result) + return reward, info + + def _get_setup_js(self) -> str: if self.remove_human_display: logger.info("Remove human display") js = r""" @@ -107,29 +153,33 @@ def get_setup_js(self) -> str: """ return f"async () => {{{js}}}" - def parse_setup_result(self, setup_result: str | dict | list) -> str: + def _parse_setup_result(self, setup_result: str | dict | list) -> tuple[str, dict]: if isinstance(setup_result, dict): - return setup_result["utterance"] + return setup_result["utterance"], {} + elif isinstance(setup_result, str): + return setup_result, {} else: - return setup_result + raise ValueError(f"Unexpected setup_result type: {type(setup_result)}") - def get_teardown_js(self) -> str: + def _get_teardown_js(self) -> str: return "" - def get_step_validate_js(self) -> str: + def _get_step_validate_js(self) -> str: return """() => { return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY]; }""" - def get_task_validate_js(self) -> str: + def _get_task_validate_js(self) -> str: return """() => { return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY]; }""" - def parse_validation_result(self, validation_result: str | list) -> tuple[float, dict]: + def _parse_validation_result(self, validation_result: str | dict | list) -> tuple[float, dict]: if isinstance(validation_result, list): chunks = validation_result done = chunks[3] + elif isinstance(validation_result, dict): + raise ValueError("Validation result as dict is not supported") else: chunks = [c.strip() for c in validation_result.split(",")] done = chunks[3].strip().lower() == "true" @@ -142,8 +192,7 @@ def parse_validation_result(self, validation_result: str | list) -> tuple[float, } def obs_postprocess(self, obs: dict) -> dict: - screenshot: Image.Image | None = obs.get("screenshot", None) - if screenshot is not None: + if screenshot := obs.get("screenshot", None): obs["screenshot"] = screenshot.crop( (0, 0, 332, 214) ) # crop to 332x214 because this is the viewport size for MiniWob diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py index 56d0d9b1..b77eeb8b 100644 --- a/src/agentlab/benchmarks/web_task.py +++ b/src/agentlab/benchmarks/web_task.py @@ -1,38 +1,66 @@ +from abc import ABC, abstractmethod from typing import ClassVar from pydantic import BaseModel from agentlab.actions import ToolSpec +from agentlab.backends.browser import BrowserBackend -class AbstractWebTask(BaseModel): +class AbstractWebTask(BaseModel, ABC): dataset: str + task_id: str url: str validate_per_step: bool = False actions_whitelist: ClassVar[list[str]] = [] max_turns: int = 100 + _backend: BrowserBackend = None # type: ignore - @classmethod - def filter_actions(cls, actions: list[ToolSpec]) -> list[str]: - return [action for action in actions if action.function.name in cls.actions_whitelist] + def get_task_id(self) -> str: + return self.task_id + + @abstractmethod + def setup(self, backend: BrowserBackend) -> tuple[str, dict]: + """ + Set up everything needed to execute the task. + + Args: + page: the active playwright page. - def get_setup_js(self) -> str: - raise NotImplementedError + Returns: + goal: str, goal of the task. + info: dict, custom information from the task. + """ - def parse_setup_result(self, setup_result: str | dict | list) -> str: - raise NotImplementedError + @abstractmethod + def teardown(self): + """ + Tear down the task, clean up resources if needed. - def get_teardown_js(self) -> str: - raise NotImplementedError + Args: + page: the active playwright page. + """ - def get_task_validate_js(self) -> str: - raise NotImplementedError + @abstractmethod + def validate(self) -> tuple[float, dict]: + """ + Validate the task, either per step or at the end. - def get_step_validate_js(self) -> str: - raise NotImplementedError + Returns: + reward: float, the reward obtained. + info: dict, custom information from the validation. + """ - def parse_validation_result(self, validate_result: str) -> tuple[float, dict]: - raise NotImplementedError + @abstractmethod + def cheat(self): + """ + Solve the task using a pre-defined solution (optional). + """ + + @classmethod + def filter_actions(cls, actions: list[ToolSpec]) -> list[ToolSpec]: + filtered_actions = [action for action in actions if action.function.name in cls.actions_whitelist] + return filtered_actions def obs_postprocess(self, obs: dict) -> dict: - return obs + return obs \ No newline at end of file From a203e464edaac76f95a3406b7bfb74f3743798ab Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 15:55:33 +0000 Subject: [PATCH 50/61] support new tasks interface --- experiments/run_miniwob.py | 4 +- experiments/test_mcp.py | 42 ------------------- src/agentlab/backends/browser/mcp.py | 22 +++++----- .../backends/browser/mcp_playwright.py | 8 ++-- src/agentlab/benchmarks/miniwob/benchmark.py | 3 +- 5 files changed, 18 insertions(+), 61 deletions(-) delete mode 100644 experiments/test_mcp.py diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index 8b740a48..ac0de326 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -11,7 +11,7 @@ from agentlab.agents.react_toolcall_agent import AgentConfig, LLMArgs, ReactToolCallAgentArgs from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config from agentlab.backends.browser.mcp_playwright import MCPPlaywright -from agentlab.backends.browser.playwright import AsyncPlaywright +from agentlab.backends.browser.playwright import SyncPlaywright from agentlab.benchmarks.miniwob import MiniWobBenchmark from agentlab.experiments.study import make_study from agentlab.llm.chat_api import BaseModelArgs @@ -54,7 +54,7 @@ def parse_args(): if args.backend == "bgym": benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) elif args.backend == "playwright": - benchmark = MiniWobBenchmark(backend_cls=AsyncPlaywright) + benchmark = MiniWobBenchmark(backend_cls=SyncPlaywright) elif args.backend == "mcp": benchmark = MiniWobBenchmark(backend_cls=MCPPlaywright) else: diff --git a/experiments/test_mcp.py b/experiments/test_mcp.py deleted file mode 100644 index 09eb7469..00000000 --- a/experiments/test_mcp.py +++ /dev/null @@ -1,42 +0,0 @@ -from agentlab.backends.browser.mcp_playwright import MCPPlaywright -from agentlab.benchmarks.miniwob.task import get_miniwob_tasks - - -def main(): - tasks = get_miniwob_tasks() - task = tasks[0] - setup_js = task.get_setup_js() - - backend = MCPPlaywright() - backend.initialize() - print(backend.actions()) - - print("="*100) - # 1. goto task url - print("URL: ", task.url) - obs = backend.call_tool("browser_navigate", {"url": task.url}) - print("------") - print(obs) - print("-"*100) - - # 2. eval js - obs = backend.run_js(setup_js) - print("------") - print(obs) - print("-"*100) - - # 3. validate - print("\n\nVALIDATE") - js = task.get_task_validate_js() - print(js) - obs = backend.run_js(js) - print("------") - print(obs) - print("-"*100) - -if __name__ == "__main__": - main() - - - - \ No newline at end of file diff --git a/src/agentlab/backends/browser/mcp.py b/src/agentlab/backends/browser/mcp.py index 428dd9cb..d6f1e9e4 100644 --- a/src/agentlab/backends/browser/mcp.py +++ b/src/agentlab/backends/browser/mcp.py @@ -8,7 +8,7 @@ from mcp import ClientSession, StdioServerParameters, stdio_client from mcp import Tool as MCPTool -from mcp.types import CallToolResult, ImageContent, TextContent +from mcp.types import CallToolResult, ContentBlock, TextContent from agentlab.actions import FunctionSpec, ToolCall, ToolSpec from agentlab.backends.browser.base import BrowserBackend @@ -24,7 +24,7 @@ def __init__(self, config_path: str, read_timeout_seconds: int = 10) -> None: self.tool_to_server: dict[str, str] = {} self.read_timeout_seconds = read_timeout_seconds self.exit_stack = AsyncExitStack() - self.loop = None + self.loop: asyncio.AbstractEventLoop def initialize(self): try: @@ -125,15 +125,15 @@ def check_tool_exists(self, tool_name): raise Exception(f"Tool {tool_name} not found in any of the MCP servers") return server_name - def actions(self) -> tuple[ToolSpec]: - return ( + def actions(self) -> list[ToolSpec]: + return [ ToolSpec( function=FunctionSpec( name=tool.name, description=tool.description or "", parameters=tool.inputSchema ) ) for tool in self.tools.values() - ) + ] async def aclose(self) -> None: await self.exit_stack.aclose() @@ -144,7 +144,7 @@ def close(self) -> None: class MCPBrowserBackend(BrowserBackend): config_path: str - _mcp = None + _mcp: MCPClient def initialize(self) -> None: self._mcp = MCPClient(config_path=self.config_path) @@ -152,20 +152,20 @@ def initialize(self) -> None: def step(self, action: ToolCall) -> dict: contents = self.call_tool(action.name, action.arguments) - text = "\n".join([c.text for c in contents if c.type == "text"]) + action_result = "\n".join([c.text for c in contents if c.type == "text"]) images = [c for c in contents if c.type == "image"] return { - "text": text, + "action_result": action_result, "screenshot": images[-1] if images else None, } - def call_tool(self, tool_name: str, arguments: dict) -> list[TextContent | ImageContent]: + def call_tool(self, tool_name: str, arguments: dict) -> list[ContentBlock]: tool_result = self._mcp.call_tool(tool_name, arguments) if tool_result.isError: - return [TextContent(text=f"Error calling tool {tool_name}")] + tool_result.content + return [TextContent(type="text", text=f"Error calling tool {tool_name}")] + tool_result.content return tool_result.content - def actions(self) -> tuple[ToolSpec]: + def actions(self) -> list[ToolSpec]: return list(self._mcp.actions()) def close(self) -> None: diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 69eddc7c..0718b356 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -15,7 +15,7 @@ class MCPPlaywright(MCPBrowserBackend): config_path: str = DEFAULT_CONFIG_PATH - def run_js(self, js: str): + def evaluate_js(self, js: str): contents = self.call_tool("browser_evaluate", {"function": js}) raw_response = "\n".join([c.text for c in contents if c.type == "text"]) try: @@ -30,14 +30,14 @@ def run_js(self, js: str): def step(self, action: ToolCall) -> dict: contents = self.call_tool(action.name, action.arguments) logger.info(f"Step result has {len(contents)} contents") - tool_result = "\n".join( + action_result = "\n".join( [c.text for c in contents if c.type == "text" and "# Ran Playwright code" not in c.text] ) html = self.page_html() screenshot = self.page_screenshot() axtree = self.page_axtree() return { - "tool_result": tool_result, + "action_result": action_result, "pruned_html": html, "axtree_txt": axtree, "screenshot": screenshot, @@ -60,7 +60,7 @@ def page_axtree(self) -> str: contents = self.call_tool("browser_snapshot", {}) return "\n".join([c.text for c in contents if c.type == "text"]) - def page_screenshot(self) -> Image: + def page_screenshot(self) -> Image.Image: contents = self.call_tool("browser_take_screenshot", {}) content = [c for c in contents if c.type == "image"][0] image_base64 = content.data diff --git a/src/agentlab/benchmarks/miniwob/benchmark.py b/src/agentlab/benchmarks/miniwob/benchmark.py index 1c38bbcd..2ce01895 100644 --- a/src/agentlab/benchmarks/miniwob/benchmark.py +++ b/src/agentlab/benchmarks/miniwob/benchmark.py @@ -28,7 +28,6 @@ def model_post_init(self, __context: Any) -> None: if self.dataset is None: self.dataset = get_miniwob_tasks() for task in self.dataset: - name = f"miniwob.{task.task_id}" - env_args = BrowserEnvArgs(task_name=name, task=task, backend_cls=self.backend_cls) + env_args = BrowserEnvArgs(task=task, backend_cls=self.backend_cls) self.env_args_list.append(env_args) logger.info(f"Loaded {len(self.env_args_list)} miniwob tasks") From 362de79a71c9d93f61ee926968481933481f62be Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 15:55:44 +0000 Subject: [PATCH 51/61] async playwright backend --- src/agentlab/backends/browser/playwright.py | 218 ++++++++++++++------ 1 file changed, 160 insertions(+), 58 deletions(-) diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index f1485e4f..c47aa86f 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -1,22 +1,29 @@ -import asyncio import logging from io import BytesIO from typing import Any, Callable from PIL import Image -from playwright.async_api import Browser, Page, async_playwright +from playwright.async_api import Page as AsyncPage +from playwright.async_api import async_playwright +from playwright.sync_api import Page as SyncPage +from playwright.sync_api import sync_playwright from agentlab.actions import ToolCall, ToolSpec -from agentlab.backends.browser.base import BrowserBackend +from agentlab.backends.browser.base import AsyncBrowserBackend, BrowserBackend logger = logging.getLogger(__name__) -class AsyncPlaywright(BrowserBackend): +_pw = None # Global Playwright instance for SyncPlaywright +_browser = None # Global Browser instance for SyncPlaywright + + +class SyncPlaywright(BrowserBackend): + """Fully synchronous Playwright backend using playwright.sync_api.""" + + has_pw_page: bool = True _actions: dict[str, Callable] - _loop: asyncio.AbstractEventLoop - _browser: Browser - _page: Page + _page: SyncPage def model_post_init(self, __context: Any): self._actions = { @@ -29,37 +36,139 @@ def model_post_init(self, __context: Any): "browser_mouse_click_xy": self.browser_mouse_click_xy, } - def initialize(self, loop: asyncio.AbstractEventLoop | None = None): - self._loop = loop or asyncio.get_event_loop() - self._loop.run_until_complete(self.ainitialize()) + def initialize(self): + global _pw, _browser + if _pw is None: + _pw = sync_playwright().start() + if _browser is None: + _browser = _pw.chromium.launch(headless=True, chromium_sandbox=True) + self._page = _browser.new_page() + + @property + def page(self) -> SyncPage: + return self._page + + def browser_press_key(self, key: str): + """Press a key on the keyboard.""" + self._page.keyboard.press(key) + + def browser_type(self, selector: str, text: str): + """Type text into the focused element.""" + self._page.type(selector, text) + + def browser_click(self, selector: str): + """Click on a selector.""" + self._page.click(selector, timeout=3000, strict=True) + + def browser_drag(self, from_selector: str, to_selector: str): + """Drag and drop from one selector to another.""" + from_elem = self._page.locator(from_selector) + from_elem.hover(timeout=500) + self._page.mouse.down() + + to_elem = self._page.locator(to_selector) + to_elem.hover(timeout=500) + self._page.mouse.up() + + def browser_hover(self, selector: str): + """Hover over a given element.""" + self._page.hover(selector, timeout=3000, strict=True) + + def browser_select_option(self, selector: str, value: str): + """Select an option from a given element.""" + self._page.select_option(selector, value) + + def browser_mouse_click_xy(self, x: int, y: int): + """Click at a given x, y coordinate using the mouse.""" + self._page.mouse.click(x, y, delay=100) + + def evaluate_js(self, js: str): + js_result = self._page.evaluate(js) + logger.info(f"JS result: {js_result}") + return js_result + + def goto(self, url: str): + self._page.goto(url) + + def page_html(self) -> str: + return self._page.content() + + def page_screenshot(self) -> Image.Image: + scr_bytes = self._page.screenshot() + return Image.open(BytesIO(scr_bytes)) + + def page_axtree(self) -> str: + axtree = self._page.accessibility.snapshot() + return flatten_axtree(axtree) + + def step(self, action: ToolCall) -> dict: + fn = self._actions[action.name] + try: + action_result = fn(**action.arguments) + except Exception as e: + action_result = f"Error executing action {action.name}: {e}" + logger.error(action_result) + html = self.page_html() + screenshot = self.page_screenshot() + axtree = self.page_axtree() + return { + "action_result": action_result, + "pruned_html": html, + "axtree_txt": axtree, + "screenshot": screenshot, + } + + def actions(self) -> list[ToolSpec]: + return [ToolSpec.from_function(fn) for fn in self._actions.values()] + + def close(self): + self._page.close() + + +_apw = None # Global Playwright instance for AsyncPlaywright +_abrowser = None # Global Browser instance for AsyncPlaywright + - async def ainitialize(self): - pw = await async_playwright().start() - self._browser = await pw.chromium.launch(headless=True, chromium_sandbox=True) - self._page = await self._browser.new_page() +class AsyncPlaywright(AsyncBrowserBackend): + """Fully asynchronous Playwright backend using playwright.async_api.""" + + has_pw_page: bool = False + _actions: dict[str, Callable] + _page: AsyncPage + + def model_post_init(self, __context: Any): + self._actions = { + "browser_press_key": self.browser_press_key, + "browser_type": self.browser_type, + "browser_click": self.browser_click, + "browser_drag": self.browser_drag, + "browser_hover": self.browser_hover, + "browser_select_option": self.browser_select_option, + "browser_mouse_click_xy": self.browser_mouse_click_xy, + } + + async def initialize(self): + global _apw, _abrowser + if _apw is None: + _apw = await async_playwright().start() + if _abrowser is None: + _abrowser = await _apw.chromium.launch(headless=True, chromium_sandbox=True) + self._page = await _abrowser.new_page() async def browser_press_key(self, key: str): - """ - Press a key on the keyboard. - """ + """Press a key on the keyboard.""" await self._page.keyboard.press(key) async def browser_type(self, selector: str, text: str): - """ - Type text into the focused element. - """ + """Type text into the focused element.""" await self._page.type(selector, text) async def browser_click(self, selector: str): - """ - Click on a selector. - """ + """Click on a selector.""" await self._page.click(selector, timeout=3000, strict=True) async def browser_drag(self, from_selector: str, to_selector: str): - """ - Drag and drop from one selector to another. - """ + """Drag and drop from one selector to another.""" from_elem = self._page.locator(from_selector) await from_elem.hover(timeout=500) await self._page.mouse.down() @@ -69,66 +178,59 @@ async def browser_drag(self, from_selector: str, to_selector: str): await self._page.mouse.up() async def browser_hover(self, selector: str): - """ - Hover over a given element. - """ + """Hover over a given element.""" await self._page.hover(selector, timeout=3000, strict=True) async def browser_select_option(self, selector: str, value: str): - """ - Select an option from a given element. - """ + """Select an option from a given element.""" await self._page.select_option(selector, value) async def browser_mouse_click_xy(self, x: int, y: int): - """ - Click at a given x, y coordinate using the mouse. - """ + """Click at a given x, y coordinate using the mouse.""" await self._page.mouse.click(x, y, delay=100) - def run_js(self, js: str): - js_result = self._loop.run_until_complete(self._page.evaluate(js)) + async def evaluate_js(self, js: str): + js_result = await self._page.evaluate(js) logger.info(f"JS result: {js_result}") return js_result - def goto(self, url: str): - self._loop.run_until_complete(self._page.goto(url)) + async def goto(self, url: str): + await self._page.goto(url) - def page_html(self): - return self._loop.run_until_complete(self._page.content()) + async def page_html(self) -> str: + return await self._page.content() - def page_screenshot(self): - scr_bytes = self._loop.run_until_complete(self._page.screenshot()) + async def page_screenshot(self) -> Image.Image: + scr_bytes = await self._page.screenshot() return Image.open(BytesIO(scr_bytes)) - def page_axtree(self): - axtree = self._loop.run_until_complete(self._page.accessibility.snapshot()) - flat_axtree = flatten_axtree(axtree) - return flat_axtree + async def page_axtree(self) -> str: + axtree = await self._page.accessibility.snapshot() + return flatten_axtree(axtree) - def step(self, action: ToolCall): + async def step(self, action: ToolCall) -> dict: fn = self._actions[action.name] try: - action_result = self._loop.run_until_complete(fn(**action.arguments)) + action_result = await fn(**action.arguments) except Exception as e: action_result = f"Error executing action {action.name}: {e}" logger.error(action_result) - html = self.page_html() - screenshot = self.page_screenshot() - axtree = self.page_axtree() + html = await self.page_html() + screenshot = await self.page_screenshot() + axtree = await self.page_axtree() return { - "tool_result": action_result, + "action_result": action_result, "pruned_html": html, "axtree_txt": axtree, "screenshot": screenshot, } - def actions(self) -> tuple[ToolSpec]: - specs = [ToolSpec.from_function(fn) for fn in self._actions.values()] - return tuple(specs) + def actions(self) -> list[ToolSpec]: + return [ToolSpec.from_function(fn) for fn in self._actions.values()] - def close(self): - self._loop.run_until_complete(self._browser.close()) + async def close(self): + await self._browser.close() + await self._pw.stop() def flatten_axtree(axtree_dict: dict | None) -> str: From 4fe4e48dbef268d11f3004317ffe3726af17d4e2 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 15:58:34 +0000 Subject: [PATCH 52/61] fix --- src/agentlab/benchmarks/web_task.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py index b77eeb8b..f753828f 100644 --- a/src/agentlab/benchmarks/web_task.py +++ b/src/agentlab/benchmarks/web_task.py @@ -51,7 +51,6 @@ def validate(self) -> tuple[float, dict]: info: dict, custom information from the validation. """ - @abstractmethod def cheat(self): """ Solve the task using a pre-defined solution (optional). From e6f1f5d5ddea437a98a43cf7dd3b786badc090ee Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 16:16:08 +0000 Subject: [PATCH 53/61] universal rendering of any dict observation that contains only texts and images --- src/agentlab/agents/react_toolcall_agent.py | 63 ++++++++++----------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index c500fe49..7477a148 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -5,8 +5,10 @@ from functools import partial from typing import Callable, Literal -from litellm import completion, token_counter +import numpy as np +from litellm import completion from litellm.types.utils import Message, ModelResponse +from litellm.utils import token_counter from PIL import Image from termcolor import colored @@ -67,6 +69,10 @@ class AgentConfig: Provide a concise summary that preserves all information needed to continue the task.""" +def user_message(content: str | list[dict]) -> dict: + return {"role": "user", "content": content} + + class ReactToolCallAgent: def __init__( self, @@ -90,41 +96,34 @@ def obs_to_messages(self, obs: dict) -> list[dict]: """ Convert the observation dictionary into a list of chat messages for Lite LLM """ + images = {k: v for k, v in obs.items() if isinstance(v, (Image.Image, np.ndarray))} + texts = {k: v for k, v in obs.items() if k not in images and v is not None and v != ""} messages = [] - if obs.get("goal_object") and not self.last_tool_call_id: + + if not self.last_tool_call_id and (goal_obj := texts.pop("goal_object", None)): # its a first observation when there are no tool_call_id, so include goal - goal = obs["goal_object"][0]["text"] - messages.append({"role": "user", "content": f"## Goal:\n{goal}"}) - text_obs = [] - if result := obs.get("action_result"): - text_obs.append(f"## Action Result:\n{result}") - if error := obs.get("last_action_error"): - text_obs.append(f"## Action Error:\n{error}") - if self.config.use_html and (html := obs.get("pruned_html")): - text_obs.append(f"## HTML:\n{html}") - if self.config.use_axtree and (axtree := obs.get("axtree_txt")): - text_obs.append(f"## Accessibility Tree:\n{axtree}") - content = "\n\n".join(text_obs) - if content: - if self.last_tool_call_id: - message = { - "role": "tool", - "tool_call_id": self.last_tool_call_id, - "content": content, - } - else: - message = {"role": "user", "content": content} - messages.append(message) - if self.config.use_screenshot and (scr := obs.get("screenshot")): - if isinstance(scr, Image.Image): + goal = goal_obj[0]["text"] + messages.append(user_message(f"Goal: {goal}")) + + text = "\n\n".join([f"## {k}\n{v}" for k, v in texts.items()]) + if self.last_tool_call_id: + message = { + "role": "tool", + "tool_call_id": self.last_tool_call_id, + "content": text, + } + else: + message = user_message(text) + messages.append(message) + + if self.config.use_screenshot: + for caption, image in images.items(): image_content = [ - {"type": "image_url", "image_url": {"url": image_to_png_base64_url(scr)}} + {"type": "text", "text": caption}, + {"type": "image_url", "image_url": {"url": image_to_png_base64_url(image)}}, ] - messages.append({"role": "user", "content": image_content}) - else: - raise ValueError( - f"Expected Image.Image in screenshot obs, got {type(obs['screenshot'])}" - ) + messages.append(user_message(image_content)) + return messages def get_action(self, obs: dict) -> tuple[ToolCall, dict]: From e7aa80736e5aa2b80e2cb91a2edf1fb48a0c83fe Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 16:19:06 +0000 Subject: [PATCH 54/61] fix --- src/agentlab/backends/browser/env.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 8059af09..cbeb5c91 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -21,7 +21,7 @@ def final_step(): "pruned_html": "Task finished", "axtree_txt": "", "last_action_error": "", - "focused_element_bid": "none", + "focused_element_bid": "", } @@ -62,7 +62,7 @@ def _get_obs(self) -> dict: "axtree_txt": axtree, "screenshot": screenshot, "last_action_error": "", - "focused_element_bid": "none", + "focused_element_bid": "", } return obs @@ -111,7 +111,7 @@ def obs_postprocess(self, obs: dict) -> dict: if "last_action_error" not in obs: obs["last_action_error"] = "" if "focused_element_bid" not in obs: - obs["focused_element_bid"] = "none" + obs["focused_element_bid"] = "" if isinstance(self.task, AbstractWebTask): obs = self.task.obs_postprocess(obs) return obs From 212c0f410908bc6d46157959581a47aa64850532 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 16:38:49 +0000 Subject: [PATCH 55/61] remove tape agent --- experiments/run_miniwob.py | 16 ++---- .../conf/agent/plan_react_fcall.yaml | 57 ------------------- .../agents/tapeagent/conf/agent/react.yaml | 24 -------- .../agents/tapeagent/conf/llm/gpt5-mini.yaml | 6 -- .../agents/tapeagent/conf/llm/sonnet.yaml | 6 -- .../agents/tapeagent/conf/miniwob.yaml | 9 --- 6 files changed, 5 insertions(+), 113 deletions(-) delete mode 100644 src/agentlab/agents/tapeagent/conf/agent/plan_react_fcall.yaml delete mode 100644 src/agentlab/agents/tapeagent/conf/agent/react.yaml delete mode 100644 src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml delete mode 100644 src/agentlab/agents/tapeagent/conf/llm/sonnet.yaml delete mode 100644 src/agentlab/agents/tapeagent/conf/miniwob.yaml diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py index ac0de326..8af60cdb 100644 --- a/experiments/run_miniwob.py +++ b/experiments/run_miniwob.py @@ -1,7 +1,6 @@ import argparse import logging import os -import sys from bgym import DEFAULT_BENCHMARKS from dotenv import load_dotenv @@ -9,12 +8,10 @@ from agentlab.agents.generic_agent.agent_configs import GPT5_MINI_FLAGS from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs from agentlab.agents.react_toolcall_agent import AgentConfig, LLMArgs, ReactToolCallAgentArgs -from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config from agentlab.backends.browser.mcp_playwright import MCPPlaywright from agentlab.backends.browser.playwright import SyncPlaywright from agentlab.benchmarks.miniwob import MiniWobBenchmark from agentlab.experiments.study import make_study -from agentlab.llm.chat_api import BaseModelArgs from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s" @@ -23,7 +20,6 @@ load_dotenv() - def parse_args(): parser = argparse.ArgumentParser(description="Run MiniWob benchmark experiments") parser.add_argument( @@ -49,7 +45,6 @@ def parse_args(): if __name__ == "__main__": args = parse_args() - config = load_config(args.config) if args.backend == "bgym": benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1) @@ -65,18 +60,17 @@ def parse_args(): chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"], flags=GPT5_MINI_FLAGS, ) - elif args.agent == "react": + else: # react agent_args = ReactToolCallAgentArgs( - llm_args=LLMArgs(model_name="azure/gpt-5-mini", temperature=1.0, max_total_tokens=128000), + llm_args=LLMArgs( + model_name="azure/gpt-5-mini", temperature=1.0, max_total_tokens=128000 + ), config=AgentConfig(), ) - else: - agent_args = TapeAgentArgs(agent_name=config.name, config=config) study = make_study( benchmark=benchmark, agent_args=agent_args, - comment=config.comment, logging_level=logging.INFO, logging_level_stdout=logging.INFO, ) @@ -84,4 +78,4 @@ def parse_args(): study.exp_args_list = study.exp_args_list[23:27] study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential") else: - study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend) + study.run(n_jobs=8, n_relaunch=1, parallel_backend="ray") diff --git a/src/agentlab/agents/tapeagent/conf/agent/plan_react_fcall.yaml b/src/agentlab/agents/tapeagent/conf/agent/plan_react_fcall.yaml deleted file mode 100644 index 69788ed2..00000000 --- a/src/agentlab/agents/tapeagent/conf/agent/plan_react_fcall.yaml +++ /dev/null @@ -1,57 +0,0 @@ -_target_: tapeagents.agent.Agent -name : web_agent -max_iterations: 2 -llms: - default: ${llm} -templates: - system_prompt: | - You are an expert AI Agent trained to assist users with complex web tasks. - Your role is to understand user queries, perform actions and respond in a helpful and accurate manner. - Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration. - Do not express emotions or opinions about user questions. - allowed_tools: | - You have access to the following tools: - {tools_description} - thought_format: | - Important! Respond with the plain text, do not include any JSON or code. - Do not output anything besides what I asked in this message. - -nodes: - - _target_: tapeagents.nodes.StandardNode - name: plan - system_prompt: ${agent.templates.system_prompt} - guidance: | - Write a concise multi-step plan explaining which steps should be performed to find the answer for the given task. - Be specific about how each step should be performed. Only describe the intended actions here, do not perform them yet. - Consider that next steps may depend on results of previous steps, so include conditional branching using "if" statements where needed. - Start with the title "Plan". - ${agent.templates.thought_format} - steps_prompt: ${agent.templates.allowed_tools} - - - _target_: tapeagents.nodes.StandardNode - name: reflect - system_prompt: ${agent.templates.system_prompt} - guidance: | - Produce the reasoning with a bullet-point list of thoughts strictly following the rules: - 1. Summarize the last observation and describe any webpage interactions/effects. - 2. Evaluate action success, explain impact on task/plan, and describe any errors with solutions. - 3. If the last action was not successful, ask yourself about the reasons for failure. - 4. List next steps to accomplish current plan step and propose next immediate action. - - Additional notes for web page observations: - - Accept cookie consents first - - Quote relevant observation parts verbatim - - Close popups before interacting - - If last action was not successful, check if the target element is visible, use scrolling if its not. - ${agent.templates.thought_format} - steps_prompt: ${agent.templates.allowed_tools} - - - _target_: tapeagents.nodes.StandardNode - name: act - system_prompt: ${agent.templates.system_prompt} - guidance: Produce an function call that performs the proposed step, if the task is complete, produce the final step. - steps: - - tapeagents.core.FinalStep - use_known_actions: true - use_function_calls: true - next_node: reflect \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/conf/agent/react.yaml b/src/agentlab/agents/tapeagent/conf/agent/react.yaml deleted file mode 100644 index 2f5b576e..00000000 --- a/src/agentlab/agents/tapeagent/conf/agent/react.yaml +++ /dev/null @@ -1,24 +0,0 @@ -_target_: tapeagents.agent.Agent -name : react_agent -max_iterations: 10 -llms: - default: ${llm} -templates: {} -nodes: - - _target_: agentlab.agents.tapeagent.agent.ToolCallNode - name: react - system_prompt: | - You are an expert AI Agent trained to assist users with complex web tasks. - Your role is to understand the goal, perform actions until the goal is accomplished and respond in a helpful and accurate manner. - Keep your replies brief, concise, direct and on topic. Prioritize clarity and avoid over-elaboration. - Do not express emotions or opinions. - guidance: | - Think along the following lines: - 1. Summarize the last observation and describe the visible changes in the state. - 2. Evaluate action success, explain impact on task and next steps. - 3. If you see any errors in the last observation, think about it. If there is no error, just move on. - 4. List next steps to move towards the goal and propose next immediate action. - Then produce the function call that performs the proposed action. If the task is complete, produce the final step. - steps: - - tapeagents.core.FinalStep - next_node: react \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml b/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml deleted file mode 100644 index e45a7756..00000000 --- a/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml +++ /dev/null @@ -1,6 +0,0 @@ -_target_: tapeagents.llms.LiteLLM -model_name: azure/gpt-5-mini -use_cache: false -context_size: 128000 -parameters: - temperature: 1.0 \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/conf/llm/sonnet.yaml b/src/agentlab/agents/tapeagent/conf/llm/sonnet.yaml deleted file mode 100644 index 01120ec9..00000000 --- a/src/agentlab/agents/tapeagent/conf/llm/sonnet.yaml +++ /dev/null @@ -1,6 +0,0 @@ -_target_: tapeagents.llms.Claude -model_name: claude-sonnet-4-5-20250929 -use_cache: false -context_size: 128000 -parameters: - temperature: 0.1 \ No newline at end of file diff --git a/src/agentlab/agents/tapeagent/conf/miniwob.yaml b/src/agentlab/agents/tapeagent/conf/miniwob.yaml deleted file mode 100644 index de8571c6..00000000 --- a/src/agentlab/agents/tapeagent/conf/miniwob.yaml +++ /dev/null @@ -1,9 +0,0 @@ -defaults: - - llm: gpt5-mini - - agent: react - - _self_ - -name: miniwob -comment: MiniWob Agent -parallel_backend: ray -n_jobs: 8 \ No newline at end of file From 8be1174da88091d5641fb8df53f2a67750fb726f Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 16:40:12 +0000 Subject: [PATCH 56/61] revert tapeagent changes --- src/agentlab/agents/tapeagent/agent.py | 191 ++----------------------- 1 file changed, 11 insertions(+), 180 deletions(-) diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py index 3627682c..1c7acae3 100644 --- a/src/agentlab/agents/tapeagent/agent.py +++ b/src/agentlab/agents/tapeagent/agent.py @@ -1,33 +1,15 @@ import logging -import tempfile from dataclasses import dataclass -from typing import Any, Literal +from typing import Literal import bgym import hydra -from litellm import ChatCompletionThinkingBlock from omegaconf import DictConfig -from PIL import Image from pydantic import Field from tapeagents.agent import Agent -from tapeagents.core import ( - Action, - ControlFlow, - LLMOutputParsingFailureAction, - Observation, - SetNextNode, - StopStep, - TapeMetadata, - Thought, -) +from tapeagents.core import Action, Observation, StopStep, TapeMetadata, Thought from tapeagents.core import Tape as BaseTape -from tapeagents.llms import LLMStream -from tapeagents.nodes import FatalError, StandardNode -from tapeagents.steps import ImageObservation -from tapeagents.tool_calling import ToolSpec -from termcolor import colored -from agentlab.actions import ToolSpec as AgentlabToolSpec from agentlab.agents.agent_args import AgentArgs logger = logging.getLogger(__name__) @@ -44,65 +26,10 @@ class ExtendedMetadata(TapeMetadata): other: dict = {} -class AgentResponse(Thought): - kind: Literal["agent_response"] = "agent_response" - response: str - - def llm_view(self, **kwargs) -> str: - return self.response - - -class AgentThinking(Thought): - kind: Literal["agent_thinking"] = "agent_thinking" - thinking: str - - def llm_view(self, **kwargs) -> str: - return self.thinking - - class Tape(BaseTape): metadata: ExtendedMetadata = Field(default_factory=ExtendedMetadata) # type: ignore -class ToolCallNode(StandardNode): - use_known_actions: bool = True - use_function_calls: bool = True - - def generate_steps(self, agent: Agent, tape: Tape, llm_stream: LLMStream): - new_steps = [] - for event in llm_stream: - if event.output.get("reasoning_content"): - logger.info(colored(f"LLM reasoning:\n{event.output.reasoning_content}", "yellow")) - new_steps.append(AgentThinking(thinking=event.output.reasoning_content)) - if event.output.get("thinking_blocks"): - for block in event.output.thinking_blocks: - if isinstance(block, ChatCompletionThinkingBlock): - logger.info(colored(f"LLM thinking block:\n{block}", "yellow")) - new_steps.append(AgentThinking(thinking=block.content)) - if event.output.content: - logger.info(colored(f"LLM output:\n{event.output.content}", "cyan")) - new_steps.append(AgentResponse(response=event.output.content)) - if event.output.tool_calls: - logger.info(colored(f"LLM tool calls:\n{event.output.tool_calls}", "magenta")) - new_steps += [ - self.tool_call_to_step(agent, tool_call) - for tool_call in event.output.tool_calls - ] - for step in new_steps: - yield step - if isinstance(step, LLMOutputParsingFailureAction): - yield SetNextNode(next_node=self.name) # loop to the same node to retry - break - if not new_steps: - raise FatalError("No completions!") - if ( - self.next_node - and not isinstance(new_steps[-1], StopStep) - and not any(isinstance(step, SetNextNode) for step in new_steps) - ): - yield SetNextNode(next_node=self.next_node) - - def load_config(config_name: str) -> DictConfig: with hydra.initialize(config_path="conf", version_base="1.1"): config = hydra.compose(config_name=config_name) @@ -113,20 +40,8 @@ def load_config(config_name: str) -> DictConfig: class TapeAgentArgs(AgentArgs): config: DictConfig = None # type: ignore - def make_agent(self, actions: tuple[ToolSpec, ...] | None) -> bgym.Agent: - if actions is None: - agent = hydra.utils.instantiate(self.config.agent) - else: - tapeagents_actions = [ - ToolSpec(**tool.model_dump()) if isinstance(tool, AgentlabToolSpec) else tool - for tool in actions - ] - tools_description = "\n".join([action.description() for action in actions]) - agent = hydra.utils.instantiate( - self.config.agent, - known_actions=tapeagents_actions, - tools_description=tools_description, - ) + def make_agent(self) -> bgym.Agent: + agent: Agent = hydra.utils.instantiate(self.config.agent) return TapeAgent(agent=agent) @@ -144,62 +59,6 @@ class DictObservation(Observation): content: str -class MarkdownObservation(Observation): - def llm_view(self, **kwargs) -> str: - return f"## Markdown:\n{self.content}" - - def short_view(self, max_chars: int = 100) -> str: - return self.llm_view()[:max_chars] - - -class GoalObservation(MarkdownObservation): - """ - Contains task goal - """ - - kind: Literal["goal_observation"] = "goal_observation" # type: ignore - goal: str - - def llm_view(self, **kwargs) -> str: - return f"## Goal:\n{self.goal}" - - -class HTMLPage(MarkdownObservation): - """ - Contains page content - """ - - kind: Literal["html_page"] = "html_page" - html: str - - def llm_view(self, **kwargs) -> str: - return f"## Page Content:\n{self.html}" - - -class AXTreePage(MarkdownObservation): - """ - Contains accessibility tree - """ - - kind: Literal["ax_tree_page"] = "ax_tree_page" - axtree: str - - def llm_view(self, **kwargs) -> str: - return f"## Accessibility Tree:\n{self.axtree}" - - -class ActionResult(MarkdownObservation): - """ - Contains action result - """ - - kind: Literal["action_result"] = "action_result" - result: str - - def llm_view(self, **kwargs) -> str: - return f"## Action Result:\n{self.result}" - - class TapeAgent(bgym.Agent): agent: Agent tape: Tape @@ -209,42 +68,15 @@ def __init__(self, agent: Agent): self.agent = agent self.tape = Tape(steps=[]) - def obs_preprocessor(self, obs: Any) -> list[Observation]: - return obs - - def obs_to_steps(self, obs: Observation | list[Observation] | dict) -> list[Observation]: + def obs_preprocessor(self, obs: Observation | list[Observation]) -> list[Observation]: if isinstance(obs, Observation): obs = [obs] - if isinstance(obs, dict): - obs_steps = [] - if obs.get("goal_object"): - obs_steps.append(GoalObservation(goal=obs["goal_object"][0]["text"])) - if obs.get("action_result"): - obs_steps.append(ActionResult(result=obs["action_result"])) - if obs.get("pruned_html"): - obs_steps.append(HTMLPage(html=obs["pruned_html"])) - if obs.get("axtree_txt"): - obs_steps.append(AXTreePage(axtree=obs["axtree_txt"])) - if obs.get("screenshot"): - if isinstance(obs["screenshot"], Image.Image): - tmp_image_path = tempfile.mktemp(suffix=".png") - obs["screenshot"].save(tmp_image_path) - obs_steps.append(ImageObservation(image_path=tmp_image_path)) - else: - raise ValueError(f"Expected Image.Image, got {type(obs['screenshot'])}") - if obs.get("last_action_error"): - obs_steps.append(ActionResult(result=f"Action error:\n{obs['last_action_error']}")) - assert len(obs_steps) > 0, f"Unknown dict observation, keys: {obs.keys()}" - obs = obs_steps assert isinstance(obs, list), f"Expected list of Observations, got {type(obs)}" - obs_view = "\n".join([o.short_view() for o in obs]) - logger.info(colored(f"Observations:\n{obs_view}", "green")) + logger.info(f"Observations: {[type(o).__name__ for o in obs]}") return obs - def get_action( - self, obs: Observation | list[Observation] | dict - ) -> tuple[Action, TapeAgentInfo]: - self.tape += self.obs_to_steps(obs) + def get_action(self, obs: Observation | list[Observation]) -> tuple[Action, TapeAgentInfo]: + self.tape += obs # type: ignore thoughts: list[Thought] = [] action = None while not action: @@ -252,7 +84,7 @@ def get_action( if not event.step: continue self.tape = self.tape.append(event.step) - if isinstance(event.step, Thought) and not isinstance(event.step, ControlFlow): + if isinstance(event.step, Thought): thoughts.append(event.step) logger.info(f"Thought: {event.step.llm_view()}") elif isinstance(event.step, Action) and not action: # we use first action only @@ -262,11 +94,10 @@ def get_action( # there could be control flow steps for switching nodes and if clauses logger.info(f"Other step: {type(event.step)}") logger.info(f"Tape after run: ({len(self.tape)}) {[type(s).__name__ for s in self.tape]}") - think_str = "\n".join([t.llm_view() for t in thoughts]) - return (action, {"thoughts": thoughts, "think": think_str}) + return (action, TapeAgentInfo(thoughts=thoughts)) @property def final_tape(self) -> Tape: truncated = not any([isinstance(s, StopStep) for s in self.tape.steps]) self.tape.metadata = ExtendedMetadata(author=self.agent.name, truncated=truncated) - return self.tape + return self.tape \ No newline at end of file From cdd9b543efd1ba0b8437252f9036da34c8ded338 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 16:40:41 +0000 Subject: [PATCH 57/61] fix --- src/agentlab/agents/tapeagent/agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py index 1c7acae3..eefda1d1 100644 --- a/src/agentlab/agents/tapeagent/agent.py +++ b/src/agentlab/agents/tapeagent/agent.py @@ -100,4 +100,4 @@ def get_action(self, obs: Observation | list[Observation]) -> tuple[Action, Tape def final_tape(self) -> Tape: truncated = not any([isinstance(s, StopStep) for s in self.tape.steps]) self.tape.metadata = ExtendedMetadata(author=self.agent.name, truncated=truncated) - return self.tape \ No newline at end of file + return self.tape From 1befd83afc975f659e8d01cfbdf7736a664c7e52 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Wed, 26 Nov 2025 17:05:42 +0000 Subject: [PATCH 58/61] html pruning --- src/agentlab/agents/react_toolcall_agent.py | 12 ++++++++++-- src/agentlab/backends/browser/env.py | 2 +- src/agentlab/backends/browser/mcp.py | 6 +++--- src/agentlab/backends/browser/mcp_playwright.py | 2 +- src/agentlab/backends/browser/playwright.py | 7 +++---- src/agentlab/benchmarks/miniwob/task.py | 3 +++ 6 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index 7477a148..7e0732f0 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -96,11 +96,19 @@ def obs_to_messages(self, obs: dict) -> list[dict]: """ Convert the observation dictionary into a list of chat messages for Lite LLM """ + goal_obj = obs.pop("goal_object", None) + if not self.config.use_html: + obs.pop("pruned_html", None) + obs.pop("html", None) + if not self.config.use_axtree: + obs.pop("axtree_txt", None) + if not self.config.use_screenshot: + obs.pop("screenshot", None) images = {k: v for k, v in obs.items() if isinstance(v, (Image.Image, np.ndarray))} - texts = {k: v for k, v in obs.items() if k not in images and v is not None and v != ""} + texts = {k: v for k, v in obs.items() if v is not None and isinstance(v, str) and v != ""} messages = [] - if not self.last_tool_call_id and (goal_obj := texts.pop("goal_object", None)): + if not self.last_tool_call_id and goal_obj is not None and len(goal_obj) > 0 and "text" in goal_obj[0]: # its a first observation when there are no tool_call_id, so include goal goal = goal_obj[0]["text"] messages.append(user_message(f"Goal: {goal}")) diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index cbeb5c91..11665cbc 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -58,7 +58,7 @@ def _get_obs(self) -> dict: axtree = self.backend.page_axtree() obs = { "goal_object": [{"type": "text", "text": self.goal}], - "pruned_html": html, + "html": html, "axtree_txt": axtree, "screenshot": screenshot, "last_action_error": "", diff --git a/src/agentlab/backends/browser/mcp.py b/src/agentlab/backends/browser/mcp.py index d6f1e9e4..6040d532 100644 --- a/src/agentlab/backends/browser/mcp.py +++ b/src/agentlab/backends/browser/mcp.py @@ -152,11 +152,11 @@ def initialize(self) -> None: def step(self, action: ToolCall) -> dict: contents = self.call_tool(action.name, action.arguments) - action_result = "\n".join([c.text for c in contents if c.type == "text"]) - images = [c for c in contents if c.type == "image"] + action_result = "\n\n".join([c.text for c in contents if c.type == "text"]) + images = {f"image_{i}":c for i,c in enumerate(contents) if c.type == "image"} return { "action_result": action_result, - "screenshot": images[-1] if images else None, + **images, } def call_tool(self, tool_name: str, arguments: dict) -> list[ContentBlock]: diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py index 0718b356..d2edec85 100644 --- a/src/agentlab/backends/browser/mcp_playwright.py +++ b/src/agentlab/backends/browser/mcp_playwright.py @@ -38,7 +38,7 @@ def step(self, action: ToolCall) -> dict: axtree = self.page_axtree() return { "action_result": action_result, - "pruned_html": html, + "html": html, "axtree_txt": axtree, "screenshot": screenshot, } diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index c47aa86f..25c99615 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -113,7 +113,7 @@ def step(self, action: ToolCall) -> dict: axtree = self.page_axtree() return { "action_result": action_result, - "pruned_html": html, + "html": html, "axtree_txt": axtree, "screenshot": screenshot, } @@ -220,7 +220,7 @@ async def step(self, action: ToolCall) -> dict: axtree = await self.page_axtree() return { "action_result": action_result, - "pruned_html": html, + "html": html, "axtree_txt": axtree, "screenshot": screenshot, } @@ -229,8 +229,7 @@ def actions(self) -> list[ToolSpec]: return [ToolSpec.from_function(fn) for fn in self._actions.values()] async def close(self): - await self._browser.close() - await self._pw.stop() + await self._page.close() def flatten_axtree(axtree_dict: dict | None) -> str: diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py index 80ffa12a..019711d4 100644 --- a/src/agentlab/benchmarks/miniwob/task.py +++ b/src/agentlab/benchmarks/miniwob/task.py @@ -3,6 +3,7 @@ from typing import Any, ClassVar from browsergym.miniwob import ALL_MINIWOB_TASKS +from browsergym.utils.obs import prune_html from agentlab.backends.browser import BrowserBackend from agentlab.benchmarks.web_task import AbstractWebTask @@ -192,6 +193,8 @@ def _parse_validation_result(self, validation_result: str | dict | list) -> tupl } def obs_postprocess(self, obs: dict) -> dict: + html = obs.pop("html", "") + obs["pruned_html"] = prune_html(html) if screenshot := obs.get("screenshot", None): obs["screenshot"] = screenshot.crop( (0, 0, 332, 214) From 462038e3e4bc9661e8dec143e5fd99119eb199a3 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 4 Dec 2025 17:07:35 +0000 Subject: [PATCH 59/61] max obs size limit, function to prepare pair of turn data for rl training --- src/agentlab/agents/react_toolcall_agent.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index 7e0732f0..aea2a0c0 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -44,6 +44,7 @@ class AgentConfig: use_axtree: bool = False use_screenshot: bool = True max_actions: int = 10 + max_obs_chars: int = 100000 # truncate long observations to N chars max_history_tokens: int = 120000 system_prompt: str = """ You are an expert AI Agent trained to assist users with complex web tasks. @@ -113,7 +114,7 @@ def obs_to_messages(self, obs: dict) -> list[dict]: goal = goal_obj[0]["text"] messages.append(user_message(f"Goal: {goal}")) - text = "\n\n".join([f"## {k}\n{v}" for k, v in texts.items()]) + text = "\n\n".join([f"## {k}\n{v}" for k, v in texts.items()])[:self.config.max_obs_chars] if self.last_tool_call_id: message = { "role": "tool", @@ -182,6 +183,7 @@ def action_from_message(self, message: Message) -> ToolCall: logger.warning("Multiple tool calls found in LLM response, using the first one.") tool_call = message.tool_calls[0] name = tool_call.function.name + assert name, "Tool call must have a name." args = json.loads(tool_call.function.arguments) action = ToolCall(id=tool_call.id, name=name, arguments=args) self.last_tool_call_id = action.id @@ -213,7 +215,7 @@ def compact_history(self): ] try: - response = self.llm(messages=messages, tool_choice="none") + response = self.llm(messages=messages) summary = response.choices[0].message.content # type: ignore except Exception as e: logger.exception(f"Error compacting history: {e}") @@ -224,11 +226,19 @@ def compact_history(self): summary_message = {"role": "user", "content": f"## Previous Interaction :\n{summary}"} self.history = [system_msg, summary_message, *rest[midpoint:]] + def get_training_pairs(self) -> list[tuple[list[dict | Message], Message]]: + input_output_pairs = [] + prev_history = [] + for msg in self.history: + if isinstance(msg, Message): + input_output_pairs.append((prev_history, msg)) + prev_history.append(msg) + return input_output_pairs @dataclass class ReactToolCallAgentArgs(AgentArgs): - llm_args: LLMArgs | None = None - config: AgentConfig | None = None + llm_args: LLMArgs = None # type: ignore + config: AgentConfig = None # type: ignore def make_agent(self, actions: list[ToolSpec]) -> ReactToolCallAgent: llm = self.llm_args.make_model() From cf68ef661cc4542a7f0da8436b905d5c2e8a4468 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Thu, 4 Dec 2025 19:58:05 +0000 Subject: [PATCH 60/61] workarena bench, reuse bgym task inside --- src/agentlab/backends/browser/env.py | 32 +++-------- src/agentlab/backends/browser/playwright.py | 5 ++ src/agentlab/benchmarks/workarena/__init__.py | 4 ++ .../benchmarks/workarena/benchmark.py | 56 ++++++++++++++++++ src/agentlab/benchmarks/workarena/task.py | 57 +++++++++++++++++++ 5 files changed, 130 insertions(+), 24 deletions(-) create mode 100644 src/agentlab/benchmarks/workarena/__init__.py create mode 100644 src/agentlab/benchmarks/workarena/benchmark.py create mode 100644 src/agentlab/benchmarks/workarena/task.py diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py index 11665cbc..bc9410f2 100644 --- a/src/agentlab/backends/browser/env.py +++ b/src/agentlab/backends/browser/env.py @@ -3,8 +3,6 @@ from dataclasses import dataclass from pathlib import Path -from browsergym.core.task import AbstractBrowserTask - from agentlab.actions import ToolCall, ToolsActionSet, ToolSpec from agentlab.backends.browser.base import BrowserBackend from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs @@ -27,7 +25,7 @@ def final_step(): class BrowserEnv(AbstractEnv): def __init__( - self, task_name: str, task: AbstractWebTask | AbstractBrowserTask, backend: BrowserBackend, seed: int = 0 + self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0 ): self.task_name = task_name self.task = task @@ -36,20 +34,12 @@ def __init__( self.backend = backend self.backend.initialize() self.goal = "" - if isinstance(self.task, AbstractBrowserTask) and not self.backend.has_pw_page: - raise ValueError( - "Legacy task requires a backend with direct playwright page access." - ) def reset(self, seed: int): self.seed = seed - if isinstance(self.task, AbstractBrowserTask): - self.goal, task_info = self.task.setup(page=self.backend.page) - obs = self._get_obs() - else: - self.goal, task_info = self.task.setup(backend=self.backend) - obs = self._get_obs() - obs = self.task.obs_postprocess(obs) + self.goal, task_info = self.task.setup(backend=self.backend) + obs = self._get_obs() + obs = self.task.obs_postprocess(obs) return obs, task_info def _get_obs(self) -> dict: @@ -86,21 +76,15 @@ def step(self, action: ToolCall | str) -> tuple[dict, float, bool, bool, dict]: observation = self.obs_postprocess(observation) - if isinstance(self.task, AbstractBrowserTask): - reward, done, _, info = self.task.validate(page=self.backend.page, chat_messages=[]) - elif self.task.validate_per_step or done or truncated: - reward, info = self.task.validate() - if info.get("done", False): - done = True - else: - reward = 0.0 - info = {} + reward, info = self.task.validate() + if info.get("done", False): + done = True env_info = { **info, "action_exec_start": action_exec_start, "action_exec_stop": action_exec_stop, - "action_exec_timeout": 0.0 + "action_exec_timeout": 0.0, } logger.info(f"Action result in observation: {observation}") return observation, reward, done, truncated, env_info diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index 25c99615..00ec30b5 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -1,4 +1,5 @@ import logging +import time from io import BytesIO from typing import Any, Callable @@ -82,6 +83,10 @@ def browser_mouse_click_xy(self, x: int, y: int): """Click at a given x, y coordinate using the mouse.""" self._page.mouse.click(x, y, delay=100) + def browser_wait(self, seconds: int): + """Wait for a given number of seconds, up to 10 seconds.""" + time.sleep(min(seconds, 10)) + def evaluate_js(self, js: str): js_result = self._page.evaluate(js) logger.info(f"JS result: {js_result}") diff --git a/src/agentlab/benchmarks/workarena/__init__.py b/src/agentlab/benchmarks/workarena/__init__.py new file mode 100644 index 00000000..4b038f1c --- /dev/null +++ b/src/agentlab/benchmarks/workarena/__init__.py @@ -0,0 +1,4 @@ +from .benchmark import WorkArenaBenchmark +from .task import WorkarenaTask + +__all__ = ["WorkArenaBenchmark", "WorkarenaTask"] \ No newline at end of file diff --git a/src/agentlab/benchmarks/workarena/benchmark.py b/src/agentlab/benchmarks/workarena/benchmark.py new file mode 100644 index 00000000..19370ba2 --- /dev/null +++ b/src/agentlab/benchmarks/workarena/benchmark.py @@ -0,0 +1,56 @@ +import logging +from typing import Any + +from browsergym.workarena import get_all_tasks_agents +from browsergym.workarena.instance import SNowInstance +from pydantic import ConfigDict +from ray.cloudpickle import instance + +from agentlab.actions import ToolsActionSet +from agentlab.backends.browser.base import BrowserBackend +from agentlab.backends.browser.env import BrowserEnvArgs +from agentlab.benchmarks.abstract_env import AbstractBenchmark + +from .task import WorkarenaTask + +logger = logging.getLogger(__name__) + + +class WorkArenaBenchmark(AbstractBenchmark): + model_config = ConfigDict(arbitrary_types_allowed=True) + + backend_cls: type[BrowserBackend] + name: str = "workarena" + level: str = "l1" + env_args_list: list[BrowserEnvArgs] = None # type: ignore + dataset: list[WorkarenaTask] = None # type: ignore + is_multi_tab: bool = False + high_level_action_set_args: ToolsActionSet = None # type: ignore + _snow_instance: SNowInstance = None # type: ignore + + def model_post_init(self, __context: Any) -> None: + self.name = f"workarena_{self.level}_{self.backend_cls.__name__.lower()}" + self._snow_instance = SNowInstance() + self.env_args_list = [] + if self.dataset is None: + task_seed_tuples = get_all_tasks_agents(filter=self.level) + self.dataset = self.load_tasks(task_seed_tuples, self.level) + for task in self.dataset: + env_args = BrowserEnvArgs(task=task, backend_cls=self.backend_cls) + self.env_args_list.append(env_args) + logger.info(f"Loaded {len(self.env_args_list)} workarena tasks") + + def load_tasks(self, task_seed_tuples: list[tuple[type, int]], level: str) -> list[WorkarenaTask]: + tasks = [] + + for task_cls, seed in task_seed_tuples: + task = WorkarenaTask( + url="", + task_id=task_cls.get_task_id(), + instance=self._snow_instance, + task_cls=task_cls, + level=level, + seed=seed, + ) + tasks.append(task) + return tasks \ No newline at end of file diff --git a/src/agentlab/benchmarks/workarena/task.py b/src/agentlab/benchmarks/workarena/task.py new file mode 100644 index 00000000..49dd1f07 --- /dev/null +++ b/src/agentlab/benchmarks/workarena/task.py @@ -0,0 +1,57 @@ +import logging +from typing import ClassVar + +from browsergym.utils.obs import prune_html +from browsergym.workarena.instance import SNowInstance +from browsergym.workarena.tasks.base import AbstractServiceNowTask +from pydantic import ConfigDict + +from agentlab.backends.browser import BrowserBackend +from agentlab.benchmarks.web_task import AbstractWebTask + +logger = logging.getLogger(__name__) + + +class WorkarenaTask(AbstractWebTask): + model_config = ConfigDict(arbitrary_types_allowed=True) + + dataset: str = "workarena" + level: str + task_cls: type[AbstractServiceNowTask] + seed: int + instance: SNowInstance + _task_obj: AbstractServiceNowTask = None # type: ignore + actions_whitelist: ClassVar[list[str]] = [ + "browser_press_key", + "browser_type", + "browser_click", + "browser_drag", + "browser_hover", + "browser_select_option", + "browser_mouse_click_xy", + "browser_wait", + ] + + def setup(self, backend: BrowserBackend) -> tuple[str, dict]: + if not backend.has_pw_page: + raise ValueError("Workarena task requires a backend with playwright page access.") + self._backend = backend + self._task_obj = self.task_cls(instance=self.instance, seed=self.seed) # type: ignore + self.url = self._task_obj.start_url + goal, info = self._task_obj.setup(backend.page) + logger.info(f"Current backend page URL: {backend.page.url}") + # backend.goto(self.url) + return goal, info + + def teardown(self) -> None: + self._task_obj.teardown() + + def validate(self) -> tuple[float, dict]: + reward, done, _, info = self._task_obj.validate(page=self._backend.page, chat_messages=[]) + info["done"] = done + return reward, info + + def obs_postprocess(self, obs: dict) -> dict: + html = obs.pop("html", "") + obs["pruned_html"] = prune_html(html) + return obs \ No newline at end of file From 805c717095af42e655341537f71c059a12b8f036 Mon Sep 17 00:00:00 2001 From: Oleh Shliazhko Date: Fri, 5 Dec 2025 14:46:15 +0000 Subject: [PATCH 61/61] fixes --- src/agentlab/agents/react_toolcall_agent.py | 2 +- src/agentlab/backends/browser/playwright.py | 12 +++++++++++- src/agentlab/benchmarks/workarena/benchmark.py | 10 +++++----- src/agentlab/benchmarks/workarena/task.py | 12 ++++++------ 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/agentlab/agents/react_toolcall_agent.py b/src/agentlab/agents/react_toolcall_agent.py index aea2a0c0..01df0836 100644 --- a/src/agentlab/agents/react_toolcall_agent.py +++ b/src/agentlab/agents/react_toolcall_agent.py @@ -145,7 +145,7 @@ def get_action(self, obs: dict) -> tuple[ToolCall, dict]: messages = self.history + [{"role": "user", "content": self.config.guidance}] try: - logger.info(colored(f"Prompt:\n{pprint.pformat(messages, width=120)}", "blue")) + logger.info(colored(f"Prompt:\n{pprint.pformat([str(m)[:500] for m in messages], width=120)}", "blue")) response = self.llm(tools=self.tools, messages=messages) message = response.choices[0].message # type: ignore except Exception as e: diff --git a/src/agentlab/backends/browser/playwright.py b/src/agentlab/backends/browser/playwright.py index 00ec30b5..01a306bf 100644 --- a/src/agentlab/backends/browser/playwright.py +++ b/src/agentlab/backends/browser/playwright.py @@ -43,6 +43,7 @@ def initialize(self): _pw = sync_playwright().start() if _browser is None: _browser = _pw.chromium.launch(headless=True, chromium_sandbox=True) + self._page = _browser.new_page() @property @@ -93,8 +94,17 @@ def evaluate_js(self, js: str): return js_result def goto(self, url: str): + """Navigate to a specified URL.""" self._page.goto(url) + def browser_back(self): + """Navigate back in browser history.""" + self._page.go_back() + + def browser_forward(self): + """Navigate forward in browser history.""" + self._page.go_forward() + def page_html(self) -> str: return self._page.content() @@ -157,7 +167,7 @@ async def initialize(self): if _apw is None: _apw = await async_playwright().start() if _abrowser is None: - _abrowser = await _apw.chromium.launch(headless=True, chromium_sandbox=True) + _abrowser = await _apw.chromium.launch(headless=False, chromium_sandbox=True) self._page = await _abrowser.new_page() async def browser_press_key(self, key: str): diff --git a/src/agentlab/benchmarks/workarena/benchmark.py b/src/agentlab/benchmarks/workarena/benchmark.py index 19370ba2..725a55ea 100644 --- a/src/agentlab/benchmarks/workarena/benchmark.py +++ b/src/agentlab/benchmarks/workarena/benchmark.py @@ -4,7 +4,6 @@ from browsergym.workarena import get_all_tasks_agents from browsergym.workarena.instance import SNowInstance from pydantic import ConfigDict -from ray.cloudpickle import instance from agentlab.actions import ToolsActionSet from agentlab.backends.browser.base import BrowserBackend @@ -22,6 +21,7 @@ class WorkArenaBenchmark(AbstractBenchmark): backend_cls: type[BrowserBackend] name: str = "workarena" level: str = "l1" + n_seeds: int = 1 env_args_list: list[BrowserEnvArgs] = None # type: ignore dataset: list[WorkarenaTask] = None # type: ignore is_multi_tab: bool = False @@ -33,16 +33,15 @@ def model_post_init(self, __context: Any) -> None: self._snow_instance = SNowInstance() self.env_args_list = [] if self.dataset is None: - task_seed_tuples = get_all_tasks_agents(filter=self.level) - self.dataset = self.load_tasks(task_seed_tuples, self.level) + self.dataset = self.load_tasks(self.level) for task in self.dataset: env_args = BrowserEnvArgs(task=task, backend_cls=self.backend_cls) self.env_args_list.append(env_args) logger.info(f"Loaded {len(self.env_args_list)} workarena tasks") - def load_tasks(self, task_seed_tuples: list[tuple[type, int]], level: str) -> list[WorkarenaTask]: + def load_tasks(self, level: str) -> list[WorkarenaTask]: + task_seed_tuples = get_all_tasks_agents(filter=self.level, n_seed_l1=self.n_seeds) tasks = [] - for task_cls, seed in task_seed_tuples: task = WorkarenaTask( url="", @@ -53,4 +52,5 @@ def load_tasks(self, task_seed_tuples: list[tuple[type, int]], level: str) -> li seed=seed, ) tasks.append(task) + logger.info(f"Loaded {len(tasks)} tasks for level {level}") return tasks \ No newline at end of file diff --git a/src/agentlab/benchmarks/workarena/task.py b/src/agentlab/benchmarks/workarena/task.py index 49dd1f07..d2d1efda 100644 --- a/src/agentlab/benchmarks/workarena/task.py +++ b/src/agentlab/benchmarks/workarena/task.py @@ -24,23 +24,23 @@ class WorkarenaTask(AbstractWebTask): actions_whitelist: ClassVar[list[str]] = [ "browser_press_key", "browser_type", - "browser_click", - "browser_drag", - "browser_hover", "browser_select_option", "browser_mouse_click_xy", "browser_wait", + "browser_back", + "browser_forward", ] def setup(self, backend: BrowserBackend) -> tuple[str, dict]: if not backend.has_pw_page: raise ValueError("Workarena task requires a backend with playwright page access.") self._backend = backend - self._task_obj = self.task_cls(instance=self.instance, seed=self.seed) # type: ignore + self._task_obj = self.task_cls(instance=self.instance, seed=self.seed) # type: ignore self.url = self._task_obj.start_url goal, info = self._task_obj.setup(backend.page) + backend.goto(self.url) logger.info(f"Current backend page URL: {backend.page.url}") - # backend.goto(self.url) + return goal, info def teardown(self) -> None: @@ -54,4 +54,4 @@ def validate(self) -> tuple[float, dict]: def obs_postprocess(self, obs: dict) -> dict: html = obs.pop("html", "") obs["pruned_html"] = prune_html(html) - return obs \ No newline at end of file + return obs