From 048a622ba4e3b072a29b151eaaf12c2377e88e21 Mon Sep 17 00:00:00 2001 From: recursix Date: Fri, 17 Jan 2025 15:29:52 -0500 Subject: [PATCH 01/25] Add initial implementation of ChangeSummarizer and EpisodeAnalysis classes --- src/agentlab/analyze/error_analysis.py | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 src/agentlab/analyze/error_analysis.py diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py new file mode 100644 index 00000000..a0fbdb43 --- /dev/null +++ b/src/agentlab/analyze/error_analysis.py @@ -0,0 +1,50 @@ +from dataclasses import dataclass +from bgym import StepInfo + + +def _diff(past_obs, current_obs): + """TODO: Implement the diff function. + + Returns a diff version of current_obs compares to past_obs, unless there is too many changes. + """ + raise ValueError("Not implemented yet.") + + +@dataclass +class ChangeSummarizer: + + llm: callable # language model + obs_formatter: callable + use_diff: bool = False + + def summarize( + self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str] + ) -> str: + """Produces, a summary of the effect of an action.""" + past_obs_message = self.obs_formatter(past_obs) + current_obs_message = self.obs_formatter(current_obs) + if self.use_diff: + current_obs_message = _diff(past_obs_message, current_obs_message) + + return self.llm(self.make_prompt(past_obs_message, current_obs_message, action)) + + def make_prompt(self, past_obs_message, action, current_obs_message, past_summaries): + """TODO: Implement the prompt.""" + return f"{past_obs_message} {action} {current_obs_message}" + + +@dataclass +class EpisodeAnalysis: + analysis: str # complete analysis of the episode + summary: str # short summary of the analysis + categories: dict[str, float] # score for each category e.g. type of error or difficulty levels + + +@dataclass +class EpisodeSummarizer: + + cange_summarizer: ChangeSummarizer = None + + def summarize(episode: list[StepInfo]) -> EpisodeAnalysis: + """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" + pass From fd8fd95d74d023b90d7803f555e8a4f3fc7f285b Mon Sep 17 00:00:00 2001 From: Megh Thakkar Date: Tue, 21 Jan 2025 00:04:37 -0500 Subject: [PATCH 02/25] Added chain summarizer prompt --- src/agentlab/analyze/error_analysis.py | 74 ++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py index a0fbdb43..0a869522 100644 --- a/src/agentlab/analyze/error_analysis.py +++ b/src/agentlab/analyze/error_analysis.py @@ -1,6 +1,59 @@ from dataclasses import dataclass from bgym import StepInfo +CHANGE_SUMMARIZER_PROMPT = """ +You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, +you will receive the following pieces of information: + +1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'"). +2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries. +3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet). +4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'"). +5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available. + +YOUR TASK (each step): +A) SUMMARIZE THE CHANGE + - Describe what visibly changed between the previous observation (or diff) and the current observation. + For example, did a new panel open, did the form reset, did nothing happen, etc.? + +B) ASSESS THE ACTION + - Decide whether the agent's action seems helpful or correct given the user's main goal, + or if it appears incorrect/unhelpful. + - Briefly explain why. + +OUTPUT FORMAT (per step): +Return your analysis as a JSON-like structure, for example: + +{ + "changeSummary": "A new search results panel appeared on the right side.", + "actionAssessment": "Correct", + "explanation": "Clicking 'Search' was appropriate to display the results." +} + +Or for an incorrect action: + +{ + "changeSummary": "The page reloaded but the date fields were reset to defaults.", + "actionAssessment": "Incorrect", + "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.", + "suggestion": "Correct the date format or check for error messages." +} + +Please follow this structure at every step. Keep your responses concise and clear. Below are the details. + +Goal: {goal} + +LLM Plan: {plan} + +Previous Observation: {past_observation} + +Current Observation: {current_observation} + +Past summaries: {past_summaries} + +Action: {action} +""" + def _diff(past_obs, current_obs): """TODO: Implement the diff function. @@ -23,14 +76,29 @@ def summarize( """Produces, a summary of the effect of an action.""" past_obs_message = self.obs_formatter(past_obs) current_obs_message = self.obs_formatter(current_obs) + goal = past_obs["goal"] + plan = past_obs["plan"] if self.use_diff: current_obs_message = _diff(past_obs_message, current_obs_message) - return self.llm(self.make_prompt(past_obs_message, current_obs_message, action)) + return self.llm( + self.make_prompt( + past_obs_message, action, current_obs_message, past_summaries, goal, plan + ) + ) - def make_prompt(self, past_obs_message, action, current_obs_message, past_summaries): + def make_prompt( + self, past_obs_message, action, current_obs_message, past_summaries, goal, plan + ): """TODO: Implement the prompt.""" - return f"{past_obs_message} {action} {current_obs_message}" + return CHANGE_SUMMARIZER_PROMPT.format( + goal=goal, + plan=plan, + past_observation=past_obs_message, + current_observation=current_obs_message, + past_summaries=past_summaries, + action=action, + ) @dataclass From b8c85b101c2acaf2912a5b063a228f1e1f4cd334 Mon Sep 17 00:00:00 2001 From: Megh Thakkar Date: Tue, 21 Jan 2025 01:02:17 -0500 Subject: [PATCH 03/25] Added error classification prompt --- src/agentlab/analyze/error_analysis.py | 168 ++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py index 0a869522..07406a43 100644 --- a/src/agentlab/analyze/error_analysis.py +++ b/src/agentlab/analyze/error_analysis.py @@ -54,6 +54,156 @@ Action: {action} """ +ERROR_CLASSIFICATION_PROMPT = """ +You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. +Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors), +followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), +a set of labeled examples for reference (few-shot), and finally the classification task you must complete. + +-------------------------------------------------------------------------------- +TAXONOMY DEFINITIONS +-------------------------------------------------------------------------------- + +1. AGENT ERRORS +These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation. + + - Navigation & Planning Errors + The agent cannot construct or execute a correct sequence of actions to reach its goal + (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). + + - Interaction Execution Errors + The agent enters data in the wrong format, forgets to click "Submit" after typing, + repeats the same failing action without adaptation, or loses track of the changing webpage state. + + - Information Processing Errors + The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), + misconstrues relationships between pieces of information, or fails to validate data against task requirements. + + - Observation & Action Errors + The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) + or misaligns its actions (clicks the wrong element or stale link). + +2. LANGUAGE MODEL ERRORS +These errors result from the model's inability to correctly interpret or reason about the task at a higher level, +independent of the low-level web interactions. + + - Task Understanding Errors + The agent misreads or misunderstands the user's objective (goal interpretation), + loses crucial context (context loss), or performs actions beyond or short of the intended scope. + + - Reasoning Failures + The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, + or fails to prioritize important subtasks when handling complex goals. + +3. BENCHMARK & ENVIRONMENT ERRORS +These errors are external to the agent's logic and the language model's reasoning, +arising from flaws in the system, network, or evaluation framework itself. + + - System Errors + Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts). + + - Benchmark Design Errors + Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), + or inflexible evaluation systems that fail to account for valid alternative solutions. + +-------------------------------------------------------------------------------- +INPUT DESCRIPTION +-------------------------------------------------------------------------------- + +You will receive the following for each scenario: +1. User Goal + - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). + +2. Planning / Thought History + - The internal reasoning or plan the agent considered. May include branches of logic or key decision points. + +3. Current Observation (HTML / AX Tree Snippet) + - The webpage structure or state that the agent sees at a given point in time. + +4. Historical change summaries + - A list of summaries of changes in the observation that the agent has seen during the course of actions. + +5. Action History + - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) + along with immediate outcomes or errors. + +Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories. + +-------------------------------------------------------------------------------- +FEW-SHOT CLASSIFICATION EXAMPLES +-------------------------------------------------------------------------------- + +1) EXAMPLE A (Benchmarl Error - Benchmark Design Error) + • Context: The agent correctly finds a cheaper product meeting the user's criteria, + but the benchmark expects a more expensive product and marks the solution as wrong. + • Classification: ["Benchmark Design Error"] + • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid + and does not allow an alternative correct solution. + +2) EXAMPLE B (Agent Error - Interaction Execution) + • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. + Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. + • Classification: ["Agent Error - Interaction Execution"] + • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action + without adaptation ("Action Repetition"). + +3) EXAMPLE C (Benchmark Error - Benchmark Design Error) + • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" + The query is ambiguous because "Upitts" is not a standard location. + The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region. + • Classification: ["Benchmark Design Error"] + • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), + leading the agent astray due to unclear context. + +4) EXAMPLE D (Language Model Error - Task Understanding) + • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' + that are older than 30 days and add a comment saying 'I can help fix this.'" + The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue + with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. + • Classification: ["Language Model Error - Task Understanding"] + • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, + it focused on creating a new issue. This is a misinterpretation of the instructions, + not a mechanical error in clicking or input format. + +-------------------------------------------------------------------------------- +CLASSIFICATION TASK +-------------------------------------------------------------------------------- + +1. Read through: + - The planning and thought history + - The action history + - The current HTML or AX Tree observation + - The user goal + +2. Decide if the failure is: + - An Agent Error (which subcategory/subcategories), + - A Language Model Error (which subcategory/subcategories), + - A Benchmark/Environment Error (which subcategory/subcategories), + - Or a combination thereof (multi-label if needed). + +3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. + +4. If the agent succeeds (no error), label the errorCategory accordingly as "Success". + +Output Format Example: +{ + "errorCategory": ["Agent Error - Navigation & Planning"], + "explanation": "The agent opened the wrong GitLab page and never recovered..." +} + +Please follow this structure at every step. Keep your responses concise and clear. Below are the details. + +Overall goal: {goal} + +LLM Plan and thought history: {plan} + +Current Observation: {current_observation} + +Historical change summaries: {historical_summaries} + +Action history: {action_history} +""" + def _diff(past_obs, current_obs): """TODO: Implement the diff function. @@ -111,8 +261,24 @@ class EpisodeAnalysis: @dataclass class EpisodeSummarizer: - cange_summarizer: ChangeSummarizer = None + change_summarizer: ChangeSummarizer = None def summarize(episode: list[StepInfo]) -> EpisodeAnalysis: """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" pass + + +@dataclass +class EpisodeErrorSummarizer(EpisodeSummarizer): + + change_summarizer: ChangeSummarizer = None + + def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan): + """TODO: Implement the prompt.""" + return ERROR_CLASSIFICATION_PROMPT.format( + goal=goal, + plan=plan, + current_observation=current_observation, + historical_summaries=historical_summaries, + action_history=action_history, + ) From 5cb6cc210d28871121ae70ff76d3fa0026abfdbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=A9o=20Boisvert?= Date: Tue, 21 Jan 2025 10:52:43 -0500 Subject: [PATCH 04/25] Fix typo --- src/agentlab/analyze/error_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py index 07406a43..5a36db58 100644 --- a/src/agentlab/analyze/error_analysis.py +++ b/src/agentlab/analyze/error_analysis.py @@ -133,7 +133,7 @@ FEW-SHOT CLASSIFICATION EXAMPLES -------------------------------------------------------------------------------- -1) EXAMPLE A (Benchmarl Error - Benchmark Design Error) +1) EXAMPLE A (Benchmark Error - Benchmark Design Error) • Context: The agent correctly finds a cheaper product meeting the user's criteria, but the benchmark expects a more expensive product and marks the solution as wrong. • Classification: ["Benchmark Design Error"] From 9f531cc930f1674dd766368784731153220e2362 Mon Sep 17 00:00:00 2001 From: Megh Thakkar Date: Tue, 21 Jan 2025 15:09:20 -0500 Subject: [PATCH 05/25] Update error_analysis.py --- src/agentlab/analyze/error_analysis.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py index 5a36db58..8b7b0154 100644 --- a/src/agentlab/analyze/error_analysis.py +++ b/src/agentlab/analyze/error_analysis.py @@ -226,7 +226,9 @@ def summarize( """Produces, a summary of the effect of an action.""" past_obs_message = self.obs_formatter(past_obs) current_obs_message = self.obs_formatter(current_obs) - goal = past_obs["goal"] + + goal = past_obs["goal"] # Use goal object from agentlab + # Outsource everything to formatter plan = past_obs["plan"] if self.use_diff: current_obs_message = _diff(past_obs_message, current_obs_message) From 31e5bf55c49f04bc618f30e4f3792b56c3a985fa Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Wed, 22 Jan 2025 12:41:51 -0500 Subject: [PATCH 06/25] added pipeline and tests --- .../agents/generic_agent/agent_configs.py | 2 +- .../analyze/error_analysis/__init__.py | 0 .../analyze/error_analysis/pipeline.py | 74 +++++ .../summarizer.py} | 3 +- tests/analyze/error_analysis/test_pipeline.py | 85 ++++++ .../exp_args.pkl | Bin 0 -> 2288 bytes .../goal_object.pkl.gz | Bin 0 -> 102 bytes .../package_versions.txt | 287 ++++++++++++++++++ .../step_0.pkl.gz | Bin 0 -> 7793 bytes .../step_1.pkl.gz | Bin 0 -> 7916 bytes .../step_2.pkl.gz | Bin 0 -> 7953 bytes .../step_3.pkl.gz | Bin 0 -> 5672 bytes .../summary_info.json | 44 +++ .../exp_args.pkl | Bin 0 -> 2286 bytes .../goal_object.pkl.gz | Bin 0 -> 97 bytes .../package_versions.txt | 287 ++++++++++++++++++ .../step_0.pkl.gz | Bin 0 -> 7728 bytes .../step_1.pkl.gz | Bin 0 -> 7861 bytes .../step_2.pkl.gz | Bin 0 -> 5613 bytes .../summary_info.json | 44 +++ .../exp_args.pkl | Bin 0 -> 2276 bytes .../goal_object.pkl.gz | Bin 0 -> 106 bytes .../package_versions.txt | 287 ++++++++++++++++++ .../step_0.pkl.gz | Bin 0 -> 8014 bytes .../step_1.pkl.gz | Bin 0 -> 4893 bytes .../summary_info.json | 44 +++ .../exp_args.pkl | Bin 0 -> 2276 bytes .../goal_object.pkl.gz | Bin 0 -> 106 bytes .../package_versions.txt | 287 ++++++++++++++++++ .../step_0.pkl.gz | Bin 0 -> 8000 bytes .../step_1.pkl.gz | Bin 0 -> 4879 bytes .../summary_info.json | 44 +++ .../error_report_trial_1_of_3.md | 0 .../error_analysis/result_df_trial_1_of_3.csv | 5 + tests/data/error_analysis/study.pkl.gz | Bin 0 -> 3761 bytes .../summary_df_trial_1_of_3.csv | 2 + 36 files changed, 1493 insertions(+), 2 deletions(-) create mode 100644 src/agentlab/analyze/error_analysis/__init__.py create mode 100644 src/agentlab/analyze/error_analysis/pipeline.py rename src/agentlab/analyze/{error_analysis.py => error_analysis/summarizer.py} (99%) create mode 100644 tests/analyze/error_analysis/test_pipeline.py create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json create mode 100644 tests/data/error_analysis/error_report_trial_1_of_3.md create mode 100644 tests/data/error_analysis/result_df_trial_1_of_3.csv create mode 100644 tests/data/error_analysis/study.pkl.gz create mode 100644 tests/data/error_analysis/summary_df_trial_1_of_3.csv diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py index 86f617da..9089fcaf 100644 --- a/src/agentlab/agents/generic_agent/agent_configs.py +++ b/src/agentlab/agents/generic_agent/agent_configs.py @@ -257,7 +257,7 @@ ) AGENT_4o_MINI = GenericAgentArgs( - chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"], + chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"], flags=FLAGS_GPT_4o, ) diff --git a/src/agentlab/analyze/error_analysis/__init__.py b/src/agentlab/analyze/error_analysis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py new file mode 100644 index 00000000..53021297 --- /dev/null +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -0,0 +1,74 @@ +import json +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Generator + +from bgym import ExpResult + +from agentlab.analyze.inspect_results import yield_all_exp_results + +from .summarizer import ChangeSummarizer, EpisodeSummarizer + + +@dataclass +class Analyzer: + prompt: str + llm = None + + def __call__(self, *args, **kwds): + return "analysis" + + +@dataclass +class ErrorAnalysisPipeline: + exp_dir: Path + filter: str = None + step_summarizer: ChangeSummarizer = None + episode_summarizer: EpisodeSummarizer = None + analyzer: Analyzer = None + + def filter_exp_results(self) -> Generator[ExpResult, None, None]: + # TODO:(thibault) improve filtering + exp_results = yield_all_exp_results(self.exp_dir) + for exp_result in exp_results: + if self.filter is None or self.filter in str(exp_result.exp_dir): + yield exp_result + + def run_analysis(self): + filtered_results = self.filter_exp_results() + + for exp_result in filtered_results: + step_analysis = self.analyze_step(exp_result) + episode_analysis = self.analyze_episode(exp_result, step_analysis) + error_analysis = self.analyze_errors(exp_result, episode_analysis, step_analysis) + self.save_analysis(exp_result, error_analysis) + + def analyze_step(self, exp_result: ExpResult) -> list[str]: + step_summaries = [] # type: list[str] + # this assumes that there is always an extra step at the end of the episode + # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info + # TODO:(thibault) make some checks + for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]): + step_summaries.append( + self.step_summarizer.summarize(step, step.action, next_step, step_summaries) + ) + return step_summaries + + def analyze_episode(self, exp_result: ExpResult, step_analysis: list[str]) -> str: + episode_summary = self.episode_summarizer.summarize(exp_result, step_analysis) + return episode_summary + + def analyze_errors( + self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str] + ) -> str: + error_analysis = self.analyzer(exp_result, episode_analysis, step_analysis) + return error_analysis + + def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True): + """Save the analysis to json""" + analysis_path = exp_result.exp_dir / "error_analysis.json" + if not exists_ok and analysis_path.exists(): + raise FileExistsError(f"{analysis_path} already exists") + with analysis_path.open("w") as f: + json.dump(error_analysis, f) diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis/summarizer.py similarity index 99% rename from src/agentlab/analyze/error_analysis.py rename to src/agentlab/analyze/error_analysis/summarizer.py index 8b7b0154..b3760216 100644 --- a/src/agentlab/analyze/error_analysis.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -1,4 +1,5 @@ from dataclasses import dataclass + from bgym import StepInfo CHANGE_SUMMARIZER_PROMPT = """ @@ -227,7 +228,7 @@ def summarize( past_obs_message = self.obs_formatter(past_obs) current_obs_message = self.obs_formatter(current_obs) - goal = past_obs["goal"] # Use goal object from agentlab + goal = past_obs["goal"] # Use goal object from agentlab # Outsource everything to formatter plan = past_obs["plan"] if self.use_diff: diff --git a/tests/analyze/error_analysis/test_pipeline.py b/tests/analyze/error_analysis/test_pipeline.py new file mode 100644 index 00000000..f9570c2b --- /dev/null +++ b/tests/analyze/error_analysis/test_pipeline.py @@ -0,0 +1,85 @@ +from pathlib import Path + +import pytest +from bgym import ExpResult, StepInfo + +from agentlab.analyze.error_analysis.pipeline import ErrorAnalysisPipeline + +exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis" + + +class MockStepSummarizer: + def summarize( + self, step: StepInfo, action: str, next_step: StepInfo, step_summaries: list[str] + ) -> str: + return f"Agent took action {action} at step {len(step_summaries)}" + + +class MockEpisodeSummarizer: + def summarize(self, exp_result: ExpResult, step_analysis: list[str]) -> str: + return f"Agent did actions {', '.join(step.action for step in exp_result.steps_info if step.action)}" + + +class MockAnalyzer: + def __call__( + self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str] + ) -> str: + return {"error": "analysis", "episode": episode_analysis} + + +@pytest.fixture(scope="module") +def pipeline() -> ErrorAnalysisPipeline: + return ErrorAnalysisPipeline( + exp_dir=exp_dir, + filter=None, + episode_summarizer=MockEpisodeSummarizer(), + step_summarizer=MockStepSummarizer(), + analyzer=MockAnalyzer(), + ) + + +def test_yield_no_filter(pipeline: ErrorAnalysisPipeline): + assert len(list(pipeline.filter_exp_results())) == 4 + + +def test_yield_with_filter(pipeline: ErrorAnalysisPipeline): + pattern = "click-dialog" + pipeline.filter = pattern + assert len(list(pipeline.filter_exp_results())) == 2 + pipeline.filter = None + + +def test_analyze_step(pipeline: ErrorAnalysisPipeline): + exp_result = next(pipeline.filter_exp_results()) + step_analysis = pipeline.analyze_step(exp_result) + + assert len(exp_result.steps_info) == len(step_analysis) + 1 + assert step_analysis[0] == f"Agent took action {exp_result.steps_info[0].action} at step 0" + + +def test_analyze_episode(pipeline: ErrorAnalysisPipeline): + exp_result = next(pipeline.filter_exp_results()) + step_analysis = pipeline.analyze_step(exp_result) + episode_analysis = pipeline.analyze_episode(exp_result, step_analysis) + + for step_info in exp_result.steps_info: + if step_info.action: + assert step_info.action in episode_analysis + + +def test_save_analysis(pipeline: ErrorAnalysisPipeline): + exp_result = next(pipeline.filter_exp_results()) + step_analysis = pipeline.analyze_step(exp_result) + episode_analysis = pipeline.analyze_episode(exp_result, step_analysis) + error_analysis = pipeline.analyze_errors(exp_result, episode_analysis, step_analysis) + + pipeline.save_analysis(exp_result, error_analysis, exists_ok=False) + + assert (exp_result.exp_dir / "error_analysis.json").exists() + + # remove the file + (exp_result.exp_dir / "error_analysis.json").unlink() + + +if __name__ == "__main__": + test_yield_with_filter() diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b2856641cf90decf74c96ba05bcd59b7a81c860a GIT binary patch literal 2288 zcmbVO%WfP+6t(3zW6#8o1m_{~kPsjxLOsk(oY#t2hy-C`qT~Zg)!jAIrRnZ!RaMUe z2?-WRu~fSh3lKlTXYdg$*z*CLTRk2>NLb*7-F55MeV%)+f4KDXpBpRT$GsbCH8nzy z=0#V`DxqZ|N^82gQk8eDFK5+(9vT1I-}%vh?{B*{AK{*H61q>rA?I;7e3&loU?E}Q zc>d(w?$hS>fGoVxYRL=X-L12F(WtV~zKRN2O7C`(j9XTyROE{gt}jE#^P(HBc?Hew zgYRlBo{{DSSIz6jRp*@b-Ga}URhH-YJrF2-yLO;K{?TIu}mCV#trL^_0+)gU0T+ON4tO@4EnP8$pZ?GTr9z^|z zzK-4cFy~b8!7|(M@t*}}l~zS%y}P!k=ksuW5-ctIxu#;+{qUmQbdH$N$i{{&N8W9G z8=DJ{H)I zReFwNcvo>_&uSgJYr#`ush+3GS4nC!S&o;&<0gt!4u^4mQ%Nt~}+xNgAQ-{_qmjLzI(-OtTdbemObMmz$QG@q(dlE!x@*ppTMF zB;#1k;Mg^4Mu?Pr1|P_Hy}Qxi5yRf1?H07p?xjeT zVLt)zk(K2K`}3JTE{V-G2ZBD2>?w literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..482f9b3d0d7d72ae173eff5c188db4215133df8c GIT binary patch literal 102 zcmV-s0Ga9M^RuMT+E*I_=m_9LHlS_AVoG zNiJ`S)JaOBMx-2(mSw2KVs`hFJImRb^&F%&j?koOU?EGrJ_0v{KnMwOa1X|L?v3{`>Fx|IhK6 z)?fJY&ot40-s4%z+_Z_cRw<>(?J}{n5-}V*rJH8i_d1pwQl2+fP5)KD_p|;TKk2pF zP~S#J1HnV*4w8lEPW0PJFLZwQbFIRKtN|2Z+OQsq9}m_njZk zy?y)(1+SeW`reJp+Eenptd``8>00u0a42y5pggA$Gf>hDZPU#5mvz_1uP7KOI3+zep-uMZ3M99lHE$C;g|)uw9SgPKXpTwV|~61rE(>eGcA%Txw@mNmZesFuRCLroN3sO z+^M2t;uPyIX$^=nbvR* zYo7KO7ZKy<<`GrS4DQ}j9BvD;D^JdPass}y9f-X_; z-1V!dZf&&xzS8WqXH}c<(j(9X2Y{}s#+r*-L|gW{iIFq&nz6R1mH?;4>nOWf5ZZj6 z19aFG(Bo2Y-o9!9B%DG&+6*&Km}+;GRST%kQ&ECq^M1jnML7VUHu|@b&qwIrC*a!x zpQqq+f`1b5=~X@fpI-Q+;Df(u_~7pteDHS)KD`tcKj+|kLV1dK9mwl44hL|jbSe^# z5;W3Go9p05&9US7SNyG4+uY!9m>PVDci1kdd2{o+X*!falxLg9CF!+osJe?>yQ~j% zps)|EAk>4(5#^Zj5PXj*kHGga{>9^`>wL#4iQr*`~h-_LS7 z$B5U7tli+8CabC)Qu5>Mr&&3!JStAakMi+&Nxh#j_A@08W&kk?aWV3<5NF|5ydWPh z(BlP8yx`Me+T!PBrrp9#%dC2m+9#kNtfnV|ouqwYHOUapGtfs?dNNYl z&r4I0(m`H26e%6%r6V;xmO$!v0hZ_MON`V#!ZOr28|<=aipi6qreNDxVLC;0Jt^LvB0FM`5CK zam6oOMz<U5`uT~so74vz((^fS|%OcfXnoTntHLDZeUJ^ns`Uvz7=o`>4 zpiekQZ0~5!wJ=IALDra~i9v)~r)gVywkjJwwYAVls~I`qWntaIL(jKaQt(GdGHbC zmz4}ZK6UBrl~-tLz>SxYIh|F^6mqSEe4IO{!(%I+#o`F2ciF_Hbev=g)35E+k1m+!R z*;^$ZTmDgQ@w9>G0+BO4a^^(NK3&Z_An+6KC``R_>ySiWKuf>v*dbeN2D^jgSTme! zR-RZkx0>d?e~8y*uCCg|xek6NFrE9X<6yLLRAirglLtw zgnAibdtAZdP*IF8L4a&}6~Ji#OQ)Y8g&*)Z)>uGkEMhbk z5*qXQyLaj_4^F%`aGRDBmS+3|QC%m-np5Bt-eySfs^1ps4-bNr4>7~OM&05~T<(aJ zynrR$JPmtZ%L6@7RqqflxlFXRg5!fCyQ>^PA#ON{A5GRyl4 zggCYgG_KAsD!`jXpes_f4r=HMn%9AJ2XZ#HSFp$`@R&*s;8Gp^lcF`v!@Q*vqOSO& zORP%2t5uIuFiISJgoC%rSC)W@gG(2#&p*2iY}i)L_l`swC?dirM|fCN<%d%n!;~Ve zALI2OqV=VcX$-hp#&tB^W&q&_7}ss?Lk}4?~;aXgxdii$YIw=oSe_VQKjV z9EFylL026XecTbr7s4b_r!7Ev7%(%Bh#)irXc z6v@1{(fg7l%M^}7%&LwGFfYlDNzGtNe(6phTPVhbD|<@5wIbg+B>`F(B?XhF5)!Av zwp1gpCDVQKc<(6yTs5sEwxP`op3gZZrMX6o}z%PP7N1!b_RWvmti9&nkTpA#V`~>N!83K`I?6$WR#>R2Va%DCE%oV) zJd_%gr{sS4scMSMj2m=&uB_Q+o+K@KppF;lgSJkC+!6CmFV!O!Awep-spf0dh|;CW zgrH2(AXc2eHv0nlH4%dj%+BAy-53agt(%rw#ySx-4ge)^m~Rz9_uoc4 z1E5shutqrkwyWpP&t1h5;pyAv7oU9&VoHbTCxn+=VK7+M_gXNzwhRdu--;V-P;mY_ zmIkgoExb1%LlAFCb4Ts9U!9%2cy&3CW$6HVxC))d8qt;Z8_O3a)~;i>)1v|OSQUC0 zYXtA@&fIgA>t$poJrO`pR-s3rW>|36;;!MTV60_WuygSIbIZ%u79qeg^@hIZbQL4; zQ5!eb@+t%{HOKOdmSMsBL1rc(F9`w}CVA>@$0DS<7bfb-ME^hN261V4f%0b2#3vC*=wjqx#yBxugm8LZqxZbB0@8c`s7 z%U0;lEyD|e6+2p>K&rwv3I-{dTPBtbiDBNq>u$*t?bhBp?RwpYid(R3Ez%demfLX1 zA&N)g6&Pi!{b>KMol%YSjv2E z6kF6ZgiFx8H9bF1mmiB?gi_Tz%6EchuPyM+Af=_cd)h#nPg(jA(4qv3|AB3Jhq+o` zC#r24d<&Q{e#kYbauVAK#2>KV&e;!mR*U}`u4gvFhOE^v4H${ykHY}<6u-&!I{3aX zH@o7~zzK-pH55TH0VhgpwI8pEe%#x-@phW4X}Lz)zv98Z-J{~82{p~3X<|(CU@qi7A>^@;kiWWxkn0H0>%0n)`#dCg z>y3!J)R4HpP!)Fo*}>Nq?M9L$N&!OhmI7m)KhHZq9F7QYHK1V!VRRhGt{doaXCplh za`XQ)C99$P2-UPW*si1=^upam(#c1tnryaf223Bcc)Hl;n2`MOhT7;VjUP%f_mrZN z8oi(sL!LUc3GE1lJhtrDuxX$)mvI5N0Yxdie?8Hg5l@sJVsmHiVfuxK8IOo@$p3%n zVcv{;m_gC`$w=qZ`#Q9P4h^$a7Oq1>T#R6xVKEMWAarO%bUq#F{K&oz?Vv-L_@u{# z4vh*iM&0@72SSG?MCV5%ogdrRp&fK+oUPSz9hwwk9E*r?`~#sw)1vbek(CB5 zG|5)=xem<;F-B8@$q$4MofSFtP`LBy#&c?E$x$6G*QgSGX)m*{cRl-wN#U)y6lTMH zub&8qXPecnt_FI1xS<}iU9kr@+iXCbj767COuH#X++Zxj{LMQS-s%XhGv|YbyH-~< z+wr|_-s~dZ)j}9HC*c>kibhno?v$6!QQseP*=n17Lpg@q0j%3#J7mjdrO*h+a08JR zzjr`b2%p@2|3G!DR57Jy)8w$qr^G}k5Udh@Hqr1t@?fnzuJ?O1^r{4E zZ9OK3;c)LqABe4s3sc_Pwl27QSGGRNH8dc$Z|i%s^?H&FY<(oE<72|sMy`LtM$i|nfnKj7uKkdE)sq-novtvS~FGcK{c$AEQ9%@C&Ina~xm*v8MZwmhkN z@0Wk+%)jv`ShZ%+wj=jydY!r8`2qL5?#P{8_gs2OC(^yr*3@)KJe{bhz}aGPp$$d8 zh<{yrIb%bBuQdwzdZU0}X%z5{MgiY!6!5E!0>0HK;MW=j{CcB+-wXo^RiJ2ywj^X^?4mj-g^eE1^w#5;*u}5+KU5z!-?O%8|CybcHlJf+=wWrf3*XrSL$&3*V}U%y@fc+>34SEH1cqYD5rn31E(u> zIQ`!CoJMbBj&k~UJ8&9#0!Eb6@9)5Axelj4*g>X|TdAX*{=*KOMjo>f<@6tS;Pi8K zIQ^&XIgQ@V9p&_&ci{BZIynEj15V_b9}!Mn?;(CyMiBY3;^KI`r`~ol{01U+@6fSO zQHEHSY4M6~$Pmj&1|P8K7d}>a3ZL9pI1zqsU(#z?4Lz-|(6{BY`u_ImZ=Ipf>TAbm z^f{1J+72+!qdn)|VmG%b6)?5kcN9?ydip~Cw@P4j|}PXOi%S@>K+;IewyhKw^H{E#7K?ANo{lcbS-s8 zjomWN=#BQswv~%Q?4U}OeMCsW1Au(bWQt1OuOX$Te$ac;oiT6d;b>h_vCk_<`8)0B{{A=L`05Fa*$}sT8@EU|L+!%wSI$aA0v6uQS z1`FXUn2Q5QEwnt%!T(Xj%lyc0nRnN=<$HLj4p!9%gaGTQ)-=1(ibGJU4$0jez?);$Sg{GsGH-dMI%KwcsC5OVjkRRHr51llY?(=oIB^Elo|KCmqO% zx)TnJT5QTCb>12pUy*~WVdX@kugZU)U{vY^xQUe@8psLsEpA}XDO%H{e<&I%bEel+1e7M2JN5YMB!`@Tl0v8(}Yq0V0aO2!C_tdz+#l|Na zYkYVx+_<~8r*;7inj8jUbK6NUJ+Dg)Ci`W}HDLZupL%xlMk~&?qf_eI8TA$u>R>)C z@`{EshsN@Tz!V$G85+$Pc9k#O|7=0z3Srl|t3=@brvafPU`wuscf-|viSME4z9m;< zyW#4-1nyAS-jb`yU2t`OA~K7|A%?2q>?apYgHRd@q`Y1JAfqnIR5CF%k?6$$yoQUG8onUL`{)*LlA$oZ*LiM@ zJ|#T-E}1r*w66O%o12@O=4FGx?9DM{HtUm}0+Fjvo1`y#=|NlVp%c~~cdbXx5-251 zP%!(YC_PLHJ>2o8l^wp{NTvM&{#P}oQ_v^dk;iT2cc{cuxg1(p5$=fUd~oP6lKd?v(!?bTvFFT zhAwCZ@?h$^$$G(fbExWu2U(I%oRBX;t5XuYgy9 zFNK{+kaJS+W|{6zYVjZJb7 z#U>LQs-~m#0o*Xna&lAEoJ`4{l2>&T9OjHXG&nfeE0s(bjZDehAj#Xju1~J;BT>z! zo3-_QeAK7ps~QSdzm?F*Dh__4q~-HE-TS6;O5iRss}w}@S^XzN*W5JjH%+&E|0uGT zMv5s3X#?~yFmAL`2K95n}G;&~UbGKEPXMxr8Rge*!_A~L!RHYGgedd=@asjSt9)1wI#4uXlD(t3DLE6HW$P~i@fM+Z1z<hMKkCrffXIdQlC*!an9=kT{>eVNCA8ZuDO6$Kn(+rMS*bg3}OV?tddPB zS5T1&=o94;EgEtoN7LC<6enxgDC8Pp0^qYKcv%Gi&@KQVVPw#bS4RuWj~iCVWPH${y2~r_N_b#X|d889TAsnC#MpMo_7Xo#&N zxVZiUrUZR}7?&dvNO1x^?pN{>2>3khh2x@oTS$)E&LsSHX6?O+GD9e5r_kPd|B@aPe~ z7}<4RwMordWH()lyqSja6w>^_FQ0ECU)-#yN*9D{XX#tI9RBt)BLgaLRWiC>qVE^y z=YDVpir1ZW!L{Uhxzfbnm71Zqs@eX6-{u`MT_;qeEPJgv$m{(}XNI0Xd-K*zIOy-FQm_q?CTlTT4O9r?3})y!RuNpJN+YO zKfb8T_l}m-+rWgQ>VlOkP46@}vGNGlto$&R8)P%+OVe4e3+)%LO9jh@4t)I!`0tmz z!#JBVjg0M<%ckY{cfBKd@YZI9Ub@bhc_hS4KN#vr|KNB(K5xc$v&G+UV>fbj8?3bq zu2lOcbbw!?1j~PC36)9$ywOYQ=T!3%_%~J1z=C z7n3O8tF{o{nO}Iex%h|B2c^0AN0B>}{um3Jihlw>pbN!6#na{Dx9J@LeE7Xbf`;g( zS+IGt@Rw~~HwLZ@zlt9u{-f{y%V$vU4l&qj#lO$}B38IwCpcBe2JmjX0u163;4fp! z;DFnJv#I|D$IJMZv4Q{q D2|yQR literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..52c5209d9359c3ce77071a75d525ddca3a9945e0 GIT binary patch literal 7916 zcmVrhrjTKHL;Rio*4e;TnN~ zB1X|RO$!A5X`3bu(i92O21U~VML&$b(4;-*K4xcTm*kR`D?6rE%bmIRoO91T_q^`C z^Z0!87s}5!(0|^;8Oz+XiM3WPCdthbv9uyF96PC-X36*3mK{=>GgeLiWxwZF{9As) zYqp`tR~o$oX2zx^hq8IqNf(K2t82veZ}`{MHQP}Q2aX*e4#&lkKRfl^ zAI-dd{FS`d$`O6<>C4(<@|>&|<+ABo@>6gqaC@OVs~R%!w`y8Nd6Q&hN43{&8BP&A zDL(^DP?$M6PxKNN%A1;#Kj*u^PQJrycA=Yo{t#>Wz7y}{4||U-6P;uodHK`p7kXuP zO;(MZoYl4Ly1eXWikg%3UE(#bnX2yRk9!?>kTx>~!0{KnBW8&h=@Rxj?OHl^-Mg}? z>13k6zdvslNx#!y)C_IY%=DFX*T%0X=+8SvJvgCF_GR-VyPh#`5<7{tzUmzdwcu!u zPQ~keY`DczIhi#rk}SHqqp6mqmVK`?ZIP^L z*pB5wXI}Qb77FNltwxT4yzDCtFZdT5GyaSIlAll-mBttR{G(n=&MB7&qBLpa{+-*b z_JG%T#$Q-ixbna7kN!~_96xjCVp{|M=fkOY00iadAM~W0Srn2)b!pF;S(mCF^biX= zM8Q)_SAh|)h4vp6uQj9EgqI$HE;s;mRW;UJ)FRrl*GY`5nbVB51+@q`O^}9fMKM7j$xbg%ja)PS=Nu^im1Lf~m z29!Zi{vl;p83E-VQ^u83Ae__68Ia6b<(zUJM)ZPm5%gqQnE?)Gi6^O!V`&-JVe#jN ze}nQ^;Z7p#T!Qm9eqJfLKPVBHVV)VxqaiG;eFXZ!YPurW3EC%ClL+BF0exhpCnKeO zyfhgp9pI&dk~xGTCXI6;T$Ca!osN{w^3uzZ(kr}FiIgtz(#4RJ z@q8&be^_}6;GU-C&nVZR?pdg7Q+|Oe83{A@i!ma<6eE%e5y>Kv2=;0WJ0HU?gs|7E zu#FgYDTZx@u7Ydip?CKyhv@MmSUn+!s%OtD9Qw2(Fq)Kw z+4E2SG)m-3hzL|qe=b7g)3?~^LWqb8#cxN6+z1hY>X~;UM3yGlsTv~U0FkyRky3~V zR8PMY<>WRybwWg5oIQW>Xq3qFAtF%y#Q6x3X%78jh{(&K<1fvgf93}atKiRm{^3~*9_69Sf4KSJ{V zz~fkb0j0i(QC~=?&*$&nsmDAx@mj!bT25G+@ef3GofvCQo=<%nA;+tHo2WlL2vR=8 z411Bf#cR0m5h-~VOFDTP_KcPTdZ4P_AzpHsXlr@L2Ss*OuhoW`lCehB{I0H6ZM7`r zb#B6}-`uqO22&#`-;;;OXT}Bxr!S4b-=PZw17nxq_u|Od#njly%!RRO-)lc17Jq!N znGtWkyfS~)4=-1yH#A$zXu9T<)0PUEsqZxo`rd(zmV=`K-+M3w3qneptLdU@t=qnL zziMe}A1+Yl2*^>Rd9Cj?_})=k6De)JJTo=z7rxNg(BQS4pn=$HlzgvcWp3r_3>>F; zW!s6YS!7Hfl3CuDC&aO3pmB9>K>^+@0$q_Rbx=ds(!4gL+n=>h2(2#`O{3q{(ypWFHUk*r04J*%g#HoBP<39!e;C?0N9)?D zUljTjhi;N^6qZ+>#ZhP)9N@(du^Es2oVn-xz-jGFHQH(IuQGR01e$qs-+S!a*1wQ*4;y4=y6a(h1cDa#1)l9H)>~h zuiPz|3M#Z5EABq2@un!5>jO97e^N7mAGBEcBKqFeS#`lH*EvufetRw&1F$T*WxNl^)dkd zEckN-+M-hhQ!^4cRJ1t|A;KqF(2*x6C*=X)R1oPlOeYTt)P0(EJiHNPd)`PrA_n`!b>(k5G?R} zO&CL)2FHD`m5Ww1i<@pxbzuqX1E-!6ejLa^IB!aEm+iG)oj!Hx>PimFQUUZ(1v-T_ zqEoF;uUs5oTS5V(Mgr*33iJ@xh+echGf$P5O2|%XJb*q`fgXmMA;DRbyN0KNv8Exx z&cO>$t*k6AK&WMk4pPYKD#qfYHg2}%R0w5imgN;qLxT5%%wRxr5`;8N_tcw?MM!0T zOzcVVz14#TTZb>aM48vt3{a4qe}`|n((PB4S9HR>oQB;cx*ID7r2}_mo*|hj9C_E1 zvUv*5HArZ8+o*g=27X+aMqg6!Lh=A>@kSE)8bu`N1@r((h zrW%XZQ!o`<)eJN{gt+ax0d_0J7BzL@5;Sy8&&|<=$if$)RP~PX-J$8ldA>oUG*xy~ z>qz)1OaBhED8a&iU|Zf{uGULLwM~O>3p2(Kxdv5EVmpEO1NK`v`vK2t;XlLm%tqLd zH5;Y@BT@KS7@(HoH@IFK-wWnuS9k(A0TH~4A}A!_L}{+{<5kg*J6kv2O0zaCTTlDf zT-x*PHIRCfLJQ?M`3DyLYqAI>gCXblJqTlTyU3 z#uCimykp_5j^LtmK4`dWbyc$+-|OVfF7RC~gkf_MeyOWyM0IOUdD$HGgE5zVH`^^0 z++goCg4+h&w!!X#{S3U$AB!P#I=b%*NRaJ8UgUF^7u8 zomA@H`izJL?a^cKzQ_DUwFLUZ{T{j({vy~Fe_y+dfXkX*qwu{0bt&c`Q_KpnpDXWB zpM_U9yB8C$CCR|bQ&AmHiL@x184TZBE062_9u2)BfofZi$zdqm`;mKM>*AW0_qVMJ zF5i`{k8lkQi0#|@9&NprBm-L?j_UZRu=U}Ht&iSgTaW4eUTr;)K((#Mu=U9HPip)g+j>mz_iF2b1gdR4 zCWq7E-k-V`wvN}6zOT(38GKh3KFRcw(%CoeJsNjRh=E;?NA>xvSjmfQ$qn7(mAsIS z@6n`bzpJe|*86iVvAu_YbxTbXrsC<)#k1JPZ?d*Lsd59EKYQ-q_ye(;Gick98$P}E zZ150*J6>nxUa~tby&4qhF19o@%!-E{6`q8%g~Cz`ihLRWy7WrOx&U9T7x1-u0bj2d z@Qr!_->et#EA;}tRWINl)C>64dIA413@B8A0!9=eiu5NtA<;X8qc~r$11A;3`STrc z*gfu1oNsK4BkqNd;{3%9IFaW9=>_H1hC? zD5u}qfzvBBIQ{PSoJQ|Mj&k~UJ8&9#!bOzRzu$q=8#Orn{`QWFgsPdjk>d<{3|{qYW*zFY(6 zUw6QXJUAr6sq5XxcNl`mmtoh{kNbFIzU`#Ny+<81pe4Eo$Gdxc?^viVO)Sf__>D=O zkSdmt9zM9y&wnES7(Nj&ed7|c6)gYeBXJ``0N>a2$u6ff9g7#BbiluAD zIzt-jlJWNH$*%a_)m<{=#gko?TeG`lyn|}8OWc^<)gL1@94EES&Dhn{88vpBIioky zCEHdu##zB;{qARwRY;>Tr0@;f+hdM}G2^##Z;v`2YqXA=ytgNKYDhNGyNDn@gQU7%G}YVVi8t8 zgWAaIAU5JJuIch|P%duXr*R>d8J0Bfn%8Ft?mhoGvZ+BRg& zvBxIV=n4xV0rxe;!NLz`h&2>-R^k9^!Bw7>&d}#wot1n~;!k3rvz0$?YG?>OI6>~N zIrqQ-#-?FX`}M)G8**@Qt=!$+Tj9S~FeP@Rnrha2YxzNf|oE;c?|XX9hx#<_v+ zsd0geji0Ko@u7imP;ecl|sCy34}-=!rpXOxxwAf3_^LqmRt?(hO4_0 z@ko4h`UqZJLzD}kKZ4>%#%c>5(2IfxO@_Xr&c1|`3Oa~p4 z70XT)zPr{XX9ye;CQXZQ`sxI!Rlk!SURA`t#$6PPVAP3fX5*psrbe&JBVJvGm z9PouNugSN04^SC^MkIB;h+nos1OlAaY{(^Slr5Kv*A{+@PvQ4l@L4;{^i3)A{4G5c z5?`ltki9_#AM3?-as6wD8XFp(HC-7zG)PLO4a<&Iru{c%+tfFpB6xX^jPG&D1wCO@ zWOJ3C;wx-W+!DUEN7ndzYJgWeDPtS>rXRW)4cbB9^Ao|6K?!t_KN~R{X5g~u8FGT4 zNH;0Zs9C@SZNc{iNj4oH$~$cfWXNQmAxOj#xLn&ParT}efU!uHXtaX{*`%B#eV56w z^Rio1jkpN&#|m$^h-(ZP-+(U;C0Q;vzIa9i;t+_vD5Y@ zkQH*FgHW;v7^}o2PqMgoFK5aiijrlnS!%IYE~@JwLl-myc`$X|WW8X#;bWYlgDgoW zPRO&+>V(8DWB3|Kgo=g!$sey#c@Ti-fEQP&y`?dH!Xa>0A%nv!saw3n16w9mB9%i8R~m1HVcr# z{wK&-sYA3(cP&r-)n;A;--Zp_C6YyFmg$@_3 zdE%&8Abywg8j!q!)UV&@k=98$W2#mzjRDV2z$xW3RDo)lz=<8NScSP$s6RBr;8lwl zbye90-XJ`33&wDqL9;S9AU()oVqh9sFt(YSD$LTr>bY|m9`6?7nXE`7==$V!!Fvyn z_R^>1WwS{5%XEuiuh7mKUQD8E>ue^8g%{)Uq8U zb1~=&PEuhAaEV~~=~?Pt==~HLYXAZRE-;j2R65+HH;9D<=EJk!P*%p1y>cZh+sqRK zAkbOjWm&*2mEN2P7En$F50x1cA{+F=GD~RaOcE%|J}yhbL~RTGaUj23FZW6|3z zSYt~Y5LiUcBsO{*yw;6j(Y3Q+4GfgT5&l<|a%Dp; z!emg`cCT3cb*!@R4%FfTCyWlIOC7?s2ed-nG}rn3QG%#c0wsfXu!GwUJP#vCCz55- z2Px5|%#I7HO{!KhJLyX1wN4n%iXHsiE~{@NIozA5$PL71XXtCd9R7MVBMr)ImD9Rj zq^~mP=b?R(&WsCABge~?2L3+QG`)?@_UB(~@eY};6Dm?>z2+=r?EdVz!RFx$PyFPk zu6GFUPD0J0?}F!-bGPH3Z@55W_es$`lv}=c1fnV=#*oJAY1LX|EKBU1zV*TDSSvYw z!)7183C#D77S)@;grn+$l`9SJG&ZpE2-mFqFqRu+Gw4ghd9MR46E9T-n}rU1^%ea0 zTi#)uQkh2Dc1tDGa{SxgksSD8vrI2*XUsehVx|vFaJX+^tPh_>W4oEcU$(HDygCim zS{hfdeWe|TfED12=#_`OhsAlVYv*Bz?i2Ol3+R<*@1r%!`Ni}2UV2e0E_I*(vjg67 zQ5d?YMEOCbh49Y&{8D4#ub~f0W8s^TyO;h33mXdGf*;U@!ng5sw(z&~ZUH|0-UC5H zbdN09)S3T&i`R)EDa|kJ2Z{gWiPx##9b&MdisIzwp~Cgr!CgX}$2;-zFo?^5zk^s5aJSfcfRGPk9}zBjLt==g7~*eqjjsJO5I4GhREUzQP0&-l3(LXQq~> z)3a9>FHBv|I9W zl3YGTiIOOhqNtHbhoo(}in^WMz2wevc4j>fQjDQCsq0z-rhr>dKGGIJe;of9t`Qh* z;}k*CqCwlArg^kU(71K4xcTm*kR`BRi&6%bmIRoO91T_q^`C z^Y~=*r@!`G1O4Ycn6=Dxn^-Fwr4+eQCYDwrhGVC6(=7X5+oD6tv&ORNzvOp+%D?F+ zy=EJVe5J|TXJ&0$av)bwolJ??wz@)W|GJ-~aF(eP-)mQGTeBV2aNyVh;&5Cl`!kc@ z`O);7M?X{WS~;TcJ$X@kM4pw^lDuKMmiz=93fvwjtC{4JYSNnR&e*-`CPTZU5v zPsvXK6BK4vE)czph4Q-Q6wdfAuv6&pnqBCoUpT;;zUSCmg@fKBi$o_mM_&Bc>bV}- zU6EBIFXwbEw<<5X*^=g@e3y95E2gUZg`-{v9%RgH5peuD?~qw0My8Ct&bXG2UH2|6 zYdRV4>+36+CDP~gl{7&63i=98Ne@nFlfAhD$*pG18^lgwt*?4VLM=F& zqf_yEA2?4rJRg9|OCSpbS96ZGMkqo?GxFpH02^+pyphV87D<&{-O*IbQa60BGh>mQ zY1oeCLT6s`y%q}Sd#y&EfxP4^4bS`M8?*ik{(_%W8kNT9{ldduOWxTi6GUm!#{Ap2 zSnWQq@svL|H+Sj(;2-^?G&p|t_W8C3{?CU~ZyyNCFWm1*d9x%WiR#jxH*+pkJ?J48 zbcli{7A^xLUJLENuQYnCS=A=I^bmBx0iesOvErf@(U!eVV&u%cW~|JqC4$P@R(7)> zwAnldXtOs!k4wRM>#_xqa0>lsG0Z$+s@+jmEucP2MG1<{`vspSWgmQ6=-+xiAEJLB zf^QRi9)Zs>{z<^6Tlpw_y5W<85B{d%gTEv2!QXNCbW>RTOv3k=@(A(Tkk=&~4&Y8{ zS0o%IXrz%g*T#<;W5@BY_*<*CvCiKxHTV$kpj}Y&=K6wZI+Q__XN$%q>9wq>x{F-9 ztPgacun)~3)cwjK<%n_*e2*yi!S@0F#p6e7d`BtdqxA0)t}@hjEjQ_K#W3Mi2PiMv+(hF z0f`rs;stKJ;AWV%__?KA0A6kr??6_~tr8tbxyi|#l&hgUu zkd*O!Avk|fc>>^`q~%X3SE24{sB2Sxo+=p$GxrNIBEJ|Tk_{2bA(067atyl=!!Cxf zSF5m%7^I=agSno@a!Zfwn8Z7I1ZnPo^n%D2$Tfo_=mV&CqHgWT&hzOO6!>nR94|v+L2B})4xB3R6TQ6;m{`)fzhPQ z&76JwCs87oLPVf?>Jt$nAG^s;=R!nOD1I|a{H)oAP;jQp98Ef zv7$r4aKVSbTmD#7!E$XO|J=0?tLR&o(Y28eeUrenEPNUYDJrQdMu zkS#WX-9f^v8P2?wCzj2vrg8Tl;&qtI%QkTqz|RDxbBA>tj5e;^VUcqs#t0eH-fAcn zB4=DZtD&3_t@5T&FF?GHD_9&Vit$+`Lp2CpM2DS&`L|wEZku_EO$65v2o8^h;Nom- z&)~Zm>YDPP5FAX!A#Usmt{#nRstC?yna4R?j$l(~7D+4d6BBWqBH;7@rv@y=euxx) z$m3Xj0j0i(QC~=?&*$&lsmDAx@mj!bT25G+@ef3Gofs=lflq=PA;`HQ{U;K1qLxb0Hj0R$_QS!Z(rP-y+({P;T zm2JmzW{ELuQyw6pRuFAL8K6@})&!V*ld#h1tiKfDPNq`QD*O14Tp_ z<_PzSs{CMTZIDu=^&`Cg1GK(WGL1f0%eaoF+YDfo101hr5c)?fL)Cc^|6yoj9Ib1+ zeo^QX9J)!uQCM7h8b_gNpr02%z$QKNv+_Dx!nmC7FW)#R-;@w&fB{|oN^?Vlw5D2e z&LXNqLOMGkFS|w#hd!Ct*1DgSWSPQoh$+=k0p?lRF{v4h%g^5IVT;GOm}QU4*RIPq zPfCCmMoGb>sf5I-ur1ZdYsqwvJlcH{0GCZGiEU^T{U^0k)o3TR2-{H$}-@>%R{FQ{XGNS@+rALH7rE46&c5JIh7bxW#m$2*DfR(v%E_;CH%r4i#V=jL@-ZE zP=n$H**|WXFhMPab%!y9_0VM${$R?c6X#^&7*(_cZ3E?j4lbGCvZ$eJaTTR{82~>6 z{v3fe=~U6wj3f>fZ4N|;@F^B_Uab-lH%T+s$Sj1IU1c5)>F%tH`XP;>b-u8-$%$2Se8EOpdK!#01aOmFp-6 z5n5rPs)jKqCN|ZlGx9*HUmllx;isZ0GBa+_?dh^+n|YG7?%eEXGd#In4i6XxYK>@LuKSurRbxIgn0$xh|q0w=dzg6pT232ZuAMRbh_*uD&TvQ1YgHqQ?X^uK(j-LTdo^mw^D3TQx`5lL)Y~DEM15!{x+1V-eJB^G&O&PZxty`mA%zE z5`N0ke*i5?u=t}$>QsH z@_GF8Mn&a5DkL}@lHlum5)=}AEhIs@FhLg{sDGn&N_CHlkEYf%n`S98=D}RZJwnJM zAt8T$6Cu|Spx1sGBKJ8+@K);)cd0IMf2Jbt0J4p*GTQYdNt6PF(ZShs1QG?#EGw+=-qymvj(>k&_s z9$<54?qPa`hZ&8Calro>^f0f-Jxsso{E0~C(|bC!jSda6RTi#816+(?oMABzzAtoW zNOV3O>HN^14sD}DnE0eegboc0F-G0_@cTlC#zg0bBb^`F)1hs2Xq2thaveG$#5fWW zhe=+Ftas?T+3LWnV%5}bHn=+Ie_QxAkYpRPZrmKGh=(Q@@F z(Pwuv`+D26pO_Thh)ZEA-1pjvaCo*^+4QQT#|P``G20cpd$Y|t#K~B6*~GMyQpBys zGR)t+BjK%%;JR}@Xt-;2S+gDA>*USO@m(#1VRI6GsjFy2b!$#}*&Ow~F_*2h8SH;X zaNB@&8*C@olfY{g!mUc70gj;t5-omjpD++UxBJ1q%226dM$Kl)VU16Tc~B&Fq|v&D*yV}f=!MA1M6HG5Dou4V=-uYFKcV$W+lYih6#-gThS}f#6cH{={@EhzFu{Oi&SA?pHsxn96m>IHnYUclGt z1$@0;z|YqU_(r{ee^4*r7wQH4qcEUQ1qv8ZgecONw?m>g2uE@Lqz;^P4Cha`!C|+! zM{)jaOB``4d=%#|w!w)!7J{PQ5~yF>5>?z`AHuoi7QeX_PW%?&D5u}rhSSKiC!(DG z%{H7~s=?`Zw&pZ?8*-G>zukt@$OA5-oc`T5oL;ZN>G!tgG8l3*aHZqOejveLnN84~3c~nP~(|_EC)8}e%`cGSP8oebv%IS}{;q;{%IRCm0 zPUN{E5l&t29-herk&oUAjj)m2DD^ius_l5dq+Zb8Dd$c#qUe%ge&7~q#bxc|^u`Y^y&dvXV3{k!#*_gP?sBmSX9evC5DJ zyJWn(dZH_Sb9I*tDfL8G<<9Ib8E>GP=o0s3clE_c4aG@qaW8f?bw-WdWzOggcgePu zi*Z)8S-<-jWEIj#3@LoS_STrAVa)iQ+*_lL#Tu>S9`CIQp4gON{O<1>bWo;TNBWsc zY;yZIMMUVT_6L}D$$R&M@7)i+cR%>v{owD_`@tiw@l+p1Pdvz)`3>&q(y<6DpF(Y9 zbr2iz7ng#HS=TJGJj%6%n$Z&r~5-Y?T5RvWfukqYG747LkO^zmYrgEaB&E#WvXpM@*I1B zLXEDlXcBN=LozH9afVn!QNJY)pccI5Y3USw^3`d{_ay#I20C&1Qd2`i=n)EXqUJON zV;P%~N$uALMz71kWwvr6(Np2SM=&b21Kim4AV$aubTc(UdxAktU@nsY)XKUo$AebW z{q?n)9tgL}UBq?`(_AdxvcSd02kUHnDBL(V@m)18aIx``IvXDiH_lCTSB(o?Z2Uxh zjSu#R8+TVWX_sr!He(Lx(DWb(otsdCse4VLzSmK?oOdM^g!rNQ9l~j0Cy z|E64x?1Za364gU7eN(PZ?0~B~9EwZj!c|X{i z@gz%K&+yVYPjZM=(hS___HTRb;PQ+dKiOZkk~xSgnUcDK%bps(y2ktPChxeRFuvD5 zxk8^+9)8PB3(j^|eeds=3Pi3v-;%y=rV92ZN)WrnYB|JHRatT+a)y$pS zraUXJYq~C%Ei*@8q6#$Gv>U0yu~{nX#38f=e0xsHFSL30Q7M2_*0gOd*M57%v$U5aD<-1IVofqAbYQ#lYI8uDGMO+mj9^=JsiYWdu zR9xK$@;oNbhWaiokTngcQ-QH3;LG&#gSgm0VRhMr5|kZkn@alw{4Z-zOFKbqM=F~( zQ#Pusyw?Dw=BPGlc7eUD315sv1&^J!*MY2%3mt^A1$vj4nB++gxAf&K*aWKTilvr% z^O zMy}J;0Hcwifj0oJT`s7-hFijCCqdKVT`98aaG$-R83sB_@N2qe@Qj6E6ysaL;6^DZ zYb+&C>g(!;jo}V^j^5WI%RmthIEIZV)XS@Z^JOfGKx-QMMl9NjBt_6j$+Ij1b@3V) zHgyHNXP6LXEQ0Z~he8C5(z<{W&jV=Sl%%~im%TNY_wJetHTMgQuF&^!2|W*9iGkig zzNp!bX>Fh(3(LZMB}ip4LuT0uokH|W(kTXVTH-UIfw4q4_!m(cfDTlH;^iL^Pd(5LY_-AT<{iA`=|bb7;Bnk8rZv(e8sdy_Fj^33Q*o~3P4Hh zXz|PtomW{`H`XnDRRN^^dBSEwWtg^s7nZ?YAsOnWEH-tJ!k#KfbE)gJOn0R~JsX>m z(P>zjrh)qflMn!*yv~zO12e^=k>8ETc*# zN?Jaz(+zkkrvz?gvr0iUpVi+Py5_oZr)j$7J4cbd4kMW-x}#<)RM~k&@=P|M7~^0S z)~?!uS&Nyc@fU_2=0eUjP!-U~R6t`=rzG%YNx)R-aNfE?92E=1?_xm%lGl;?wd>u| zD%r@Is+G@RXta}XO8E>`pjswzHpnZMVZs&a56wk*)jY;$RkndQTsL6mhJibaW@WBH zB9X@w!ZdPVY|}SXTqqb9&S7XQA;dFLkw(zi07JfybgiI9~@bf^3$_x|A!Z$OQC>@`x45p2|6~@-0M(1Dwj^By*Ny=KmvThq z8Vr1hY_N_2f(g9_MpReUqa>PMLpGZV4GH!sSjK>c*wTfIOHN=)&z>rGSB^An3YL5BKY|O#Y=5W=i#+W0^td6icwMXO=MhItgyqV5{`*dQvMe08P!)9J1riJ5K8 zhERpTBJw7&(c9p4a}0|vxP_LGXC2{xRVlw=s3n*-3ft}x>&1?>A-n^%xWEacL+Mfn zalrzuP&dt0KGT#SDwRRWpdIYsmIKej2-1mUne>@Vbp5mAoNAM*<$%R7jpWf~dVEtgHp@o#yD^5BWh z4SJD0W9GgPGreGhL%sc@z4%ZZ(1_w+wy;~qIt|uZ1{c14r5%TW72x~rl?S{B#d)nO z@nMMW5%uBA@s(!pgEh+erTF-|dr>PcoWJtbecn+~7`iS+`F^E^@b3PVrN-i4K_8UH z;@2bhLVW`Z8;akAAJB#3xA62r@vrH<2YmRw`+|n(=31~9bmd=KyiPQo48KhPB>v++ z|M4eL?+!57O2tned;ZH%;d8BfzUzM?7_ H+k*fAPzrYd literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..d7f75bd67b3cd8e24a871270d0321296eeb5a08d GIT binary patch literal 5672 zcmV+@7T4(?iwFqr7LjKH|8sO@a9=YnaBFM;?LAv;97l3|KbDj%$#U%2=ku64dzU9_ zxeq=?QW7ar5;YVllawXL$82VIo7`FL&g{)Y5^X5Bi{qFAI)HJW`#2;p^5ZiC7=~dO z2yz!V34#DYz79bixFCoxmrIcQN%CPNL6BGVYj$RKmpi28lO0o|-I?jGs;;W8>ZYDztVv4_m(-`FMM)`3+q!K?*Wgm%4ncWN(Int+Q#Z=eZIqQP#k^@sa7)1B z(sf{h!YoP!RH~3d>9%SW&N?=*Q|QATHi+pI4zZ>mKJjtkFn)Rsl~B%-)}FgLHze6x zlA`6MTuIH{l-BHQS+(Mhjc~`7u9Td@G2BNkGJ3WMILtNoUKW}p=F*$jb2c?SkdPGg ziGpjZOTY;3r1Cp*EAGxJCgP?|m4 zG0y>d%x$pavU}gXX#gbLf*hTio=42I`zne7)EB8L!LYepaJ0$oaCFk|FvsKc`$_n; z!SOU4C-@PCV^IDo9D{Jg;UMP}9OOI(2RWaHW01mNY5c^%FhP0#G}E1b?z zgnJ2V&vQ;=Rnztu`7w5Cm5<4fiW_l~zY;8|cN$}-)8b+V5d9Dre5Wfx7QPlNK*54? zu)q!$-1X8HrycnM@Ny5~Ls=zv6KVNnPzN{(>8Cd6r4G3p?21@jpWH7Wk`F__9RbZ9 zgT6a1KLY*rDD>OUgYiEhe+i8L%kopQ1g1X##`_hpl4s;+!H{Fr{7=e5@+mO>VL2fu z!T8hih&&3$KQ2$mlc1a_c^WixMm{T_gC3oe&x4)J%L~BaMT8@YWf^MLwpjexa5gB9 zRqi5)-ACZQhu>FAj=LoS)5|l1dCG%DYM%r-Sj~VBJ4R(?>X7r7IpSllxWo z{t5XSz&%gPugf=}?ggmpk$-_28S*msivc3P6d;oI5XliDKJ3i^b|HXW^kCnt!PWxU zl>oNk!8U8KZw0XL1h8N7V87f*{9pFZS^*ny;G@>78|s#-1LmE!dP5eRdF`UJm$!91e-duNLr1bBoQZ51TsA!Qv;S_KZy!I=5Z`s zKnWKy!i9uzWB%R;^}q;@a3>6#hUL{}@PMdZLfV#9;FI81Nb{$lBQ$<%1d!P3aY!gw+{e_<59({qW$_ysthA00oR8XsMl8=rS@?+LLCVX9%W%SKVSut*!4nD3Js&a}f zRpt@sQKfmUgIgSYgx2^O0l+5}GG@nN!jN5A5F5eacp=S8?bi4nD%cJEY5Nz(m*D`PId%>%fL-#8084ZMO8i> zzm=vGY5f?le}vYT%eppfs~Ou;OC|#t=Kv>b83g%|p(!O^#7_)uf};&IlZ!%6a_F`Q z>4mlR7f3I(B@?{(F*fOuUY2eXPdF{55|uks(%lFFN;04uf7sE|61k-qQqDk%g*-Mp zBW>DRj&yx2uihGbIU-3Ejzi2VmI5#@OO{UE;I#De-66JkOct}u)6$C@(%q>Dpm|Z^ zFlj0i;#Ap&qUF_CYDgL%oC3g2-H4Gk)S1MTI$eu4rJg)FC=pHsFB&uj#9ZZXAq{~K zpf&2-KzV`^X5b|yi%I|u;>49o1!?(Ml$J#YDGa@KYpC*~gK@I95{nY+j1Ear!IWE} z=2>y{q{^G3WL`{c!2h_a0YAiJrIRz9T?XXkA!|ejsiFd^;+lO?-5{NzluVARZpK{g zZN?Hq$w6saO2kKnYIY-I88)X9fU5RfykeV$7-N=;bT>ji@MR#&HZ&jRR0L{>dO`PB z4IL(^Wv}fprbs=AOq8FFn{?tFi=Lo{7NumZI-|f4v~>pbPB8BdQafT160}m% zm3*xkQMxpl5R@qz#EQ9<`4`BzCSp+g{NnTE2@Qn6j@yP(A$1~Zv;!p=FlkScayL9E zU|^&`4F8=}7yzZ_Mry>s-?=n9x3EM?#7N({xP0{*#FQQ(Cxn+=A>l6d<2DjQ+maIw z?&hi$&63A&P&Kzo>fJ#-CB|_e1L3?a#fNO%y)-|0VQD>2%2FxyA#3Ya1@(P49CCBoLwzS~Ai#ZHvE&?Hqq@n=H)c(uF~hSqEq_F^O@@?J4<@r#*tjXfCO-r zBNzXZkoBV${|n0G*KOQE#>XnuVaNj@V4yq;Jq)g4D%fq(l{p~@CjnUz3~>1ca1NS1 z^}6f&kfqRsjK&to*0S65xh})=fE9b&pdeC(4GRXzm|a1J35j6d*|8h)M8(>l&^GSZ z6!L(Utx7uLndUAWa){zlxY@B0SwA&&VAE2os%s^Hy_=9Hn{;*5&*8DG4!x#mE5
Pp#bhVUUqzjS7-+)pDAK}l5=2x!pM@4d5_1S8OgrBnX zZ$OI@EdD!b3m@iYy^0i5*Z2cs#`qyyqsEC$%T>S2em7^|j2sHtyk1g}K`mp9M}p1#b}*6k~8BcU0wgOUQArapT=IYg2P!+P`5_$#=Iv zI+_$aDPN7)gGZL^!6sv0R?E?)84Zc;? zV66BKxp^195Yywc#VZ7rg!!uS-Z@a7P~O`f^P>uj35AV#6!z|c!s;k2R(y}#yidM9 zsG8hYg$74G8hr0SgF=JvdNfEECg{Qg9p9**Qrn~Iqp3B`rdf(i@?fsyA)(|kkCNYS zpyWCV#Jx)px#u9kyBS8^V87{AXrlHFO`PmKFrtlh$1+>^&r%yoajEX1A)r^g#`#i=CDZ$scd1 zldjh|o+R^UpQ6^-6rEJ^w4+m4mM2`Y#lz5UMS5bLECYuyvcl(g6noD%ilvfl-pxm+ zQ(|-)_o*@I{68F>-V2US2_gKXPx#b<4K=Z$G+UwJHk9OQ1n2asG5v+Gp%Ec`$|wBD zfekgWA(Awu#)J)x3N`wN^U*Ja4NVB)M}5MN9oSG48yaWpyxfK+g&N0vY8?MU*wC~P ze!?gG#DNVpv7t%!M1b4Sj8LOLC7Ap|*w8tVQzyN`r^4sdku^)P)Ld92dUHQBw@*BC z3TWYjpcdx6ve!?9y))J71G5kk9}Xokd!qK>W~w37$yl`6q_&?@B#$X8Fn_~Gy^l%U z->maaBTu}RRMT>BKW}!KKM^A^Y)-;|CMz0I?7GusHoyJ6&4b-%r#{>sVBG}Yai3?9 z-#K_&jd(gZfa+?r_~UlrAbf835AD^iQpAj!&62$q9~bi=U+hSw9<1N{ykMVR&t(tz ze60qCy>h1?gwMNA#Xr+-?{Zn!^NSy~hce70Gt4TnFQA9lHoG4euct}Z%Ts!aL4U1A5mzE5ATr%Bh> zNBlNECVYLw=j&q+*w+Jgzh7T>HBjs80X>X+r5}GFzAk=|_SyDz!R06N^+|4_F0lh& z->0wF)1>R`V}2VyEqvYg2q!i1fPFn+_xtsAR|B=a9?-*-SNh2Z;p^o0q@QUwCk%cf z51(OnN$DI+%lGHo0Uf$-J>fU!Gh#8$_f#$YfEVLDCcaOPrgGOhb3ppDHZt*Jz`CJi z5VP=%=eM(@jc>8GaHP6_%(-~>U-_G{I`-EwiYlc;=+eB1Pgkf8wI3JdslSipC}0=^p-@V&5r?}r8aAS~eTga!QF zuz93Vk^8K=TDp9_+AS^ zQFjFD4|YWrJM29;J9hEUcf$#80rqqH7fm?zy?esX>0dSB^l}|ef4DoR{%y#9PXD?I zr@j|l_&NQXCY)~6;q-5J=hVLy+Ry1nO*r+vMZ?eO-!0sla~NmkX_vPZhuS5G}%!o0jDzqi2;emzhJX%JSc+9rwVMnnJs^?Y)iVRZ&D8@Eq{TA>)t%V` z64^jCGa&Y54-5xLjRZ;UvKPCSI-|yRnKOE$1CnXv0-P01CigIdtU($JAbIy|?~Xa{ z#SHG`-W_!!&}fJ~-n$c=Y)CM;`@0SuqEl`o31$)vwtrJZ0$tO7l3AB@upj(jKls6Z z@Pqx}pV$51zM*k?m_$! z7-bk_mw63>7jBHfQJby>gxE{REd~qW%bk=1NG-H9&B6c!hmghlkpTNJNqw%65b<7g z50E6kK$#iPH81sHL7x0Tfb%s>853M)4c_KtJv^@vsl=XyNG0oXp4=u82WBF+$N$Ukn03#f%r z^Gsx#zWM4*#K95%P6j%0`LL~}#q$aUDOz_LLSh-4kwto6Opb3z?oVZYLw(MQ9!M(+j2Z;HI)e0YAWe%l@Afk8m75euw{Wu8c&C8 ze8k&0ckz8SE^tZXV<8(K_cqR5bYG1NT+;YtxW>~7Z{zmXe!3d4pv5%kntM)^S$P9m zVnZG7s~ENhlY9DpwHu`lG8OmFw`-@{4QAW!WLqR1q0)+QNrmf?p%RL)>BOFriBC0^ z5Q&6$Fx^v@aQ|}zPlnKttMp#Dx-Y@s6Z#u+HMSS7?n_kn#Po(-P40oK`yQ@Yu=em) zjdh=1q;=!0C(816`N^?ymw|38M&9%AEn+CQGrV*eM=WHNRgFBXc6M+tjD1>;-rU1o zd?gcrlr6H-sgl1|L*A1_9=n=Q-di#BO|?Xx9~N5hVe%Jk*2UkUF{^*qrWa|qGVC>- z^e@DG|C_&Wvp=Xp)_=vw!=YQ4a`#AbOe%w=MWt2@2fUcdAcZ*k{dRUt- z<_ACeZQRGiV2#NwLM#d;x7b+_A4XasW(*%%UAR8GI-j|?v@$ol1SXaE3aIS54n literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json new file mode 100644 index 00000000..34e6f226 --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json @@ -0,0 +1,44 @@ +{ + "n_steps": 3, + "cum_reward": 1.0, + "cum_raw_reward": 0, + "err_msg": null, + "stack_trace": null, + "stats.cum_steps": 4, + "stats.cum_n_token_goal": 27, + "stats.max_n_token_goal": 9, + "stats.cum_n_token_url": 72, + "stats.max_n_token_url": 24, + "stats.cum_n_token_focused_element_bid": 3, + "stats.max_n_token_focused_element_bid": 1, + "stats.cum_n_token_last_action": 8, + "stats.max_n_token_last_action": 4, + "stats.cum_n_token_last_action_error": 0, + "stats.max_n_token_last_action_error": 0, + "stats.cum_n_token_dom_txt": 2892, + "stats.max_n_token_dom_txt": 966, + "stats.cum_n_token_axtree_txt": 667, + "stats.max_n_token_axtree_txt": 223, + "stats.cum_n_token_pruned_html": 1014, + "stats.max_n_token_pruned_html": 340, + "stats.cum_n_retry_llm": 3, + "stats.max_n_retry_llm": 1, + "stats.cum_n_retry": 0.0, + "stats.max_n_retry": 0.0, + "stats.cum_busted_retry": 0, + "stats.max_busted_retry": 0, + "stats.cum_input_tokens": 4339, + "stats.max_input_tokens": 1464, + "stats.cum_output_tokens": 225, + "stats.max_output_tokens": 84, + "stats.cum_cost": 0.00078585, + "stats.max_cost": 0.0002646, + "stats.cum_n_token_agent_messages": 4512, + "stats.max_n_token_agent_messages": 1517, + "stats.cum_step_elapsed": 3.0203144550323486, + "stats.max_step_elapsed": 1.3659462928771973, + "stats.cum_agent_elapsed": 3.8209800720214844, + "stats.max_agent_elapsed": 1.8219048976898193, + "terminated": true, + "truncated": false +} \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl new file mode 100644 index 0000000000000000000000000000000000000000..6bdd8639d29d9df84bd8ef01e47587d6eaf127e5 GIT binary patch literal 2286 zcmbVO%WfP+6t!`jv1j5(f)gU~kPsjxLOpnl6Wc3dArcf5V}%bWRd?4+m!`X`RaHF? zBqUfM#Zv83EI|ASd-i+;EB1T<=T?u$4-yu5VRzlSb)V;+>+i4q^ykV#_;GK?T8)j+ zgK5zflS*h=h|-#Fu2kh+>x)Tss0YTs^!I=8-}!s4%?G$=oP_SE4Qxe!dWv%Rw$ZSO_B zeP73Jxu0_?_|81r@bRAoXO&h(WxcyKtLI5LKMt1W{ajKp?7n~1ZaPIwXk=qUmILor zzK+e6$1AlFEVD(9d^+?^_0vM&7_Kab@l*FEUG-IhKPeNrpq!zyPGX_8(kxfVZe}Bs z#wtCcMjecE_}~M?cKb}6VO+`#8M zeVJRK=XPd=Fie&C6ppr3B}y*CJu*v$EN#YOnV!Acn8^|BOa!=G zN4%(x&pvRS!|-{faqUKd83BBJ^co6?MVVv;*TZhiDSZ9nnDIRKKhn9*t(--5>aH6LmY_m!YGL)1g{=BYCj_0N8AQ znMkB~>Tb+YM3{f!+H@`u4*-c41$9*dDi+Xfd_pV8f%OI`=kit32P9DvfE(Qmatuis#8X9@ib+HG^XO3W~vazV1ntvTHr9C zJ7Lmlf3)S=#2sv)Rb6??f0Z;s+5OQqs)r~?4Vh*OApCN4nJ-r@H^Ui2-JP|y8$cf= znMj7Qn!vGZ)Qk|RKWa5Tj1`sfN6T&{6Ffz8D83t7B$x}JJQWIMxIO05f*dqMt@K$Y zxL6rUF%DdL>{=#QV;};luMs&QS9FwH0YN9kHynqJMESL_##kyfwpYb%mdhB?ohTzG zCpco0-E)hdV6HMn%VrS+uzALQD2oA5k@C`ELHpuf zic}f)696AsSxzG>49q@#m5tZ!HViL18iwXy)n$9$`sZL^%B;ab!|#vUZtIc+y+~DU{n2~wI)x}x z?>^T!- rk2In{#UcNc+U{29kH!Bg7e8P6&)coP7VmVH-d+6Tcks@0P)GM4!O@Rc literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..45522b9e38d9eb2edf0a451937e726adf406c308 GIT binary patch literal 97 zcmV-n0G|IJiwFqh7LjKH|7UMuY+r9;YGq?|E^upX0Bc~GYR&)wu~TZNX!Nj@R2HO8 z0aB?IB~vn(dt`!Bb5fH_6#PG~ z6CIzw{>c{l&wD&)nHx5-);23Ca=S__twIdPPU)su^}X(E4yi60Mbm%TANZty$4`14 zHWc|vi`Q=EY+7;Q2%u2lVn`EP&! z{98|y9`?F8qVGL(QF}^Wl+}v7X}Xqt9S#NVAe84-Lk9kerd5$QNKSTC`<5-kDT1fu zXMqU{vnZE{Ud2LrLvu=}d>7a$^?DsHbki>#qk4d1?LDtNb0o-5q4x3eCWUJWWtZV7mYwuD~)5*;6@Nmhj zkYQ)Iq8ZwTnH#F=u8m(&FkEsfdT>IU9LkqS{#MSsP3#ob`l@#{)PkcqI+d*V@L9^? z*#KN#0Wl!Bns>ByLJ_i>Q6RSg*l;V=%~amBNUGxMj;30cy6Jm;S&QUN!*(neI`gvc zby7gz>oN)q81!uEL5T#9<@?W~kYTLcmS$}zX z`O=T!AN`}WIDYP>v)wKHpAV;AI|#}zJ?cpXvmzvkn$lA+^Dfmo=ph#Lih}D`FQcY) z(*FBOtJjrNZNf_rLl+zXx~v*&E-Dah+3O=l-YjUw+Ok>!oHnn!>gGUbivHpuR{&2|CUD1)nyh9X_4(Z!@3w(!Wo@w+%i|!RHwN zB;Yfkd3pG-wF8O?+knfC@g;F;d@MZig?|~>k1ABaHsSr5{?oy(n_1_ z=0~luznZB*OZ`4|Jfg4;>)X z1Il6LsPZ6uk1CJA_c8v(p|9@ z;hZLGsvT1D2iQ-m@__QFI1xX}2jV4-ekRz@j5wGB#3;mt$j?%og`4q$LcBna7dY{P zkA-QApBI%2z{_3Y9n7iuTf`_VgE+uQaGcs;lsc3y&@0rsUZqbts2qZEI|7n>0LJbi zVI4rRE9wLhn2K40?I$C zj49)w{FBO*G7Z8xq0EA0PAaFA(=eiEl(V2G=alon;R5j_)p0B>=Q=F(too~zN5S18 zMI!86g7a>EUMqPhC=r-po*B%OAuOzY0{X#f`Xks$+9y_%4B>nb`p8O;M@olyX)02h z=A|Q%(otSIR@Z~65D%j4M2s&cjZ-09lp-rV7b#ufr57Wmmw2fXDP88JS3*+8^Q*!6 z+sqWQmn&GH9o$&UO5OUE+pnpK$fPMje!aT9P zBYD@tD18kQ#ym|7BGh_JOIy>7c)9Bx&{fC}dHR8lJ_z$PzGH-bn#bnmoYN`hn22}4 zb~g2}E86P^Naak|C~#89Sj*CnT9;0@lG>T_mkm#pUymGT_;Ch~e~B;;KBoMdlI6!I z7EYgYXr89@(xL;EO2NiMC3)F{q-o3Plt#=_0w{M z$XO2kVu;Agq2n(uoPPEn8OW2I$Zvw&e~%R%4u%Up1m+!R*;^$ZTmDgQ^0a~H0+BO4 za^^(NK32;+An+6K2+X?jw;+i=iW2azTleCxVD<$T$Xd3 z!KDW_O=gj_7C$lR#_0e~1#lX`(&;Bi>3ckmH5X8tix|y?gywwy?wxwfgA=b4+@|G( zr5XP~RM&~I=9KtEsx|nw^WRqG07&^DGwdtWE#AQ8j!4OKSklMSuxGUb&;wQV4)T(V zL|ZF4J}9!QdR;cmkBl{{=J#}sYOBR4uWtk9`sRi`G?E$5_@10OKR$nM^8DHPk+G=@ zlcSUKeX*K8k8$u0 z`O-CD;=r}DR~J9L0&Lh;-uDhi8Ym;eI7fI$R24>3>!Xw+t)Jla57YWe#WaRpE$cd( zZZm*M4sg7lLFgZ`3{~ev{D+}UakT!O`bD9qIdq$Zqi}8IIUI$yku)!Uj7@js7vv4J zgc&)LuHHT&-;oe#gaO_7gN~LKXse;6*ur1XnXvxf=JUMUz0E?!T z#5T0K^a*XY9_@s7{P=*3oCsbFXbOl0!e1i>0UIFKsBHt~DN2}u>uQeZ01e$qsnsem z3dsa5OAJsLdK}bH;UxxAxQ>!cpmrt(<%D1=sL%?mIB{I#O;Iv8)2r}5r5V5vTC9A0 zjt93{|%|u2xJ2(kmv@gChg-tej4b3(;&x#<5&ZB?eU+ zxm4A)OG(Bo@6sI!zwpN*j%yha%o7sSpm;&{AGS=GpjN`V!oH6bm}?+}xa;22KT$ zgkd@*P@u#K+VwEc)C^(t8MAz!Bq1MXsK?>3Zo`E4C``q6b2-NVG9aM@1qRk?GOMXL za+79*up8@O$eNwav$czu;F-E|9pxZGD=bviFy`Fcmilx`9!aI;8F>hPYMLT5;|AS6 zU)5~0K$4a`+`tPALR%+6?udD3fa(#8kRTP^R15WLMCsCGLQtk?5G&4HIrkj;H4%f_ z&n-TKyD<;~J2ot}ighAtv;!q@m~Rz9_uol71E5shutqrk&dc*>&R@n7 z;psaUmp^*I%G4S9n%7l~xJPWq+{eZY+s z&Vvci9H}Q*sfS#ICS){%K(?0Kq?@!1F9cTZZi51;2HPwcq+D*5ST-br1^=$QB~P?l zd+W68^%*K|xw5rKU+hwD!y$(#AAys*F2Pk&%LFzZttPq_1nen7Hf+;1&?tuwED4CC0PCsY|A^u)%q$?ZPVaezKrpMu0fTP*iIn+fc-Ace!#O@{?Bkdvk^9A z9foPZNR?)R z@Z?MQ=gpeReMm@fJS4$4_9ZAJ_s6(3EYY0gX&VVe1JArA^6 zPlSa0r7eWqK!9G)Wr*BoAi=xUjJPXJiTevRaR-pyd@a##CP|_cAS7=oFxL4qyz`^s zi120;8g>9i$ARp+i5~Yf)8imF|9?}mI=YWgO^bu=O6ox`++8G{e1vMrW|wBb^g)ZK zi=B=M$scd1ldj76p(JxpDJrSc3py3#sY9pGj!?*B%Y03n21;`U7i^nQl+sVHCwe2| zi83Q>%FI2?knk{*5iySVKZ73Tjkt$Ni_T9+I-l9sp&fK+lr64s9U9?c%%me?9DP^l z(3t3aCerz_eI43ChcNNUOb8ts7h;UM^YM3u4o!*9k4HK`v9Cir=+GovtK~X0EyOqx z5#!{$LWgEW=cgi_pW4@<9du}#t?F|fniFD-rUcXP3LQEva_W(A=QGXc)Y3Iab+ml5 zO7z9O%)Z|C>?bCLH{((`7w&uGL^wR#tZi*I(c?o+^%!@-?%!Oq31K3GE}NA0QF^$U zScSQpcQm}s5nNNw2Ml+tE^D^qdwsmoWxk_@Fl;`;FJ@&;sBXi_E}NbHbj)IFZSt+- z7;ZPPZiCg3Et{1>qZ`8w1X})1yRZ#Dt^01f7$;RsrP=g2tm-K-0g42TOy>R?d`SKG zXzaA#F+;EK{cyO?qxZ$oo$5Q>(1F2*hF<==_NEkZgexM_*|+9BT60W@fi-8M`kWED zOEi3s-DhjA>G)o)8T(yt&9UB(hWk8zU#vM-e1}^zF!;8td7LXE(%HA>Jz8^2h=Dba zMfG_?So2uKnkVkFHP>`}uhxwHuD9k`?uoyL{}bU}Pv4gaoqwn2+xfSdZ_n}I zlHbp__ioTJAqMt571igHV!14`?=^a#m&-yrzSjtyZGyX+qBq+{asIFgoJGvxUD>j^QSxDL>{p~QSS=W zZ*7Yz?p6=s+;z);wH;3UcG@VXf4u{zk>@x>IsKa*IK9+>)4$!G)95X|QBJ?T1E=cN zoc{d|oUS(D^gGRQsy5*CA9mn0@?7aWI@r_gT z*?C>~%sdB@iW{HkYiXIqoAP8}aIQbyFVj%0lezx>laf4JB5HwEjo`UFOLIR~lA$GX z!y8%yO``+%H2qJl9{lxw&7)yOZYn#jVZ#!!c50aZ=md z&RkEOQDe7oGkW9wvTfyKoRw|X?_ma6hcppG3g7a)J?3N>Gk!Dl_NY^_Mw_@zdV7M? zTM~@lK;3{2%9QI!nyJJVcTiJAgsy9UglU(&e@pfLE!F$CRPWzX{nL6&b;LE!4rBDh zgPd8|cYKdE< zJ=u`&(cSXBOmf>ZKD8s`)4O7PG~JN#vE4GB-(FEhM;c(&Y(50oNTX)ijZYkcYLsf* zkQT?DJJ6sjEKUU6HxLJlF`OaRP}C!d1E>Xmcv70BkA*rZ`JTidWk9DSzueZ+5_KZOOYWS`d?|oaml!e0hUeEj*ed>1j zwJe=D>AdB8-&~Xp0#h%?l-ZO`c1on?bg2-bx*x{5U(OMj7!oEOn6gsPe%AGV?p;&z zN*Etsx598WTjmen);DG79`o6QvTe$X@`k4Cd?Kqk$P2B8fumi(XQGsoQ&mb{jZP>3RjfY{3R(c22V)p|DY$To7J&_&qe`-|EE2%3Px_ z{8-}enW2#QLYM>WMIgS{#rIPE8wXokTAnjq8GJ6x#bw&C>{y#rIKZ8$uR}%f`WYEt zM^cbM1*wo371=D(Q+yK(PUt0k?~JVR*R24rc1p%J@Le@@OBfV_zNsdHCBwMuFgSzs z#D)p6EP95VASlvJ$>-HPV1k0+8*d~oKjfXZh2j@7wrOV=E>|#0oV_UrU@Vd+8ttG# zHYg`4-(@oFyXaO_BQC@qFwtk7Ahsh_O`BRS@C?}j?=?WFdE5!tHM_)KCxq|s zp@PRw+Z#Yu$b}9<)goY@5|cd1;|{u<+XPWmEpyFMD}!=Hy#+FKEvlzQQ`gOnKu=W% zpI8(fWJx-9OkRLiXC!tp!S8}Zs95Nq(gO_&Z;T&qX_1b}S8^~o>#WBhmztnRb%xT# zb-@{FKa<(dWcD+e{Y>WVWir&E%->jn`TUHuY&v8{z6p%1Vu+`4#?Zjd!5>#k>X6}9 z@KHa|w0MDstUBBquW5#XZV)_{t{H^AoQ+_7;(My#ASozo3~~W|L*2A7kYUf!uUe#t z*?}oAY($~{TMgVQV^M@q(}4UV8b`SV0vaiKk;RsNUIW9Xu3`5K69R}uFftBOh=5UA z7f|96o`yt8+F$F~U+Z`Wu60m5zQEK@DDsSiE+sLNNuZg_7d6{4txdF0VKSIv1lcDh zwU}VY^Q^+Is`dD=1ddP!kAY;Uqp(dNV0{3TGr65|!=|35|=7w>vX}Z;WN0GfIAz2{0qh@PV*?CR! zOg5kxGmuqVduj`&1ZIJTKp1wIXgD`PRX`(C0gZW`lE9bf08^pEE7lTmR4fp`*Gd|Y zTt(_PR|lk9WHV=~Rw0Yg$xgy4xtoIb0UnyA56Rce3gItk zt$@8kJ8O8gg|6LVb2}`w$n&noW>fXkV5%1~%X@O(6sLh;OOvz9__l!yTk=q&NNCE%7yZ&5@5 zD5rvl%A5(|3VLCgRWx)a36$j!mnC7MwuSyUkPP119F%Mr3i^6qDjcx{g={eiWZApq z2GI{=LPaO`jS*0jCCD7v`cW_xm`tJ=A?wPEiE%9{Yc$2aftP_|Fw21%pn7rDmZXc| z+A%=jI*F)Uhk*}~4HgMNFrgQRi0ay6k3=(ENGMaGA;CTcYXr~`TNiL~JqJt)`T#L5 zPa>3pxGAxwAr8;dMZJ^K3jR<9#5D=3h(Zm8;OOc^Ue)ui&f*yGf$`1`+YCtG0^dYq zunYqL)_#t5Eyq|P5cBemgo|@5FoO44*T58o(L%U+NB^Ax+Q4vfdfCYwBsxC-dkZd^Sudp!(N1Mkbod#o$EVDYo>X7#mj+Dt4 zbNGyd4ns;>+G63yA%KkbOGl2~D1mxlheqy#Wmf8>xQJFG)@Ye;pza#^*d!{Q|MX|l znN0tn#LPBkL#RSv5e1Xj=xy*~Fos1}vudlS@}eXBuPPNb4YdL@H(}es@~dNQ3hzKI zE^xx=Q2NzTTpvIy)J^jipX^Bxm8zg*&<=KR+kxj{1nEF{4v!w;i;KNyRGZYTE%wp1 zMeoN#ls{Vw+uKMAHw0>O0x{V+`erGIzj4dRg7R9MSzWKtcVhE%Ke$W7>&v;|BnrG- zY2okp%+lMrY=24X^bVS?6Dm^Xy^cI&=>Ec~^zpBy5m9_qbWl_1OU3#1%{{n} z8ZMC7{X%r(;*RefW-o(>v|Z1t)*543V(0V~177c1)fpNyhw$Z6zIUXe-UcQdRTr#W zYWY!X3oDOs&B_m9xj{C9zOdP)Ggxa``Z9Rs01g2wz&DyJk9m)a z^G4T@!w@|v>cbb3!?3)kK{>yg9A9uQYQ?qROJ8pH9uS403ow+g)mjMeb}yZ4E&mzx zL1`_2BXSqdpJQQ5`J3_zO_sdOhGCA+F;cX(brMYkB#*BVqeYcqn` q%Q_Nj?5)(Z4~k8?5bzIwc)`tiJ(%h;6Igb0ss97%@+<#ffdBwXEM|8vxwD#`S!?VRrk0Q?5Y&^8w1wM#xJCiPHBtjj zilRtb6iEA%G)aR7LEN-Wi?%=?A3<9LMcZ@kV|He^B$u>OIWe_b?##XCo_o%@=XLL$ z$0wUU|A|l4(SP1!SR%W#U| zN%?sog2K$n1)`U+P~Oy>!a3gsatdu;lM8J6g$~~J-+#Vv%zJ#9=p^UJ3xH2G@^Vhs zayR8=H(S!2r0){1Y28$Hzwm(9h6folTLf@_!8>l2iIFKI!x`7ok!$bSHBBeueSLie zvqbuwzLI8Wn`XARth+XTML}P|De1up?XovlAi0}a^A@p_SnI3a$xsiD=IB(i-a{8D zg%<;Gc?HCP;A+m%HVAEy(TqH~1;B<|DsLrorbUt^S9dhkveYf#YtL9DXBxI+xxmaT zzSm3veebA|XCSZmO5JDti}hLmWq-*}DD_JHXZ*sGUUS~rDidtdppE)>@37iNuYST` zSXg-Wf8ihfqtrQm_U^@&I{wdxQ?C&O>e3Y; z#A~Mfqs|@8sy5-J$AJY0fUc;|piP6THRJwExes`nQ(Phv?rY;M)M7r{Hsn zf4bn)t$Y+d-SA1m2Y*xW!QWx{;O{tmx+yGvrr>)@d5U-~Nb3p?2T-T9DiV$obW%^d zYvD)rvE%qx{H<17U*m5W8+?d&%r2;Tb92cw9ZDd|vsvSk^qMzR-9@Th#seKF?K zdPF&{oK!vl-;>J2@O_kj@%ZT~-w_J=H2r&0ISb#Z$oFH)3jBV2&tLZa6vuOtc&$j< zHBM=wqS_%Pe}MheD-S4-h!gRnd?a2{?Pr+%jEjRwXc&dK6#1EtlW-$mkdGJW@d7tq za661!{M=D40WEil*O66oH;Iv70C9kk;5aqFC^acZL9bBj+LU&sLpcWHb^;{#0F2#( z%0n<-kHENn5S0Hh<#AB{4=Yb9GN^thDDOu=OP*Gq0Yy$w^*^ojD7~QkeM-MF0Lq_M z29+UD{t;zV83W;*Q6@k#XO(lxc^J_P%0|BEL7JgnSc`zsuEyFZ3m?uN6u=WYygVl6KS|=z^tR@j^^9=CFN>4{hdwFRxQrgc; z2O_0uUOKpqgV7KTqU=nJE+&n0p|&VRRyrLio#myMBc;#sQYBKlz)KfHQpWS8;QTS= zD!^T%<q4rp`Es-L2eeQZqsa<>!?|s@ct4Ie9>E= zr$8@(9s<3?6tTS%IoHB?y$q>ij;06^Y^|oHt!qZS-1Uy=DrAa05kZ$91biClF;+j# zqjYU<>Kt=X#5-a;TYA_j9rOdJvZia~IVvQqZvIF0^XKbHGVak^jnS{vc-C^J4ll?!&$WQ#Im{7 z)F1pqyf$-f%_hzg_?f_T?y-)8F~^lVEOM^IVEA$E-43OOpjLN;a=>lG^=~`W%Mj(` z3KoZows;j{<&%WDh&DS16K=h#+;;L5I|!~W5F8#0!Ntkgo51%n)D7h^AvlcP!XKV(vAzb2*D=IERt5@C#K&xF~CUyP6$|H{RAoeEstZh1(ez%Mr|RXHlM$L zryld*#A^n(X*pqO#y=3%bz-bL1wN&!55A53w_Z5{Qtn`ey-3~SbzJy}l)Qi??K};8 zUdsbMP*tykms}>=dcpBQkzLh0YQs#)Sf^@!PgkpUT9)$KH(}OqZrZ&AsiBnb$*Gy4 zsp*lKi&F!GqnAd~BU3{o0~h-*UK$)r4Nr|u_4{7yDY5wDdrb^^)8&=*~DbQ#25J^^)&3ugtAnnStXJuWUJ$GfRx<4w>bB1wtHK z1{_!B78Ib(BEX7Nse>B2mgcn}-oBh|Bf}P114@#~K3ue;f0DGOKFxbNCF+VVy2RS* zb+yV-5=M!G4|4D(`PpS4;>hyFrMc%;fDGHp`QGtJ2Ssc!#2Y*)s`BaNMw()z^~1dW zAzEK5nMR+hWn4$oZ3ZyH0Zwlx5cngOq3XPd|FCAGyjka7d{O8z4&5N(C@inMfTPeb z(9eq>WiuZ6C3zDqVO&o2mv5bsZ%YU?z<{oQwW+R7+E6VyXA#vQA)TF+*IXlqL!ZcN z8{IESvP|JP#I)+D0P~XUnA8l$<(F>vuq9($!m`KZ8>{l|GZHilH%Y>zsf5U>ur1Zd zYl&2kJkosz0M|?_fqiI`{b#g^?aj_;r%!jwNQt1ufUW>pApCV=5U>HVOl=!5k5a@8 zTvxM12WVg?sg}#c$S1mJSywlOp~pcD6<$|&5?53bU8tR1J#v>IDyY!%thnp6#=D|u zZuGCh|Dy$pa~1b>b|J1|u=H6wvTMY{tL zB7Bks9eHwcQtk&z1(GhqbPAwAU1uokVVbEK!ss(%`Ho4Ke3~^q4TsydPB=$lEVi4? zItHKt33X9mAgv;^ii#sO{cI5SV;!uqW@mD2`64EGrmkE^If&2-3sp7Ta&mGUxYxfgyanj$mf2Hl=1Yqpsu2}|y);stu3ud^U`YJf{OAQjzI^V`*k;-$%i zz)YJ#thlf^{Q~+m5rZ11=dR(t41~a@O-n6foroHZfC(HX9Z4(?!h;G5h6Td?H&bQ+ zl&Txn2*=-iW$MDr6)X{+zIkrpxvLOUS_GdEUUG&0V1eIjz!=&vAY6RIRiy925|#yy zJSBWLG=&h}kQxv*4swu61%T-aUv0{O+Z;{ ztkPx>YSMzLh8$9n4v0z%dX8MUy0Wsk0HKvBHgKKORgAqSY}{PSs}Q=>9Lpmb(t`LS z%s@a=5`-{J@6=n4MMz~oOzc7NZPX(MTX!#h4x1m<)(udIynmPPuhN}XmPfS1Je-Ev zCA#q{2B8giVV)=1DI96nlX3+M622wgxcFm48*N(rZzxkf=Xy=(4=Yf|AnrqeAbA#Q z7~GDj-%*>c#0f??3dn-Mq2*`57-+WC8m!(!Rzedn8cQHM%Wlz)Th=ZFR%~s60;vL9 zD;OkSZkbp%Bz}4Sj=LjGl&ym??RxEoircVk719?wm%C8NA&MvA>lz0@*+Oh>DT zt{wqf*B~#p>1t?{!UwV@jGAgJT32D}wWb-U<_K}eb%WMNX=_naQ(Jai_;M>5A@DA6Y%1LY| z5Pv}aQBHn9vs(PGa6PjTHe^kPX~0Mn|0fJkP4VkouZ8dZaDg=@-n8MVL<=>d+oK zlxE8;T!#j@7*qWbF{a-aIy5MlPeqs?Jk+5*bO;ll)UeQ@AtA=7J0E&q=+LNOekj8H z@SzUvp+h5Vt(NQ1m=NP|M2sWv3muve%#TKxA3fBeJ#=V{t?F|fniOJ;rUYZ}3mrNy za_WIF^Qqc%YH8U~9W7U@61{qm+1ICR@-QP5daZ<%pnoW3<@dOu1A$cyz4*PxniO$>D#>HD}k}zplRPfv4pa4m8+1&Ffjy5#_4%w=X^U)`rSJ1f zTS&(b8r*YLQ}r14=UifYj{@nInjuWZGog!Hv5#M2eR)#lhAw~h+`sc@Q#EDLwj(!S zdab$Op#XQi_Q<_mcU^k5CgNReuB)3B4<9N%2WJb#rDhcQGX8bxm5Mb1zE&&X>$L*D zQ7hn^wF16XE8rJv1$?_!z~8SG@XNIV{$UtUr~(CyC_=Q+SN3W|@7RsD`I8#jq+)IU zbdNUdp7Us%ukG4K+?yV4^XGfCi9Dx)HoYU7eq-0B;+FSNn>%jtTf4Q1-;*1q^xJz- z8hIE*l+y3)LFuzqDE+J5DUIG29HsQTdr+$GOzHRbpmengrQffOQnd=D@9aToY~Qu@O^D2+TjBTDH%>_O?Lt5EunyHgsyM>|UCKkY&3D^=S3 z>mF?)5Aujm>UtmGQ11brr%YqAP9{4$&r0${fv9;_HGt>x1Wf{2Ns5-pRd0$x&^`)FTj`o{ zlQl|r%6Pl;WM}+t=S~^&;>pg+tqMp@BjeD^WPZH=8zS_!{s)+L$%i*+AKsvSc!T!g4cb4gH)uy(<3t}uPdvz)`7Q3~Qn3gt zPoOrkI*5(Qzss7`p2VMwKxZev(NI?xdXRzKRdud` z0gFw$q}Ce)Bdc<7v8&wG)l;FrM-VEt0^I0o5D4V1tJkUm_5`}vMYEnRXjxgCrOlwr zRDW$Krq|c;DC_ zJ0Gj9^K^f>^W4EioTDO6gNV6}bTNIdiVUXmWy>{SicTL}cKu^bIH8Wtnzv7ycbGE= zQ)ZDm)RY|5mKp>mS5sP0Yf`YUl;GYc1S}m0JIH;-`gn9g4gJ>CwmAqg6Bv!uVe6)H;36 zclebs%{Vc=>3d&aDUgcO#W%Bb!oULSwyf&lT43tKwWdejENHm`);Q26Pi)PCc^%!T zP0EPIKnIMknxQbJY*W!tX!-`2>=qL*%!p}WCzD+#_tXptA1T1V1znj4GuLgK@|?V> z>AGCD%p3s{per^}#uva8TD*rT-=GCaT`%F6t?Gc#rZpSlmW}e{lJHu>FTpAPZZkf= zW|_WiWS+m|hC<@oXpXSAkl>@K*v_key`#Ra?gi78!3zVOGVNG)tS!nSNWj!Lpdxtp zjg0R?$;-eW^3Soz<{CZ4*T5jN62A0C*7&<#fL1#xV;}gU9J+xF8bM!^6KN&G!0Rxt z?V%?&%$jAv8B&6vh&L(Es4$4QB8{)xk@y%P?X)kDAro$vAQDHk;B5AY@f!39g}m$%AVy7t~(EE#c#Z zplk8^5LtD&V_w$`13e-*FI_VTdr2I@P{lW6!B%@X(l^yD8)F)BjxN?BYk(0B zcncewP*<)79+r_PqN!=FmU3e&-?BS=Uw&t*9W zoeK0z(gXuJD}^qb00$yU0p}FMVVBih;73*pI0xYs#KH6qQc#+Ff{jP5mRdP6HZ;pL zkmRI1Z`vZUE=ic3m$#e(U=lk)JZVFhQ`Xh3O$(p74;gu$&t=LmhXUUzgOfot)VEkH ztCzz5CCEUj!?R3xy+HjDn=4_*cA4bR+2sRwWLAOYXGT z)M7)`bQ~xEH%zmf*i8B@+fNQ!+P5;uf##k+=AfsAi+E zy}pMl$hdq(L*eQ-x^%LJdeT+W@_C(ZTvIu9;TABf6iD+~{U?Lh+%)brPPcsTFtYbL zB=SUe)Jz2{JFiHd$p#c-9IV1RP+u^iF!MBO!mz_M#km2h0veeNa7^hG1-|tMhzcDp zTJywFu|WJT7c@Y*ir8SBVc{Kh zM|0_;^0HYX{N1u8uvh439WUt6wVP~ah=mq;(bd>oZ2Kgd>V*t5Lk)?Qz))|2W`H{| ztvp?p0Xr4lpi>1>bU;geMnT0Hu#1ZjIDaIF0elEd{t0*m)G+W^6bPr%AV!eQ8rhU` z1r><^o+yoI(U2}VnqFB)!A>Fl2qOTWMZwD|0DzvEMwf%R(mJ72b+$Fn{4gWbvK>To zF<=EJoiGGAi@4n~&+sy9GQmLS<=OGv>`VE&0>ghVYTrqz`^uG$3sI$m;$ z!7K}Afa=9nTaqq=Yo{w}hOCOp4H)EBM2rcXo>@qEJI2IJ&lxQ}vvyvp5EH zV7#-#HUrYPz&FttEW-eRwNF#lvWyf0F)!yx4!#AN21f8c8yc9Ra2f@*vR1}dG^(tb zB_WsKlvh%XMT5jV5Ks;MinW?#bP|AyU>X4|5TsIdNrv*2s$>2d8*^~9Ib0B`GUiA! zt0SxqX)oYNnS3dTPg3YGB&GQsmW><&jnRJT$g!I;P!H_T$X&9`(l#m1q1CWq!#oOc zQFo1WY!Q`CnL1PbsZ?i=#LPBkL#RSv5qXo?=xy+NGuDc(g#~L`pd^m)zp9kqGSm`G z_JnQsh*ew1+7jM@T3q0S(V=vzXgP*CT@m-{adkPgjHauUar*f_mXDl?PIn-|A}U=!*rcck#f#!%0ZUy&z|d_b$@W}U#_@b z2i_Hg+Ctxn&M%{G#l6{ZfyC}lqI(~=eeXDXmp!EJdPcR@8Oahmr*A{>+SbcX@1WU> zZ^ZJw6D9Q)5aFo0Am!P*AJx~f@(9(e{1}!SWHaEU?!4EA)`*vgf}H{bU;YCA`&I84 z4vcAJY`0uCEyus(9nXXBHMi*H`UvFkNSG60gw+wxk zy>bMHfEC~i*_B7V$HaNHYt~_iJ|ODD7p^N!-Uq9c^NZH;{p_MvTw*@|rAF@oQ5d?o zL;15x58)m3`AhZ1zW^ST`ruWSHF)Px?U^T0K|5@1FisrxD4%AFadDD z3EP#8uvVBhX4sqGW~aXU!3xHIjy=EOc<2>8|cMmOuVV!A6#AmMc^%{)J~G@Y5f zvUp+Y3J>rK#sL+e!u8(#+KI;)1{6wM{QB%m{$Ho;35^-16e|OlEEx!KIa{Wd(t! TU-0j^ZZ`RUQ_G{6hk^hAAEhAj literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..d55bd69ac657ac8bb201fcf0d072b4e9265cd0bc GIT binary patch literal 5613 zcmV2E_XrllLSGK1cx9=Rd>(q%9=$N zZV@~#Tn8p7%(74*r3w}byRuO@XPdxIq0j0tK}@?az?=Tj+l3?6vm2yDa)z)9^du!O z>1+O`gsbNbXivy+iak*4mPF%+rD)8zrh) z>#2Fl;k*kjY=RmPT*?{pZ9)+;vXUov09Y~0mAmnrs*!lvEE%$-Y0_QW>d$B-rz*Ok znIOztw$(`iZL3?!Gmy7zvE>bWzBOyVX|LNcu~lq+!!A5;b>@w`6@n;j@}&LXKC5lF zT4(Il)z!=Y3;*bs*kah(2lG8G{GScCRy!!lE<9~T@@m=9Bw9*uUd@@*>Oev)=yM9L zuCD+iRwtF;7F(_EtfUiO`UHqz0MLr0?3idkv}LQGC^n6*W(oRrggS4X7_uRf0`(x!`CM+u`V>-(ij~((mWs(+0;2 zaGc~v6pkVBD{u_K5r+fMDLC*v4hNpkz%fK&@tA|pN$~|@^&qdC*d4%~*egb`m!OeW z+FTF6Y7JZmKhAf(+SZU~FEu!bbwn>nd3ASPRSn7@>a$bknzTA^OC=MzHkk~xqfmwp zP->U>gm_H+0(_2%KLej<_y@0Ftn-iWcfaX>r*{dN>I_ayY) zaq%hWucx8kein@Xg!n8N|Chw)MFC8I5RCWBU?ne#FM%P)sQI4~hsD!i{3BvQOoH*J z#Zhq#jDJF$6sJHr)8Y(h=B#*5JP$p3L7WFWSr8Y2!zE%xB*W0;tZA^&vt@5l9;@6% z2)mEKeGk8{mK=9W9853I4CZ+c7S=uoagGR(i@;dYYHUeWeLr zn)H>XdFg141d|>foU-WvUrZb4Jh-SuR=VITUE-yee5IFpspu1P^ON6%R%7G}2?NevwD% zaKqF&HbfDtOE>P8yo1s~4MDjp(Go%))O$P$M(JVf3E+TZpQ zdEG+d9L4CI)DB94CHxEtf5?iSaJvf*0y7YF?nWs{!!PkB zPat?!;JYjN?o8j^y=n#mDFCsK!u%_D3)1O%^z=K1?(xM|@H+pBVJ#NhZr>#c>7JWyK=p$r%{g7#Mn^(I95pn(OU z{21>*tbCrZA)-&u!Gyb1S8t6*iUx{naTSNxj^cuBG-vSB40TgH;V2F!;1D+s6&J?h z+Nz2(S=w;{7a`b$nMKlS{KWJdCk8kvzzG3Mte+!=AMiL9E}(>q7~w)fxG{hKgL+^D zCsrp6o0{R(X7GThULwklQQ%XmR`=7+&#ht?XnBA+_8J`)Z{Wg*ujCah>E~(KbvY08 zz*MaPUUG@ZI|aiALpCL=TZfsFvO~@Mk*-!XT9&f`XP1;s_R?fo}PLX>o4BF23E`(qeUcj^uP51=h$ma1xJ?;G!M zLoadgQ4ZcAT;2dCx;Ez5mtWolHgqj#TTl2JC?dibM>y_O<DPo1GQ5O(ln2AIra z0)^uc3z8uL%xi+7Qa3mwymoJxEg9nymVQRKxh342jsTh$B@UCOG7_iC)+8k_$5O+> z#LzSVZmU`h+mL4y)ACF$+O&M?)R2IjIJ_v(6cBTjze5xPK0r38Zv*8?N|=F{q%0`` zG>8+IDixySV^LZb9ilMw+O46=iw?zcMI{zR>x>QyQHLqFLe8_|=qZ^uMakSuY{CDy ztN=ggvBIfY&MpJ;@{rY{LsU@#RdLPUl((=mq>|2YbT7};4WaiLv;2ukR5-;@Pr+r)qZ2NXmx^v?vxWj>KtoXq z46IdkRyA?tCc!#kKi)t9eAOzCmtgz%CpB-{mls|{mlThbZg+pZ#i7uK=N9mrG8*bS%< z!rM|wr^Y}DQYjZOT?I^GjUz)_b{lWUT)^=vU>a*CY^$4hty8l~vRLDCmYS;Kj6zM? z;i@f%T%=v5(hfge7p`t@uB}37Wrht>=SmXB-lIA`T+2%kx}+S-Bihmq^IgnAKvNM2 zVVK@YcMOe?>hmz?85Dnv+NH2{_u@AYy<6T-#SMw{us&nQH%c#W#TtXs{_ZwD%25(`w$>Vo`o6)*D&?# z*6B)|BM2t}Sr80p`9*LBnl1IZtM`zV&;*Re63EW7cj<#$hUWn*_O?MmqzW4r43aOi zLNpx`zr20lY{(N8>tI5gR=*65FazP(vj6l%@XwT9jb%Kd~+A2si6>BI&Ba9|JSS2TX+;C(#X8{Vx06 zoPC#PsrX;sdgddn%Q_TQfu1P-j~Aex;kTGp4}bQ{-LCi&Z~`iL7gbPnPCbU4h7b0TR4hVbom?rS9LTs@sL^;j4;zm?Vi( zfRNl!VNCc7T==v%B779W!n&Y$49KoSNZcDnVmCMcsTo-f-KVLg1;O^Ebyo^=A4woc||$^7Z3s5Lf4CzL$x=yWX06E4|eVQ9C)p4h}? z-w;Mt`0S2i4}GIpD#<3^d~`bPj7}3iH74!&xH+*I>M)X!jB%>P!k)%q$xG-*wC1xM*na=_PMa3Nk{lGpYY>{Hq^w1 zCfGVJx1lLVjpIHwPJAwGXvPtK(kJ}np$#>$p(*x6fZNclqeg#9F!i~xq4Q2oo%9Ny z3ZGL)HVny-b776>orBEWKJm;cpoNcuT3GPPUOy4`&Qz-p$U;bbB$UMXEbZ~lQ$r{d z88q31wx80&N0JqoyIIG)k4W4fr}GZOr(G+uZrD~oZ*-ME4MP|@d6ChvE zNTnXH!+XsCfX+_k4mf(P^dnxG(~rf`jmoFo(SgCbj$ZtJdniLpazjKqhu(ZZZw@HY z_2!h{o>NZl;t$`WkJ+26HhxfVM!9RfIUs%7EA!Z6@#gIIr`(%?!B6DPW84st&Y?FS z(3=BFbiH}hZ_ne7H;?+fdHgYZbJfNV>dh#3tv3gxpYX~&@i@Gh{*3HX?aq|LPvp;2 z%oG`+&!Izi;LnFYsXqsm=sNVc-=xnt4()r?lA3(H4(&Db19~)*yw;@y@=trEo_Z_+ zI`?T$w{xE`-Ja#KC3l!^AKak>N_72s(r?daoz*hmW3TjMUM=(3_(217u5P*=kp7%W zbn6*lU6V3|nRv$Y<5q0r*I8Ruq`INYUOM-${M}R?S@dn+2286r=e`u+fz|KZ+x5Vt zzt%*$Yn?4EOU}!OiZ8?6YH__2Rlb4WCjF&iD8P5a0=^d(@cpoWAA|*b7#8ruuz(+h z1^nHxfFFki{CzK=V+s_|X9#|zZ#6@rJ9hmze;fiQ6~OtECOB-*xgY1-d*V2I)BQMq z+62e|9f6eAbKDsv< ze~X42<#i+D><#Lp0WB6yr=vr*b<9(jA)2OYeB)3*q>2@!hwo6d3tuX{fN$t4obTZfb&=Cqtu}ZRXiy@^41>EjDI~d&UJSae3JUdw3 zDm^IR4yV~cXQTAsNPyI6kklTVqid-%YHZs#qc=7v=vpqoSy5+lk1)s@r11cfcLVj_ zm=j*i;MVHBQ6~e9hS+qyH^Hff1cTeL>(D`+avMo7lW4FXn<64~P5Vh^UBcl8?ZXY) zha0pHH)wxWH)#8Y#+ea}o_LW}^LP0`mkLB!VFs;{)j@2;Gp_9fp}b(D42<@FE!+qI z1~JAM25$SVLGZ$jGdOC~wSW+N>A1yUA$+-$asa7?7G_u&VBnJha4gIf5>AkJ`UL3U zZ$oAWRmDqk#G!fh8fRsM^5QU-)p#>g^{~8rrxN?(JC&@nF?}wpB62e&kZ^+R2ei(Yxv6)BahRjXvgSkf&baTP(t46G!T)cFn zr6*SMcKOLRtuBr1N?P9YoGhYAyBS`(YDEm9m1PAVEZg_3UKmuB9KCs9b@4?<0Fria zQBuY~?Sk(d!bhY!l=oINbz3gsqrO6mbp-$X%ck>3TlDH5f$1g6?F@UxCH-?MAAeK+ zH})q;aM|(>$@wdb3)WG8rON*V%*P*IIbnU#Q)gtPZ4iJibK-lst57|xO&9Zf3zpT# z#9)nyEsj_eN^CK+AU+JIj+ilPV14oW-1)86-^^Ct# zY<0oc7w0zCRzW>|3u~*34#0gYviS1y#@fPSW_ba@Bb##@A_6bGV&6B-Z2bQK6Y2`o HX=eZcQ0eF% literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json new file mode 100644 index 00000000..6f351629 --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json @@ -0,0 +1,44 @@ +{ + "n_steps": 2, + "cum_reward": 1.0, + "cum_raw_reward": 0, + "err_msg": null, + "stack_trace": null, + "stats.cum_steps": 3, + "stats.cum_n_token_goal": 12, + "stats.max_n_token_goal": 6, + "stats.cum_n_token_url": 48, + "stats.max_n_token_url": 24, + "stats.cum_n_token_focused_element_bid": 2, + "stats.max_n_token_focused_element_bid": 1, + "stats.cum_n_token_last_action": 4, + "stats.max_n_token_last_action": 4, + "stats.cum_n_token_last_action_error": 0, + "stats.max_n_token_last_action_error": 0, + "stats.cum_n_token_dom_txt": 1902, + "stats.max_n_token_dom_txt": 952, + "stats.cum_n_token_axtree_txt": 400, + "stats.max_n_token_axtree_txt": 201, + "stats.cum_n_token_pruned_html": 650, + "stats.max_n_token_pruned_html": 326, + "stats.cum_n_retry_llm": 2, + "stats.max_n_retry_llm": 1, + "stats.cum_n_retry": 0.0, + "stats.max_n_retry": 0.0, + "stats.cum_busted_retry": 0, + "stats.max_busted_retry": 0, + "stats.cum_input_tokens": 2789, + "stats.max_input_tokens": 1404, + "stats.cum_output_tokens": 128, + "stats.max_output_tokens": 65, + "stats.cum_cost": 0.00049515, + "stats.max_cost": 0.00024839999999999997, + "stats.cum_n_token_agent_messages": 2902, + "stats.max_n_token_agent_messages": 1459, + "stats.cum_step_elapsed": 6.860883951187134, + "stats.max_step_elapsed": 5.8696064949035645, + "stats.cum_agent_elapsed": 3.769465684890747, + "stats.max_agent_elapsed": 2.946484327316284, + "terminated": true, + "truncated": false +} \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl new file mode 100644 index 0000000000000000000000000000000000000000..71da24d790723bdd2cda95d05e018a254d7327ac GIT binary patch literal 2276 zcmbVO-EJF26i(EpPVD?OqydQ^2?0_ov?g(anurS;AzDOesLBJ3W_QQ-$g?}EnOQqV zLV^pVMw+`07a(4Qm*Ad9;GPHIJF|}Sqm~PNVb7d7bLRYf=R5xK_Ah@eFT{_3J2iS@ zg&9nXt~jfNk%cIo?JBLS;9FmwRmWyv!^`mCr|?6#@7sKUbH+*NKaLy4({5bYF3w;f zV{vywYmP3laQVSM`!X<&p!P6ta}TZ)8L6SfO*FW-i>DyP#CjjmNx# z=KAsXwGmHA^Tt!n?dGY@HS3!NKVwciuJAn)r43&%{Ij^=qg<9%?XJt$Nxu}63+dB^ zN511kfh=-|*Ta*~|M-WmA3oraeE-Iy`lx@~Hvvb9nq_^8UNOm74 zdxxP;{Ze0XN_cM`ZCw0k!dYc>QMur^X8C*;_m88cc|C9G6n5XgX*Zw3Cls==AQ~rz)*v2ErVf@^`K}UU^;U{N835ppi>m(M&Xv367bTb=6 zX`MDxY~#I(TX$9K+;7E~GDrEmRK816H)c`BgW{cX!AR9ql;U% zPGJ!-cc@(_I%Qm`@H6de<>bt(bAQJ=ummM~(+Fp#EJoGWCL`#q|#q^BVms&UjHNL1C@pyB}C=1pitIk4UUq&T0fb+Uuk+=K<90M-sO?!(qM8`+zpjX;oUQB<=HcA zF~#ot#ZS;vLrsHb5#ZmvU_PY80;B-%F}U~BWOpy=_1NBCvb&%3ju>VZO}40gbtWac zjI#+~Pn;~Li4zuPpS;S(>*g977uAdnb1k!M?;yDF9Mnr;HA7iH;aA^wH=@fn4*>sCvCrdO@b=qx_05@J%5M%lX>tT z9%P3{S^97{*-iV0$w4pcC8^jyO88N>$M=u=eQ^l2HfEnFbE3ZRz;9SSMolmI7t^B= oMFNh&=TvaNOg|3)P_8athA-N!zZZ8px87a+^E0r`a}Y-NUzF5}2mk;8 literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..6f8de6744a71a3189eaf5d158aa6334e2ab00df2 GIT binary patch literal 106 zcmV-w0G0nAiwFqh7LjKH|7UMuY+r9;YGq?|E^upX0Bc~G>cRj4u~TZNX!Nj@R2HO8 z0aB?IB~vn(do-PM@{3ayN-|OvQZf^B^3xTP@+%aQDixA*GLy42^U}dmN)<|aQ%V!{ M0Kt^jsZRg^0Nti4g8%>k literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt new file mode 100644 index 00000000..512944ab --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt @@ -0,0 +1,287 @@ +Faker==30.6.0 +Farama-Notifications==0.0.4 +Flask==3.0.3 +GitPython==3.1.43 +Jinja2==3.1.4 +MarkupSafe==2.1.5 +PyYAML==6.0.2 +Pygments==2.18.0 +SQLAlchemy==2.0.36 +Send2Trash==1.8.3 +Werkzeug==3.0.4 +agentlab==0.3.2 +agentlab==0.3.2 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.3 +aiohttp-cors==0.7.0 +aiohttp==3.10.10 +aiolimiter==1.1.0 +aiosignal==1.3.1 +annotated-types==0.7.0 +anthropic==0.37.1 +anyio==4.6.2.post1 +argcomplete==3.5.1 +argon2-cffi-bindings==21.2.0 +argon2-cffi==23.1.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beartype==0.12.0 +beautifulsoup4==4.12.3 +black==24.2.0 +blacken-docs==1.19.0 +bleach==6.1.0 +blinker==1.8.2 +browsergym-assistantbench==0.12.0 +browsergym-core==0.12.0 +browsergym-experiments==0.12.0 +browsergym-miniwob==0.12.0 +browsergym-visualwebarena==0.12.0 +browsergym-webarena==0.12.0 +browsergym-workarena==0.4.1 +browsergym==0.12.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.17.1 +cfgv==3.4.0 +charset-normalizer==3.4.0 +click==8.1.7 +cloudpickle==3.1.0 +colorama==0.4.6 +colorama==0.4.6 +colorful==0.5.6 +comm==0.2.2 +contexttimer==0.3.3 +contourpy==1.3.0 +cycler==0.12.1 +dask==2024.10.0 +dataclasses-json==0.6.7 +datasets==3.0.1 +debugpy==1.8.7 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.8 +distlib==0.3.9 +distributed==2024.10.0 +distro==1.9.0 +english-words==2.0.1 +evaluate==0.4.3 +execnet==2.1.1 +executing==2.1.0 +fastapi==0.115.2 +fastjsonschema==2.20.0 +ffmpy==0.4.0 +filelock==3.16.1 +fonttools==4.54.1 +fqdn==1.5.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +gitdb==4.0.11 +google-api-core==2.23.0 +google-auth==2.36.0 +googleapis-common-protos==1.66.0 +gradio==5.7.1 +gradio_client==1.5.0 +greenlet==3.0.0 +grpcio==1.68.0 +gymnasium==1.0.0 +h11==0.14.0 +httpcore==1.0.6 +httpx==0.27.2 +huggingface-hub==0.26.0 +identify==2.6.1 +idna==3.10 +imageio==2.36.0 +importlib_resources==6.4.5 +iniconfig==2.0.0 +ipykernel==6.29.5 +ipython==8.28.0 +isoduration==20.11.0 +itsdangerous==2.2.0 +jedi==0.19.1 +jiter==0.6.1 +joblib==1.4.2 +json5==0.9.25 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema-specifications==2024.10.1 +jsonschema==4.23.0 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +kiwisolver==1.4.7 +langchain-community==0.3.3 +langchain-core==0.3.12 +langchain-text-splitters==0.3.0 +langchain==0.3.4 +langsmith==0.1.136 +lazy_loader==0.4 +libvisualwebarena==0.0.14 +libwebarena==0.0.3 +linkify-it-py==2.0.3 +locket==1.0.0 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.23.0 +matplotlib-inline==0.1.7 +matplotlib==3.9.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +memray==1.14.0 +mistune==3.0.2 +mpmath==1.3.0 +msgpack==1.1.0 +multidict==6.1.0 +multiprocess==0.70.16 +mypy-extensions==1.0.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.4.1 +nltk==3.9.1 +nodeenv==1.9.1 +notebook_shim==0.2.4 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openai==1.52.0 +opencensus-context==0.1.3 +opencensus==0.11.4 +orjson==3.10.7 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +partd==1.4.2 +pathspec==0.12.1 +pexpect==4.9.0 +pillow==10.4.0 +pip==24.2 +platformdirs==4.3.6 +playwright==1.39.0 +pluggy==1.5.0 +portalocker==2.10.1 +pre_commit==4.0.1 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +propcache==0.2.0 +proto-plus==1.25.0 +protobuf==5.28.3 +psutil==6.1.0 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-spy==0.4.0 +pyarrow==17.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pycparser==2.22 +pydantic-settings==2.6.0 +pydantic==2.9.2 +pydantic_core==2.23.4 +pydub==0.25.1 +pyee==11.0.1 +pyparsing==3.2.0 +pytest-base-url==2.1.0 +pytest-playwright==0.5.2 +pytest-xdist==3.6.1 +pytest==7.3.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.12 +python-slugify==8.0.4 +pytz==2024.2 +pyzmq==26.2.0 +ray==2.39.0 +referencing==0.35.1 +regex==2024.9.11 +requests-toolbelt==1.0.0 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.2 +rpds-py==0.20.0 +rsa==4.9 +ruff==0.7.0 +sacrebleu==2.4.3 +safehttpx==0.1.6 +safetensors==0.4.5 +scikit-image==0.24.0 +scipy==1.14.1 +semantic-version==2.10.0 +setproctitle==1.2.2 +setuptools==75.1.0 +shellingham==1.5.4 +six==1.16.0 +smart-open==7.0.5 +smmap==5.0.1 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +stack-data==0.6.3 +starlette==0.40.0 +sympy==1.13.1 +tabulate==0.9.0 +tblib==3.0.0 +tenacity==9.0.0 +terminado==0.18.1 +text-generation==0.7.0 +text-unidecode==1.3 +textual==0.86.2 +tifffile==2024.9.20 +tiktoken==0.8.0 +tinycss2==1.3.0 +tokenize-rt==6.0.0 +tokenizers==0.20.1 +tomlkit==0.12.0 +toolz==1.0.0 +torch==2.5.1 +tornado==6.4.1 +tqdm==4.66.5 +traitlets==5.14.3 +transformers==4.45.2 +triton==3.1.0 +typer==0.12.5 +types-python-dateutil==2.9.0.20241003 +types-tqdm==4.66.0.20240417 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.2 +uc-micro-py==1.0.3 +uri-template==1.3.0 +urllib3==2.2.3 +uvicorn==0.32.0 +virtualenv==20.27.0 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +weblinx-browsergym==0.0.1.dev10 +weblinx==0.3.2 +websocket-client==1.8.0 +websockets==12.0 +wheel==0.44.0 +wrapt==1.16.0 +xxhash==3.5.0 +yarl==1.15.5 +zict==3.0.0 \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..94b8701c7872a1b71d3f9158069edad7c604f47a GIT binary patch literal 8014 zcmV-UAF<#ciwFqh7LjKH|8sO@a9=PkaBFM;?LAv;9M^SAiq9eSv@AK68;?obOUC4K zcS(vOCCZ|x7d;Z?NR%yCRkyP{ce%5io!QKTbI(2Z z-nlcM@A`W1r3C%+PvmWD!y)!+t&$>}RbuNEV!BSsu&ip}_uX_!b;&GR!PkP(F9i33 z5x>g;P@p9IeOBI~ko|>{>gFoManx1f1b2fGTF$l%68Qb9K$t7Y`5h!oyZt2Cq19nOSewPQ$3`+Z1)$+BcN(cPWMZ8fF^ z%6V&3&e!CEp%>P4bCqTdZ4RXZkN91ymTCm0BmMv;a#p@f3U09MAGE5(%vG`VInOpw z1pcL>ZjiZ&iHVX`ArtOIMK|>gD?eT}JO}>;V4~z!jPQb1JI+{1VNRg>ha=Qo-8HC) z{lkk`RKCEtUIAHPQMKUe>x350>83_D1OIW;t5j>Lf@PCb#WP%8wQaQ)_=7o{6fDzm zZ4VmpwZQMD1p~j=)L4Ef3Vn>;C}&J`ujjYLFu?JX;wu@5Y44uvkD$HJ!l;Q1_a>N zjjL#G-L&n2(&6{!RfllsL1=;tg|4dRs)trYYxW0;S+F$F-?CZ(oKC;5>g7RTOByfG z=hVQCE8%r-(T0+63GL`MEsZeK9;m7|P+y{=1jFX-f={Qi4?f-WyWP)I^!sV}?}X1V ze4YzGcr~Vs(@z@yv+$XM|1o7oIR)Rb5Apku#}!lqaHaGs5^4tschD;Oc&eiz-Rxg{ zH_s8Dec|uE*md|J{sE_?YSzXL%W^4WD8Fuws2o-v zga5tC)Sk5Xhf=qfqrRSAJ;!CRQ+Irq|+0b$~xJ!BK<$N7bB*LyGxbEZEb;!{WA~2)e zvNBIcuLy$M|48<-%>m- zfc1uokjWY^|2k*xH%VPIpNmv}OZftaevy~_5~uzKhyPXs{62?2Xn_9?hx-li-{tV% zYk>bghkvyJ9&q^A8{psI@Nb52aAGdmbeR*wi09JXpmYWOt)!b(uUW<*%~&iE==d6B zVUFV;u&Sx5J)-0ljd=bcR%l5B z8hTNOm@GzPKzC?pqvj35>mGse7JWySIR2r6XXB836GFrSjRIoW{g$n->SiO{^Lq>x zB0?T(ptldlbUK*hP@Due#axv5OoX4f%WCGZ7Kq>DxHTi{rXKwRMDvzsYMdxCo4EEt z$F*}Eq<*FRpy`G3W-OiM=`5s6H2oO(g$+fe>0{7UKBGKN(@!c_5U1^Qf_*sr)EB?0qsmbV`V4^nje!m+hY)o6Z~pScRSI}X5r~nh zBH5SE%?45OC%}cRDxczM4bsO%`trGz5muCejw*}bb~&o=T_m!E`IG|3~>l1YjoPQh^}JObs`JVMTfXIz=?%DayL5+hVY`pU_dX7_~FwB zA0fRD2G#*Sdfis)Jp#qYyTVJ}e}NXecsCGH!ST?@6CgKcZBGd|FITxlaE`z9R`9-JeTEQy_@WULg zl?Xx6QGlzzmcf~~{kwj*3*BoiPQRacS%MEq{>weayuTyV(8`orFkk=40$V10hbq(;rl>Gf1a)s!tB{u*=^HjgrftjAU zO3iTB7SNill=*`jx~W+k&Uhv}oeg|BGkIxh@xsEz^9$*jsSDHT#hLRnnc3-u*^{&B z^u>kQ!0#Ux>rsK<#Rzv@S-EyKh_0aK)^$hE>xS;ua<&Rlbl`Vn0)Jm#*C3S+{G+O^ ztK+ybsu7TqP9yEWPXzuUniGQ_j}Qh)=B#3_qS|W?)_if{LQwwGjzq%m9;R`E-ysEl z_sY`B)r*kM^31;Bf>mKm@0VG0S|Y@CWT0_%X;}f@YywS@>Uoeum-qZWq&rb?u)dN_ ziojzkHG%7h^eaVkIwpBd!y>Qz8IRbtaZj(OQlK4Pe2N$El3%(BO!VAbys`B13b5hW zg}^@;tDuYs(;VTb$kHZL>ywls%|FTWU)Wkduup7LH8_a>FtiztHnbDlwDc)n`pGS& z$4e0KIIPqxFZDPrRjF9!gs0~`S2tLJ&P}>ER+Hk!*WIF zbK{QwIzg&PG?rgaj?<;qylMkZRn>4EbxzJ!H|0r03K^TOlgN9nYnf0S+?1i4P(ivi zb>8)(#3?sm+%R>63z<_!*>Fp~?e7EGtOH9nfxd8SNuHOBo>{=2GNS40qo0*zSubLe zcKeah6!TW{Svfr_KRc4-;!2LDSYjjzoz1naT5|L>;36WaRNdOh=xKRkLM|8tDy!OL zUAH_35*ExyD`QZHteQ1Ics9j2F?FzoB@NJ|dlKca019*o_cu-0#4^YuUMsvEtsigU8?a=?ju5n2q%^EGH*63sF>dQZL=R$5nSa`aC6Zi@Pl z6w@NKjrL(%H8p)?Jd+;FrfE~aim<_v%;~Z0G|!@KY)N?oM~^t++4NW@B)mN~lSdpK zWCPu+j?NVbN_d&+JF31>Stt**02xuSW6j(!shaURY6cw0l|`Kg=!zO@vh6J>Rt*qf zy{+EwfE4hTqzItKI>(=7jRkYLSk)a%BO{p2k3+UcHMk=VHrf# zz!4G>qhsgaz6m7G-&$F@zKpb}1qc3eeH7V7ldQzJ4yqgngRRM1n+`34{uGG)Lce>F zU$YG;zcVXF;n6rjq!s;TgxnHzD#Xf&K%{{Ur>YhxSrKz}fT0g>9qqQ*A>xRxGtX@S zM{rQGZjK96;UnW_2rw-(Vo8}SHGz(ok(%gi!^_Zx$7GX0cazt3S0y+CK-VL(j;>zw zDyhI9idsC^c&rzUn@Cjj)WO%sX^#Z!O`mQDn+ipA>z#{ZfRvqSyZ8nc7sk1?Gj*~> zy{iytI~6c7)YPg4f#SGBc@~yByfangVlv%!c@ybRX4)@L>2t%vxt6-=+Jw~i&Bb0d zzvrUIWSe*8KSKJw`l<>2R0|$(|4DDlV8Q($H3Vq$4eD4$=NQ1<+gC|`0ll*4ONEjR zv#%Q*x%@;Y8bS=U{3O7XuXui!i95PssRIxJSHbCNH|d;?m1{Z|>viZ-zGwtThiM6n zo*EB9@IDf;4cNpJdm&z=SUHIRsb03-VlW%>z%@EBN7I7)UQ3>6lODdoo z!U#RpHr(+NvDX1PmV&4Z$;(! zJ-m3gF2~XG8lqwVUKidwZ3s_<*loyexGmZJXIrv+vOPjx3nBXlLZOBG++3QvW|QJo z-EoC=ghBftXJVNTp@>6;H|2LztZ0PKcHIS$n^SN=xKGSvuuDPjb1^7Jk>;)mP>4?DI-4j?#fsG4+os02vY;$8zYz<;#0-F~Cn~n)AvnQ|?0y`@Nb}}Zg?4H0{2<$~6u$h>^ zCieu^LSW~Fz)r;kHnk_P76R)L0-KEqY%Ed;i}1vZ(E3GC#az*-3GybxF>Ca{@3 zfwd6Wq7Ya%Ca_a`0%HPuFBZOLMEE)-qW-uio83`tVz06dsr9!bHL*4M3lrQHy3>@4 z#$tVZgVJehhl4kw#gf=V5QwjceI<(6<1|GOt7co4xO#PST{ZN^jf`&BS|g^QQdEtQ z?~iEnWQ(1S(0yb&wW9mEe4=_%48F0kB)jvBtG}w+>Z+|)OYKOmNL-UQI;eOE+EB)e9@Q_}hjA|M+2-#DBI<#}+F+H6{b*`lAEHbhOQTlc!Kj zvvDzX3nPgI9@)LgQ!9bZ2!X`{kL=V(+N&By(?=c6)E#6n8*TaYN89qYHi8~aF753# zZIK!dQBkR#@rdI;0gboS9AW01i`>e9O#Dd)WyY84_tFQKU;GJwTuE0xZ*9g;a{2v* z@Vw{)e=v5Z`~#2f%_H4!b|n&*#iJ<7--4^<^0&KCX@7{{9^IF2tKeJh7JR$ig736j z@O$kRe7D_#@3mX-{dNofUAqPUzTJY~Z@1t-L<#iaSN>pI z)P_U7@ml}8rAah7{2NF8pKWUu$A9BE|GNVYJ0={*`EXktadtS4)7iNljyPi+$LZ?a zzMeOm)YH?s6K`AV>Dv}ZoNbQdNITTi3}mEH%>@76pV0p*BoTei3{7NWB%lEJ521*K1xp25c^m4>4aO@et{G4iOS(pbv} zCH?k?-+z%lD5)18kmN#`aQn=mMB)rRFaiwD52c4>Iy4$&erV{dB%i^1x>;5RujMl| zjAM{2g~(g(Ld0Sdv>e4SbaRa%Ob*F-*lT{M@vPU73}NW}Q2n6Tkc_vI&JT%WVnY)R zq^254ZF4lNUFz)S7)F13NOtT(1E*z&$zXz&+=6tn0V#T%Yi=k+-xO@3WH)qpf@f<;L=0fWu!r>(*E*T5OsXqw?BXZUPE6Vw_|c zXsoRf>Ip<_Jg+uC#Pl#EH>?xQ5ApjRhj?W$uF=zUG~?%Rm~Kc8=jYG{8Q0J1%u%bJ zANoZdmV>B*A1s_7LJ721+NrHl+2zx94RUE8K
%3?CVG* zBF8-C-!!+ z2Cna9(|2V)sIcI%1@Vv*K$*HLha1|I8(s=BV->XFq>_O@P_;diXgT^cpWEN)!&R2} zQp(nqlqL%);W~x z1S5gm!$yG-R1pD2;=iq(`QsM*Z4>#m7x%HKuSr99JK}3n#coG?O{&=KNUup1yB*~< zsp2Dw@V0WY`_WyKI(9v>Yf{IqM|JU`Jn;M;n{24Ih7VJeY;_|SxrE*a{T^=+3Lg0V zF2pnie$m4>u;?>B3u`0EY&toL(cmiXcIo(97yqdi?|-3X0>6J@l|Je_`UaS8@SEmZ z;Q#YVe6>$^Wd3k&aQOJr5*gah*8R9c=L3iCh^Q_luUK-SVd)TG_OhYFkV3g`LSRhZ z0MA{LOR7V2K?yRp4XBlXHfy!)lq}ECaQLt2juTy)M7F%D22!F>d|L`2!B~dt`bI4k zl=}Q*Q~(elr3|Bj|7>X+q<2AgfOiK4%SGb%MPGhX{-6sVNpX|DVB{KqxeYCeZ=va7 zFCGc}Uhx>9+xsCPf6ekBBm*`;XjrlA+BLdrYRM4!tV2fl&KntDf1=6ILS$Zy7FIFR z7E_b(WjElO+3Q_^S0^Q79r%JA>aamb^z}F~EEz1ofaqe3UN{84M9Yv9EQ)kf@mEhAQ4yKa#N$k+3R_rj7}QXfjSi!Gg0V-sQ(7B$r#C4072Q| zn}$@{9^k*Ib6d_6$CY60qrDY)hWQ3>HIzats#Fc#DY17L;d_Fp;IYxp29OnUp-QOQ z1bV8(Bu@%>*A=drtGH1Elbi}nt<*J;p$C?MJXnTdu~snN@M$M;B}>xquzVS6os-zT zc|9NzDi-=xI?@Dq`=^KTxs&qsJSbZBxlC(ET+M6WpO_KH| zNqdu|y-CvEB<;02P(dcvke-!(py|0z^j(janq~d!42>g z;w40~>hkao#yu0`DhPq_t|0b`NrC|{zfTZ4GcC$0OUVnyhFWtlGNv;T2-Iv+1d5R0 z7!IP)fKUzNG{&Nc$*KR32j;x3A&r#0#Ddi!o`WOuDmKrwAR@&ZH}FN7SR!PU=7p5_ zl!38(rhHOmq6x{8F8!)nbf0VQ#33!gKftzzu9 zVbfcK3YJE2A`Hc0F0TUXV4X;Y8l}x9l~U9&!wC@WDYoUUmZ%}KNj6sOR7nAQi5A+i zRi7)MF-w^Y7)TK+pwm4FNgS+_TRWTCuB*C%dWGVqWmQKuRNc*0oH@B@SSsGqlu4)4 zqf*5JZR9G}IvLsId1G>or$jbe&)Awj#?5I?zN({ejXOyUa=}WH6THg5i(6u(q-KObPcaI`_UB`$<3|Gz7sj}<3+i1uQZZ(pWGk34DJPFcnE&x33XbMS%FdS<->zU8H{J?x?gzYI#ewwH$gvX9O-O zpOFkS%Mn~B;2A}jibvW*Lua0K9o?}iJGdT1L=Q$koyTrvt;4vX;fP_G1?X%SH&vMQ zL$Bw{Y*VL_LOksVDfy;VA^erb73i-}&#DR#LtkSH69}}yndxjP zVCxzVwTti`NeMdYCRhfz1G=oGx&->Es0JM*uwnpN>N83zF2LZ8xODk~AO*TK=MeA; zXkp;7Xdzt50WpGXiey77lvHE_+C+K89u4EHs~c=x1Xm8QQW&?Q1i)v};#Cz2fSp-p z(#1g*r$xBlMPEF~oGWA0c3dR07&ZmhJU|7w;&k(s%hbKlGm<*1KzEb4UR9+NT$xS3 zkifO*VoGGV=21YcK3s2<2n2Jeew0WBxTVrt5+eY>so)_qZ{;nJv}RjX?C4AqD9dp! zOTt9$2>Wp%@K~#jNe-xjz7U%V$0S4(Y%vMs*n1f1VkZvXbXBT30$Q>JQ3_iy4OM}G z3yKj&_<~r~sRsbf)`xEY3w;HNypV@(fab+bTavEOk<+1zd?Irl6dxfQu5f{1B9E~Y z+4Yr5iAGZpyrd#3!8U~pTu>2P)bemK61oz!0mirjiBSs2O^H!~s7Apv*f<7!;CSam`wU3m*4a971*U~I^!5upxM7hNjCloDa`ENk zbYKkcwdNu(sr98=zM$4*$uB2)KrN#ZqA^9zCKV!0c6&1%^~= zE=XIDY^w7=V>$;%Tfp_mCOSu!Ssr0|$a@JjW$~3r9^Fzkq@-&tRxMou$k>0W=Gehc zv;!xixl6WH*&@Xy>@`en7)K#3>aLNG8s6d#8AI80Haj#Xv2NSYhtPzeM`#vt(A(hM z=M0N3#fD3&U?i^azpA9wOtk{D1kty{jS1JT3GYCATOZ57Xl5u^jdb5whTuZ$0zR~@otVSJD-jNk4Deb)QI|IaO&ZN!F~Ty>#= z%$z)ZotevDre@~AlI>c~Fe>zI^87joj}iKVc@I2>#^FkWzbH3HA0Ofb*XDZs{g&rO zK*~A4s{rA8aQVgT#9tWSnNE2AemsJVW|Kqi`C+ksyxAxUkl0~px*2yb@DIX}3Q;k{ z@J3FxR~gF^yQUA=_Xk$1?)a27j&D2*{6iIW6PR#SL$LBv;)9L^gU7gL@B;`p$wt_g z#5sQeyCt62fDQ`{c;{RA_Xqv~oLE_A&he^vHYd36AJo7JTQz!CjxlpA!pu1Izp3%` z%s4(y)A91e19p&TQ(PO4hATUw>1pI(4l%MRT z50pEBWrX_uV_`*fvo+ixyyo@zgE$`L__4v@^10(*dYanZepa@g(RcdY&qIdi_k*v5 zK_1WJmq3X(0e=M}0T;ZnQ{S2GhxucUy}9r5!ViCZ@vpw~+?)PkAs84owqzcPXIfR3 z_ucW;ulP?!@<6<}gig;m;%k)arTSQ%G0XwhkVs|UO`ZFk*m#Z<{>dlHUf%D=xR-T- QWiOxlKNAC5HlKt50Jn^MA^-pY literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..636120ba7912abab96d636c7994216b2746b8980 GIT binary patch literal 4893 zcmV+&6XNV2iwFqi7LjKH|8sO@a9=SlaBFM;?L13x8_9M2lLD!)wOVD@wnpHt30lOL ze~HvCMRFzXNTfnizt{?D05ph!1~c${_}R7MILX#ZN!zh&CzqUDl~fKn%T{z7&TU{0=e4?fd*$E$WBr#`{-I2V1))nH-cheg z8A$bI|cBfqDvrO*)VI;0VznfVm-4YxJ2}n^azAtI2%%hXmtci2dZ89 zz;!^J$|xOhKux!Dns+VT`)1_~om|xni%52bNF`O#j6JDf97=^Fsi>*NXR5x(n&OA? zlWuxBWmTe(0-@g#2)s-$SUoAj(vCwjh)+UFfpLle-C%Bn`Lr_=~&(*MMJl2(*Y&EaOogJbm_2O;wWFZa?fY( zYHz{)+}(B)aC1oSMZU<1-! zMc;EUidfHdjOaz91peDpYCzLRN9s-i47O1c2qV@J#Bt5P9xj`J374S9pl*~15AD&q zVgmOKW=aTbp%=XS-$W;JXY*ML>=OJn$y;6pHguNSA1YZ6JUz@ULF^g8a63;eE;PHA7zUwaCAgy^{}t zK8&#u`nn+BZ@&p7&LY|brD_;|l zoQ4?A$}=pA2jzKrL0$y+-H?~S5I5xyzzRd)z+3Vvp)tj_O|{_Id;)puK4o$Z`!dA{ zzmCE6h`4S*F8UCUGAJp>xe}miv|bgh@eqdu(}%Yt0(9R2?Kt#W2$~eoR0x_8&};}g zEub?Y=$wGgx9D~wAejfd*(5Eu)2#sQy8MxV-VQ-O7En0^y(gfXp}u}w-j;XdUHO6h zP<{m8kL4$#|KCA8`FryBnO7djzboK*-&5;=`#>%Tcrno1yYe2iRAg1IvesujfKSPq zPy9gE1)&ju{XoFXkdW(4Ku5Ig3&9Qo;qpnK^9S;C0sTFJ`6I#o#{&KbP4GVw@Xwmy zp9}blCiou<_@6Yve6(~%`Lq}AscsRcWO#|b(QuA8Y?Odb zEvryzg_=iIixn(NK_jBS1k9qCCo;sM=ZcPr6YUNZenmF7hp0~(rn;x<&2WbfX$sVN zqJqGD?$6Y0ipB|cLY^da$g+>LU~1uSXebztUJ?yR)N}8Dd+&YNOBz@5zqY-Qe-S#* ziSrzsSHM*R=&DW=OE4Rsg94Y8nyYC1L)!|Nrx#2EC5WoqTV{!vmQXA(`HmXpvPJA| zaHOx@uP6AZRfNi2Ys;~dVLGtv@mNAx3(h@3mJS~Zw~RxtP~(B%xxeyHSbBhEP4oeT z3DNiL3adg+bN#lM-5z5N8G<}P$e0)ikJJ)Sfn2c0;gCB--K*FxNbM+e7>Z|b9MkE< z3jTJLJsmrMf(q7Y$!u;m=Ta#<{n5eUadu^KZZVTtUs;5; z*0@(cyL5mv9=Nl6@2(qM@#XhbOD(9HY9HlI1#*H*d$TU>FQ_Fr%eeII04^9OZuxEV}e;sDmmJ z%nE{wUQ=m0wLi@`vi3RAeyz2C5TBU3q6v`r;$-uJEdDyN8TNv}ey0UHS%G9@ajZpw z^%ldb)eJrDsCma$HIC4i%?OOX7&>cb_X#?yFPr7y$767ajl&8_kv@e*H;#3M8itgsuXienqnRkc<(OkCt6QVK-$@yamfl9u`jL9WO& z)ZREvvZY}`F@dJ8lyKd5UCPxDrD-Jc1)FWKfI`zYbU=rkp{Y7_km<}_4E-Q;CJl@P zn3Ui`5}eUCB9b4|ez46xh-4BNI1e_YWvT4wMH~-_lDa?nX-tyTGM=(wpO{SX93|b9 zGLzDKi3wq@iOCc{Nld_yvQ6V?V)6#iA|a{O?W4rx4JnL%G&4ab5L17_OV z6!ak}`VkPkm*SG>D#XG@3CLp4V@$##IM5~{-?R}EaZpBRn^>yfoRF?bew^$Poj~Rw zj|{V9C1$1;CZTm=DGg|zluEr0i;Vn^)eTEU5w1(0)*yr?ieP?X zPE1LP9lSEi*Cj8@fh6`iXbh*fjzGN$49khh=hAb((}qhYCcl??nqoO5#k~l+u`z5a zdPz+rvze(}hN%KkgbD{5XQy(rqKWC)k@Er09%06dnW?PL_|??h9$|8@4Ggaaxo{vj z;Xe0w(0!AV=REKNGz86#J&V9(ZpQ0i7zm(n7E2!BD;B5|k#CvtZh!$FZH;jUtU$gA zMgVSXNc<^R7Q$t{u3AveC9)|92itTAMEUc{7;4O?Lg1hoN~u+0#(~ZJ9uF4jFe%*H zT6=;+)SCkO*ESyFJuH~(1`bT6j%{AP?*~qh@|flb_tN?Rh7|(ld_>HRBeqfy_uKlC zFoTMlpF`Y2z^JGxIG;RhBp5gYQDO8X6-~n{2p5Rq#+jdY08%y$eo``(y*(_V@d7-k ze!?p1(4<=+r2+_jmVq>-Ks1d*hJi*cp?rPWc~&1CA?;pcxG4q8&|neX$eN2;%xx@` zA%$jHMzxJPj8k^1k+Ln)p`Y<}Lnn>wn4E_plwAZ2VX?<;@^%MfBr8OLx)%X>PZye5 znk%b^AzQ$trk7BYzN|O%4eZ=s*#sLj^@S+jwAc5KzW@*2da%2@1(-uTG+nycm>yo? zB94(%!AD7m#3QLyDUcNgitF9Dq=T#Cn%@zleK~J>oeVQXUcqn9$gaR>L8`1*VKj+h z)s-S+>lIVI5W(ZE(^yM@AiFxqq7>7a$oM+) zpUy_dXZ%HA!RqU@a%h``G;ZK~H_gSvpCO&!TdejX|6z4chpb$3Uy8)Yo)7V=eT)SH z=)BD`rZ+f7@pkefQdq%s>(E%SV#2a&TM$m>yXk) zJ8ZGW@!Br3hAnpY!Bawz!+Haz#geEk@PU?hr?O3ob$q%2TJ>`T#2UC!(O{Y5K+>~d ziB)o6I2~nTDji>8hmPqAJ}=^TUHnY-3h4w!^-tmAg+uUukZFL3w%Rb;j~Hbc>N1PX zO=0OS6b$f|qHmcGU?i8J_AYUZ7mnjo4>M}7B@!*6BgKlUl{VPToGLC&6nakF>0aBq zCvJAjeT_#w5th+RBpY}!hE)gqqGv=nZxh8bbn(233x3+sSx`cy?wj5he^`j`%dAxQ zgYA55aCZ%0iglgp4}%Ep0=~zgBjO>I7;e?Sgaer15}Kg;SBNyw(BqP)$IH&053{OX zEk^18vBNZfb@Z{cQT;{xh>IRMx)n_;kz$Q}wSIoj-s^b;I6< z5lhUL(f>6KpH>~%^v|X{ILzQ-77ufHU=0g+ScJn&1`k;;W->ty=etWoZu5qQ3%tM( zl4paGFLa&c^Pc2j?VKx;+Dt%ZfOebA3!cne+S#Cm7rWQO306n4;tnw~`^wZmnC-aS9RLpdp z%+ukhSPEKrrhB7;OS>GDcDCE4y&05tuG^*kASms8w@dqBP}+rVmv$&9?P9k}yAsS` znQoW%R#4h(w@bU)SczVFvl8`IqL%}=KbV=Wd8^Oaz|#Hd8-sN;Skg1t9N=2?GC8zO zB~Q3``M}LS?5dw@O=FF1B<}4G{zv?>V4zSi4#V4#=&7Q=3G5{u3+*U*>9D(3$oE=* zPtR>{*LZabt~RTwe)Lxwj}E(Y6AAHBRK&|s5x*4`@k&(0t5FfhqawZ^6>%ae;&-DW zPDMpb2NAuPVThrK2@_4fj)?6V3)9R+pvg7SEWCz>@0APFtiDF`7arp6sE8j&wdHnH zj=NDgK8%X^Br4+fqax;`A{L?|mZBn-qas$KB37d!YEcpOh=>mThIokLXYYb{7aMm= zT6SJ3^4)o(EGZ`@a+!%qm!1!_<%ww;rg%y?22)iX`Qfj)+{(8rSMldTmGR(@gA%l_ z9Qa94WvbH;f}|IJfAK^1gP>vjA&`A&yN}JYJv~coqY+5D9M8lhHm_)8IUc_mla_EZ z0B_3TwY0?MG!Drzh}80s3(?XHk5Oo<{)`h$$0gh(xEycZBN&%pPF{{THWgHJ`;!eEdy~ucpstf()AXJGP-kCkav>anhU3j{asxW-PQ&tDVJQe6#!U z&F;rHyC1t^_hTrtEv0cL#gl?jIuc`cx@oqRme6jz4JJ7J#yyhFSV6Q|j>hcOikAk& zW{NpZfx+6DpmBnP&HEgeoCO`A=wWR1)5Ro=wwHO>p77Wb?9ItG=@Wm~0t-TqFWO+e)k&s+zdeD`Nt zawfJgB&X;&Aam1g{L*pXDEC!Ug#TKbNIi$wg*Y>wLJ{_n3JvR>m75JJ_nQ5VzCOVn zr=2rdZZ0}+a$Mf?(Ro|P<-HJ{_rY;_FCIhgh`VBsDZ+H-7$O{3zv=8T;_29u=Z?WV zlH}eABmtQ9Z^o9`4-s$1TpAObd2ta=6M>Dl(nQO88VeS&HOH(fZ(X6cS)lvtbFVImRCz?JJA9?1)EolY`+8Gt{L0nF|8uSJD^?nfS+l#Vj1#Fb z0wuz6>x%%XS6c)si`F666;(x~3ZjkdCsH)D;d>$lLmR#)QY^ILdm@EG8@|8EBB8bW z6DkneFg}^$pbg`bDGb7Mo$JsclN=~!De(L@W-14H0o|lA7$h+8@L{$4f}XNr>Myd3 zmvl%xNB|^d4<9tB_=j=N`5!RY7Qj!{O{1)8_%OWEL(kx!%I$i81!pz>TJ99l_w)Qu z8`z(;eL4H{m;67?@KMAeDK<6!3q2QZG{hgveYvXq7kwwtX6KbMD8L?F;Qxg*uts>F zKBcC7MMt?ByfeASQ;T8AJx&4ChswoMGeJ*puRmJZUd!LUyLD^jE|~DY|NTF|`in3B zhMwPEd6W+~%Dq1N_WH`s)+U%|bZu*M-9vamW9#=fcDB~m^BZf39@|~nkr8?AiTlEF P3aS4CQjX$dzfS-F8Y+9X literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json new file mode 100644 index 00000000..351aa01c --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json @@ -0,0 +1,44 @@ +{ + "n_steps": 1, + "cum_reward": 1.0, + "cum_raw_reward": 0, + "err_msg": null, + "stack_trace": null, + "stats.cum_steps": 2, + "stats.cum_n_token_goal": 10, + "stats.max_n_token_goal": 10, + "stats.cum_n_token_url": 23, + "stats.max_n_token_url": 23, + "stats.cum_n_token_focused_element_bid": 1, + "stats.max_n_token_focused_element_bid": 1, + "stats.cum_n_token_last_action": 0, + "stats.max_n_token_last_action": 0, + "stats.cum_n_token_last_action_error": 0, + "stats.max_n_token_last_action_error": 0, + "stats.cum_n_token_dom_txt": 1257, + "stats.max_n_token_dom_txt": 1257, + "stats.cum_n_token_axtree_txt": 75, + "stats.max_n_token_axtree_txt": 75, + "stats.cum_n_token_pruned_html": 658, + "stats.max_n_token_pruned_html": 658, + "stats.cum_n_retry_llm": 1, + "stats.max_n_retry_llm": 1, + "stats.cum_n_retry": 0.0, + "stats.max_n_retry": 0.0, + "stats.cum_busted_retry": 0, + "stats.max_busted_retry": 0, + "stats.cum_input_tokens": 1594, + "stats.max_input_tokens": 1594, + "stats.cum_output_tokens": 64, + "stats.max_output_tokens": 64, + "stats.cum_cost": 0.00027749999999999997, + "stats.max_cost": 0.00027749999999999997, + "stats.cum_n_token_agent_messages": 1653, + "stats.max_n_token_agent_messages": 1653, + "stats.cum_step_elapsed": 5.879024505615234, + "stats.max_step_elapsed": 5.879024505615234, + "stats.cum_agent_elapsed": 3.029170036315918, + "stats.max_agent_elapsed": 3.029170036315918, + "terminated": true, + "truncated": false +} \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3399e40c8abf6f4fe97d95219df80f775fd8b417 GIT binary patch literal 2276 zcmbVO-EJF26i(EpPVDBVAuUK0Bm_vU(3-??+w_V?h!znVs`3D%+1;@{^6buPX4a08 zkl+HTk>;+$1&9~n8F&I7fqNc+@60;Rj|vy~!k#&E=FIu|&UgI7^`HM-U5X$7Zff+@ z3NxG)U2#$gBMVVF+f`au!MDCRsSeH1hS%Z#kKy}p&$syy=Zurke-bx}r`@=)U7W!} z#^Uz!F8Hly&FLXoc%#*l7b5s~F6t%2$|ZfB6ta}TZ)PLzSfO*F<}Tb`xS&+gjmNx# z=EmW7wGq!r^U71r?dGY@CF@%SKVeQguJAn;r43&#{k^>8qg<9%?JmnVNWT=*GwJiC zC%)rEfh=-|*Tb{Ve*c9&zxs`dv{IMM)>WmA3$OfUE-IyGlx@~HbLWCHNpH8;Pj(+A zdk3LT{mMXbN_cM(ZCw0k!dYc>QMur^=lOgQ_fMmxMLnQ~rz)*v2ErVf@s;MMndj;U{N835ppi8zdISXv367baNXc zX`MDRY~#I(TX#|G)NjX^GDrD5SH4A3H;V0VH%E`G`r~Za@UV2q<-mFalymtS=>w8T3BZl&5=@Ar zVCxolO=4|G3yl3+dR)%A+4>rp`I#)wdi2FupT)K z=+2mQCLHbfHgN|V8eLa`;$I_;nD%gVo$?{dNkgWiB@li&xy+Yqj@$8^p?2nF?MBci zx#Vcv;2RYyL>`XrHukL(l?_KLesv^xj>3>{H;u^96F_(_G*WPT%B2G_7=}EVvnX(| zI+0=;dGgq|tkP4U0b#EbF(6JBs^$vmyFG0N&Rx>2_|Bn%WbB?`Sq&301 z1=?kW9a0XF%xnzW=4ifSzk&h@;AY9MqqakI0aMiS;i&C*E=f>@T-Pofz3*?3e=-mL zqqM)fpA9me^dI&gCH;dznmj%j^pXdU#N+<{?nAzRz=LmN_K7kl>I?V%rsWgV^qhY& rJsMFY;23;P1^27;cRj4u~TZNX!Nj@R2HO8 z0aB?IB~vn(do-PM@{3ayN-|OvQZf^B^3xTP@+%aQDixA*GLy42^U}dmN)<|aQ%V!{ M0Kt^jsZRg^0NxBLga7~l literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt new file mode 100644 index 00000000..512944ab --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt @@ -0,0 +1,287 @@ +Faker==30.6.0 +Farama-Notifications==0.0.4 +Flask==3.0.3 +GitPython==3.1.43 +Jinja2==3.1.4 +MarkupSafe==2.1.5 +PyYAML==6.0.2 +Pygments==2.18.0 +SQLAlchemy==2.0.36 +Send2Trash==1.8.3 +Werkzeug==3.0.4 +agentlab==0.3.2 +agentlab==0.3.2 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.3 +aiohttp-cors==0.7.0 +aiohttp==3.10.10 +aiolimiter==1.1.0 +aiosignal==1.3.1 +annotated-types==0.7.0 +anthropic==0.37.1 +anyio==4.6.2.post1 +argcomplete==3.5.1 +argon2-cffi-bindings==21.2.0 +argon2-cffi==23.1.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beartype==0.12.0 +beautifulsoup4==4.12.3 +black==24.2.0 +blacken-docs==1.19.0 +bleach==6.1.0 +blinker==1.8.2 +browsergym-assistantbench==0.12.0 +browsergym-core==0.12.0 +browsergym-experiments==0.12.0 +browsergym-miniwob==0.12.0 +browsergym-visualwebarena==0.12.0 +browsergym-webarena==0.12.0 +browsergym-workarena==0.4.1 +browsergym==0.12.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.17.1 +cfgv==3.4.0 +charset-normalizer==3.4.0 +click==8.1.7 +cloudpickle==3.1.0 +colorama==0.4.6 +colorama==0.4.6 +colorful==0.5.6 +comm==0.2.2 +contexttimer==0.3.3 +contourpy==1.3.0 +cycler==0.12.1 +dask==2024.10.0 +dataclasses-json==0.6.7 +datasets==3.0.1 +debugpy==1.8.7 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.8 +distlib==0.3.9 +distributed==2024.10.0 +distro==1.9.0 +english-words==2.0.1 +evaluate==0.4.3 +execnet==2.1.1 +executing==2.1.0 +fastapi==0.115.2 +fastjsonschema==2.20.0 +ffmpy==0.4.0 +filelock==3.16.1 +fonttools==4.54.1 +fqdn==1.5.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +gitdb==4.0.11 +google-api-core==2.23.0 +google-auth==2.36.0 +googleapis-common-protos==1.66.0 +gradio==5.7.1 +gradio_client==1.5.0 +greenlet==3.0.0 +grpcio==1.68.0 +gymnasium==1.0.0 +h11==0.14.0 +httpcore==1.0.6 +httpx==0.27.2 +huggingface-hub==0.26.0 +identify==2.6.1 +idna==3.10 +imageio==2.36.0 +importlib_resources==6.4.5 +iniconfig==2.0.0 +ipykernel==6.29.5 +ipython==8.28.0 +isoduration==20.11.0 +itsdangerous==2.2.0 +jedi==0.19.1 +jiter==0.6.1 +joblib==1.4.2 +json5==0.9.25 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema-specifications==2024.10.1 +jsonschema==4.23.0 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +kiwisolver==1.4.7 +langchain-community==0.3.3 +langchain-core==0.3.12 +langchain-text-splitters==0.3.0 +langchain==0.3.4 +langsmith==0.1.136 +lazy_loader==0.4 +libvisualwebarena==0.0.14 +libwebarena==0.0.3 +linkify-it-py==2.0.3 +locket==1.0.0 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.23.0 +matplotlib-inline==0.1.7 +matplotlib==3.9.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +memray==1.14.0 +mistune==3.0.2 +mpmath==1.3.0 +msgpack==1.1.0 +multidict==6.1.0 +multiprocess==0.70.16 +mypy-extensions==1.0.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.4.1 +nltk==3.9.1 +nodeenv==1.9.1 +notebook_shim==0.2.4 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openai==1.52.0 +opencensus-context==0.1.3 +opencensus==0.11.4 +orjson==3.10.7 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +partd==1.4.2 +pathspec==0.12.1 +pexpect==4.9.0 +pillow==10.4.0 +pip==24.2 +platformdirs==4.3.6 +playwright==1.39.0 +pluggy==1.5.0 +portalocker==2.10.1 +pre_commit==4.0.1 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +propcache==0.2.0 +proto-plus==1.25.0 +protobuf==5.28.3 +psutil==6.1.0 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-spy==0.4.0 +pyarrow==17.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pycparser==2.22 +pydantic-settings==2.6.0 +pydantic==2.9.2 +pydantic_core==2.23.4 +pydub==0.25.1 +pyee==11.0.1 +pyparsing==3.2.0 +pytest-base-url==2.1.0 +pytest-playwright==0.5.2 +pytest-xdist==3.6.1 +pytest==7.3.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.12 +python-slugify==8.0.4 +pytz==2024.2 +pyzmq==26.2.0 +ray==2.39.0 +referencing==0.35.1 +regex==2024.9.11 +requests-toolbelt==1.0.0 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.2 +rpds-py==0.20.0 +rsa==4.9 +ruff==0.7.0 +sacrebleu==2.4.3 +safehttpx==0.1.6 +safetensors==0.4.5 +scikit-image==0.24.0 +scipy==1.14.1 +semantic-version==2.10.0 +setproctitle==1.2.2 +setuptools==75.1.0 +shellingham==1.5.4 +six==1.16.0 +smart-open==7.0.5 +smmap==5.0.1 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +stack-data==0.6.3 +starlette==0.40.0 +sympy==1.13.1 +tabulate==0.9.0 +tblib==3.0.0 +tenacity==9.0.0 +terminado==0.18.1 +text-generation==0.7.0 +text-unidecode==1.3 +textual==0.86.2 +tifffile==2024.9.20 +tiktoken==0.8.0 +tinycss2==1.3.0 +tokenize-rt==6.0.0 +tokenizers==0.20.1 +tomlkit==0.12.0 +toolz==1.0.0 +torch==2.5.1 +tornado==6.4.1 +tqdm==4.66.5 +traitlets==5.14.3 +transformers==4.45.2 +triton==3.1.0 +typer==0.12.5 +types-python-dateutil==2.9.0.20241003 +types-tqdm==4.66.0.20240417 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.2 +uc-micro-py==1.0.3 +uri-template==1.3.0 +urllib3==2.2.3 +uvicorn==0.32.0 +virtualenv==20.27.0 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +weblinx-browsergym==0.0.1.dev10 +weblinx==0.3.2 +websocket-client==1.8.0 +websockets==12.0 +wheel==0.44.0 +wrapt==1.16.0 +xxhash==3.5.0 +yarl==1.15.5 +zict==3.0.0 \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..2aac84fd55bcf024fd5998d534e006eb38ef3ea7 GIT binary patch literal 8000 zcmV-GAHU!qiwFqi7LjKH|8sO@a9=PkaBFM;?LBR59LIGe#m^=6X<4#mH(pbYNyg-n zcO*rM5@k`8Ow%h-j!0Q_6y>^iJICAO-tJ{TNS<94NfEaavS^cLgM74U(*Obb+jWZ= zLEyLrf)*{%BIuvAKUy^T)%HhG7%f^9NZ*@}{W#u{r-+W7mOSuo_RYLEZ{BGE@qk=$rQ+JEc2Oh9f?Dg9{Xl776$gBQFckz%vG;dfAk=-JZHC;8VH92Q( z$+@bWH}w3vZm!X+{;mF0;1R!l%~Fk^c+Bs`MApicNZt*W{KHm-nAr-pKI_>Aiok!N zpc`anbab?6mC2|(TGmZ{)5?uh49~%T0T?a1Wh1VOkntrrldnZJ@qLMG1z@+XbICpjXbWP8h|UMN0CrFK)97w(alq>HR*c);=6v1 z_#6m-4Pbz0X{b=E5L80fA z^B|TB%JU$TPEhG3WuEww>bka`^ISHx+zIYb-nuwnhZKphYYDEq`E>+29zq0WoLg4r zsTfw2`!vt(Pf!@7ZDKitF}!D?g$(*!0y@H>sRVS4L(>W9IEPLopl3LAYFpdR#W)kN z3pLy_IbDk3UQu4)(1ir_B@R^*&}$sJl&I_D$_-^%Sy65(uPC?RcU5_nm-_^kQ$DHO zp<21Ae2T-f3H&Qi{-#pkd26v6pHS8z$56_ONpme`$URC03Uza$Hb-QLs zk;8wx1|D$umuui(;qb49aByNS*>af^!-(gFJ3;Y9^tY03R=j!{eKcdfNTB0ukcByp zf5@u1x@9gC1IFeU{5T(KFY|Gk4T{N{Dh}~9OSpf`7{ne=FndtRDH`$oBdpM(1~l}7 z4l!ARMz8MB&_>M}gx5U?<1PA*3~~G;dC$fnc^N{)JdFYp*gcl5ujytj-19pP6(T|& zYoNCe$8g=_Kx_ z7B1d!Xt1dCMxf4%anSO@#aF(`0LPW%tUoMVyd^Wx5#l@YA<_#MuO4GX8R#(hy-z7^G`$Y#HsvaZe=CF^j=>AcH=x^g zad?z2K$!^6(lFb=?%XIG*~70rj3JJ|WQ|Un=FwGbxlXJDx5AKK6Cyx&h5%bP-RrhS zY$p%}#iy-79No@?#t!=FqMw>5h4{Tzq2Lhr1~{>>NA6{Z!4O__7!2rz5kGkP@I#~z zz_8uRN3S_0>JcbD-W6W*{uQ*?&bxt#3QmSb-U4!CMlPBzZ1P)SQdC5_>=qHmN2mE;W_#?!r337Vf!38q(-M?OR4J)t~7ia+L2!S-Qt#}O_zk_ph~7B0S26d>YvfG4-zAd<>Kh(rux zuDL}n!VYw`!0SQQrLR+OyNYWv3CODm>Eof}EnNeAFeU#WhrCGiwW1qhPYVm&JG+Zo~Z7gw%b4dN@P*$v&%bGo6s)vT>T6dm}j>A*jb(=|wq1^#i> z*3}VQ8Py0#NvDx^;I{<+5t@^Lo{SL&NoK7=wyfIg4%U2S?s8E2gVvT7zvDEG6Z}>w z@H zZBhUpQ>js0N2FgVn$tSYYdS6RN}u+KT^;fCD3t>3@ZuA^c)R@4GBDA(Jbz>H*R@P24sOcOO{ie3 zF?HVcO0mS?0CYCjwyFa|=KvQGNx9-y2ZzqdqoZ=(AW&Jw zCL6ltIgqemK3X1zI%L(X0>U#X#)+weEi7t)Cf$`Nhj~z-OSr$O!zPwN9wBeweC5J` z{G1%x$r>>Ucn)&QvNdOLVt9H8atF?jLNURmTB#6I8$=3FCvsA!agk9>jSb5c%h8dA z8Ts`xn9x8T#82#jVOe$KS5{$07G4fGaj!s&A$h3^%^N_o92mMQ-wi8`R5~zpd+bh% z`j8aUBD9V6VOupdeQ+c_Hk=uwO#v&y2FEfdhclBri?*>L zB+)@O(7hsbu0T-2%S_*K^$p2Fd7uTzh>IO-=7vetjMs59;6Sb{>O4SK)KCYS-hyJ) z00Gw9qJ9UYfWHBX0BVdlew{TI%;ic&cPxzzrc+=JuI&-9^1DM4wwMitK*6w7ZF_?$ z1sV+q8r0HZP`Gsc@~hZI#Sn1d^5QGFCj?_%`=+f{Fi-gV13(E}9&I_ogzd2R`WRKpPx z660dqpMDcaT)Mfka(xMDQQHmtrD*)vL5Hlwhz?2|0aLBY+nWw8f}RwJokG83o?o-A zC%-Kt#^0eNL8KM`WrW-!bSK10ix8xS4X2{!DOnM1wStMC-agK4vp~cVoo1F>0gm0E zU)>xLM#9I$tq@>ZX3U1t8EOKpFC#V4%}&kb!DiM>-Es7??5JzH>*|i|5G@t>{c#iL zicXmQh>5gB*X(<3gmyu+&~wdTQ(1_9ynAtsh%!@67vIFb z3R<%uG8}Oz&%!`Y?@krD7$0l8yovP3)6JKs^tnCZjpD@t6Z|91;6| zbzH9(;>C)U1`r@cWt%Mq(;p8!BJ>iF-94`%Pqax7-eAw~GgZ7VgY8*_4r&MPc!|;| zT-@^r?yT4ru;}WM*q)-81&G%iI%XxgJC?IRTdH~8z6qVU05Q47O5F3j5W9V~#Rwj^D+L??rMPDy<_b zM%m{7eYmC7CV1332H_>$i)W6Ms_oiLTX8X7vvmZlmzde-}~ z;=S>yQW9yKLWISY0?Kiyg_+V<5(cTp=u0u(C&|;h$r$w-dJ~LZsM6BcLaPGOq)w-L z2QS{GbCvWMTbZLvP}C>UzD1|5bh=9Ce7y6GiOx3_>wMqc_gunry%ifSslc8S0-KBp z?0fqHYbdZ0A+R$sfxWXYFd?wFW8IjpUC?z0-aXSoc+)ZAy}K{GI>MuiB&zVoOl+B0 zJjOOL_Qh5sHYTuHA+X7Wz|#8yYap--LSSbS0?X_RtbxFu7Xq6~2yA>`U=0LzQ3z~0 zA+U*kfi)0Vrx4iLguo{E1=c`db6jBKV+nzs*%w#?fn5>;OD6<2wJ)#+0-F~C%OnIg zy)Q5(uy+#SYg&Y_6C&zQda|>-icRcHHX*g1W~3%IB7bgz+eUYia#35Xn{P)tO>J!O zMl@IndjJB-g|IKg5j&l_2x75p+Y(ooZf>ZCUb}zM;acm&6jTbT5%T>3?U`(|(?Pmx zOs7_K_m)po&xpY{F_vU@pK-b6 zNPKD8Rb4&5f@{7_NbvU_bS3;}>vU|f(o+*MV6HztKupHlygzvg#dJ0)rVe2wiNGVX zKY3~-uqh$1MBtH`_&|GA&1m|tqnWyk49><|KKa46e5d`NhZDz+J4@SSgacAsM&~`^ z_>VzTZ8b}n@n&Q9D`4Hf-%5G#rRWa%VBz_H;}0Qe&*iMG}0=ozLY&JUZw$<*Nd?JhX%@L&?> zA9lnMhYTssJ%RJ%9dX3z-6YOG?}$@#+BV7Jzwd}za|Ab5>pidZ-j1j>M|G36{zpTT zsB@$@iTYnV)+!G5CUO2}7aVr@H;MDp9dX2o;3Q64+fF#*z)ex&#`J`AaNI{qjmn8g)h4?rpoH+=lj@sIz-^YrmY zUHI4|7lMM-ks~cF=jkyJU~sm7tY4->qd{i-`!7iHdAwhnWu@_2K2L)-2FXx}yzN#) zEH+BZQ4B*j*BQcizl=w?X8UVTaP`X&bk6oi$G`ezykB&-UmX7GAFUxZQA=uv17FQj zXLrCb`jh>#W9Mr)EjdgEqpajMq%$>0@k3%eV@}mzHgSAxXM)oW3DzDg+nLVUMs&g> zXFKDk$KnKPPoverMI~`mW^wZVQMB+V8Wk2wXuF(wXWT!VwtqHl|7_Za;%r*Nvz;Hs zkrWd-ORMreJ68HRzz+87~9AY$z) zwAp^9hkm(cRcN-K-{siPD}!;3o{^&&@52$femR_Y~%eaPcyXhTS|fUPo~aj{-7f*7Tc)IucmNA}XA-i8@khyzs}HTWQ?V z$#_e5IoK99E`F-8DZiOH(|k4LA*z{bz8dEts+n%Sn$3r(=IjHsrpcp{KRY+$V-LX1 zL$rE4{QwsEFzv}aKs^uI9+3b~0|Cng={$YH%>^m&C4L$d*AM*vj-A|;2evH*V64R! zLZqJC>8U#+fE$2}&|MD;CWd8t+++3y+$$fJ;}ORIk0Yg?o@h@4$iVgO%-9{74kQWv|`=Q@MhJGKOz;E}k@8N^KC{+1{3HbIp+>1qP4;t#lP(_#* z3-Y#i;}2Vaw?lZhV{pepyE^UL>#(j)6?+}h)v02y!?`+D>~$ztr-~0KjN8u1-iL5? z>e%z}txg?#9=avR?!faqZL+D_8b0_?venIO>=L{_ba%YTC%EtTxDdqT`9&w+mZHz& z%&!j)WX1-DFyvdqJuDqx!QwyJ;9V`WOyKv-tl0JfaR9XTH`bVr15DS5?`^EE4i_-2+(9flIhb{zu4@+NrfqFhuRnhRQx zv2j4H2(($VWv6I)h9=vjY;7bK93d7d)9VeLY^H)@w|{~*7vi6kVU+QoEl-22F6$03 z>Yy080Q~OwYiUaFwd1oRmg%cMuJPB-(31Fmm`?UekihQ}kLg)G*xJ(as^vk%ML;|< ztyp&LDqZ=sWC(14Sv7ncjf^iQ(PU^2GAxD!t3WUDoh&G03134a>-@zlz^jvzu?~E7 z4ZSl6Btlv81z5qvZyj)p#Dep|qy34ec zBS^#*xZJQParWXIC}Wd6(P;xsvPn6?W9C$beJ^@t)vOg^@o4F%?c$y`@!%PLj=rm- zrDgR1$n&(k7;C$9gP>xWz?eZomxDf0D>fO!6$>CJJA4O_O4|ec7j$m8IpVky1fH~S z0?#mu;H`#IY94QVGIXcN-Y$f1@S%doMmw88R>*}ap<)y0mlBga$>R-ExbCguW(!PR z$}nM4*FlCJSO)T78HUAL!Fa>Rk|dQZNvBWC3sCEf#O|i+1c^|w(68dLI>6PxJBm+) zl&|MNIUB6SAeTr`q%uS4;*&^cr2RS2{v2q34zxc9+MfgM&w=*mK>Kr`4{8oXLz21G z6`PP5X~}ZQjC>m^zJqfK8mZFzQXrzM6x9*aE92P;a1r8tK(gxccn(H769XxTdGJ;q z_O?fY5iY;E4?-+jlvS3J=ZsCY>R>2L=NS;0*`xpzA;B>mM4=I&8U|*JMG=Hkx1Wr} zd0RsoDS44ar~Nz!2jVqso@qe{ig#P!J1en7$SBPVDe*Z14XP#S|L;;hHI)|_O(Clp z39S_8h-gXji@M`lb`_04Xa(coaEgR8FN}i0)3d204r{{h5_3!#;#G{(9XiJWn@DPz zS_LsTbO`+%T=Bd z*=&_!d;T!Dq8a(Bj>0u=4`4J4#xYRVHE=JOqH-FzgEyJcOhHu38Sf8WYt!6onqFn^ zD6$u33~I!1)oetST}P5zE$W%yU&Y&dl4NkyR zEP36&MqCvE;&-{I1Iar`{q~(9X`NJamTGHRbb!tvTv9$`8EBS+xE{bW3NYo4wTA}E zJnK4oVO4g3H*lA{2cvt=VYjk2V7Sn5ps>t5bhay7DopO7*Ym}*iRl3$p4mtmVbf=C z3*Lvg_m@5b9z!oXwELbzfBVg%U~$flGp zs>lSiiSmd&8irR_H`w|It`K0QFkHn6fX|}ED=HKKJG0CI7spnd1mU_Cef1x6u8dLJ zagoe?*c4pb02ScM(()?{)Vx4rP5mzBLKju;2|?-Cc(73T}m3jR<7#Dz0zh{6no;OG)sUN!Qb z!NxJ*1IIfj-e*AiHuxqw2E!!KhTeXe$22U!f-x`eN-nI3ZHwfq-kMS1bvpl0hRH0?buKSzt(|=7O{Z$+|lKHq$vc+B~j1 z*3mh#%<>4!L*9$1DT}W-@}QQgAthaFusZ1yK*s({HOCHVq8&If&Aot2?zK`}#9qVH zhH(^KkwjAb(Y!xHPZHGK$82zrEO5eK~u-YL$o=#p!=bP7h|3jeE0 zTGdp`FgXx?JKSw>?W*t&w8w=`7#&K#I*tng*b5EITIcf#2}Y#~7#WH76dEd?4$t0GwKBY7j|8CkPz-tMe+ghI63<6Fqgl%%*=w(+SROK zl#CzAf60H3(2BR`5CSrygeumkk}bzx|?=4@DIc22>~#K>qb_!*BHwZyQUAf_j}hW z?#P5Sg0B|~{3B&`3z%?KL$LBv%X_UY44&Yc!4DzaB%5JdS}yv%*aPtx1@uj5z}sKL zzpwj;aOz~4S;wp3k(uD0e^>*5YgOqHHpa|}7&9Z#)h0&9rbh6YmX4Pz9q(WVtouw> zTNXDs0!2am-=7A_e{qOga80x C24CC& literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..a426bd07904db15228cb5d4b18d279e5d180a6eb GIT binary patch literal 4879 zcmV+q6Y%UGiwFqi7LjKH|8sO@a9=SlaBFM;?L0|v8%c7!Nr9xU)kyM;chv%Cmf(mu zc}t{56v>e|Es-)09p04@4WK|Y1axEj;GtQ0y|yPtvTLuHc7*K+N9@C4hy8bi_v45C z^~=9~*f0P5upj>0nN`(|ZX8r|mt*50AkgT}%FN2EDzmx^_`SYA`Es+1z32xyLq9Ny zv3FEWki!}=)GE;|Gf~m?nnQ=SEmB+8in{ZaGx0;`l@p_VCV(8doA&BClR-xFCB@2C ziD@c(#B`oJF@|U872?n{ifO8*rDztMTfiL7t2O7|@-P2=?eC?DT{XOC?gNAy7I{t z3GLg{m5NikNQdzxtLMrjZ#f(EoL(bZwuahgZKHxFpr01i3b~m~CQEviB&}ps)zkw$ z7q3-p6F)8>S+c4X?}Bw3=b|LA#Zl-5A9qW&D$K<6!V02Ft6b_Gumz$jc}v|V3^}W6 z1#;-n_cXg&J4)nrgCweU#Znc+P>virk~K(P*G$W>L5Z&%+RqRjI;a&m%2$ru^@X$2 zopZi)ww##UEq8z6l&;bKf^}3QNYbOuJ8xd|)?V7Z=xl6keD*)^Vh6d)a&m80hPuSd zflJy8269SQXr!Q5T?=BkoGIven}r@|hk#)hu)FmD1fl&*cMQD2oMI9IJqIdSfb>Ao z_H2wI)-xR;T3#=J|2C8=(Dcxunw@_=d$^_; zz6%hZxg`x5RvO-v;eC^hN+07tllini9}-06Hb0 z(*fwLfX+4PcFQN33%lJQEw|HMAMKj_ses-KKtB^uIRJeqpc{d{eoNkxx8)sqSAHx% zf$yjCGtvKVBcA*n`Mb<3yYlY{c-HgO8sP5AIRVf6diy}$gO-x4%4OF2f(P&!x#AJ; z%9^cRCHit|K@nDl(!HZaj{L*bWaxjjIAMmN+wRcnOXbfBU@ohK>? zoX@>X%~CWb*l~G+&;iprs`#fC{)UE}Zfgb6fJ9vn|GWF)oo-UUlK-XUh5Yltc}ARP z;JgH`>O)txODz7`_&gN2tkhgV;~!e4&ph2=8Yn?j&Dt~y#4v?ozT`Ws7mFsbw!o2| zcE6tB!)6{T_ex8S?F`d~WtYnm%35&lDYCTrP`Ilfx`i4K1lRqgf5OrOENh?-AWVq1 zXO&nLGRpPaWOjRsHDmzt3?U<8AUshEKm~Ha8izyf6Lqg-IUu#I&_O7k!EsEdQ!Dse zRrYk`017Htr^VBmnT$iF`SkqU-OSX&O8V~V+}wP6E|XqanO>crok>ln((}`>)*5r` zXNUH2#(np99zJmVE57W$YN|Q4qFP5;LxDNLq1|bR_U6#!=i>>t_k@jzFIl=zY>>+aSci_R7}!qa6^#H1ZBT7wDjj1T%u* zlG{|6O6*TDj;wuFwBKm%AH*kyrc?w-d~vclK^A?R*bIAKV87pl9WTLTV{)tof%P84 zs#bL^X{%Y=QY##xC!G=)JrQ))_RceOR!=(3!B0ov5E+9Nk|KQ$i*6hlH>GqcRXe-| z%V~&u<>;oAtLXU`x1?8*F|op~qbjzgOV`zEO*e3nk4Om+(Zegkn2Ve0OM+aHX`sD+ z8fQzxoMHe?O)22I@1~Tg9ZFM3jfZ-yoxXh^WZ>>i20_4n23WiLfiOa?e@5IL-OKek7xvD4)R1d z3TAA2a()6@#}|`;=1Qs5YOvUdAqVIaC265tJt*cS_6&Y^zIR;H;!RBKJiL=<#k$j>G;HVQqL1?4oPq?f^KXK8;Vv? zWASuqGLvGeKop_EzQ&o!%#3JaI=1C}fY~F+cp){J_81>e&Fv8+2iw5#s*?)`f)gHb zfBW4xA-T>2FF=Fe?AWsiOy*|1_J@H03TLs&1AN5-bv*PfGu{m_;G?ZR?tm4@H_iya zjdh7X=gLC3tkqN#%DGrN0pVa7Hi0OAH4#CLc`5`B6xXCNw!p14w};1z@m#Blx0n|AOJ~~2Lr9Rx0oT*n}5ni7)=Q23Au~5D# zEwFjd(rW-`b8CeZEioH<6TYTvq&_*Or=d~U>S>D^yNwy*C89vZivYZv3(ahTE2@V9 zD?mEa3aCR*+Rb@=+mDtvK>LQi5XDV+J^%P~@Xg)bot;g<9N0JY(yY zQQZ)`W6cSyDYTGg%Arfby_k=|bE+11y)W|Ip$Fj3^-^l4?%eK2$ju!;?=R;eBK9=Z zG}WqPDtoGBsitI-Lc*a@f9whu2I4ucA+H&zdtb)cu)@%DJ9;}cv&C#^bS!l;bD_}> z5Z#To^|x~;O(6P63MJc9Iq4723Ka~!T&-jaYg5}nz za%dTZ)bH54cge-mo&k;DOe}XJ|3UCPOv44|jhG4910i0fkEj|ibGO(O>5hqEynpgGOk4wdS2YxUfp~s?rzIH^=CUF*3C>L z3wSYxWgGjVXGJ(~5yjLs@qmg8K5A<$D4{g>Oz(+5D8%<}wcPZd?<){O8;@f9JWB8!J}6$|?a!6NHC(>m z19_^`rCspbHq*J297s3iP8B!(GIyMX(Td*mHtouM%O4ff9VhctFe(=P7M||hsNm8r z`K6udbZKwHyHr?scuGCke z*WRr}-IeH7-|Y`(rW@|+bH=xHzy8i(Z4DN84K@q7CcR1yEknr?E?(Amvk$xK-$Vd#c{ zci1n02Jx3a)}iG*&7bS)T4XzmK+>gXDk`zOQXxyx=NzQ*JDw#&ULDI6xZII-cO+>Q|ME-q;$5YNVP=?r%cs#@TwhSBhDjv^gp&cJ@ z7vu5t>6D*A;|9kTwCE&3$|FvCx4Y5X-N=lE7HYP$7>w_>JHFfQ_-?ynM{IWt%xsHE z%%pgd(+fvp%uY3ATWJyP#@isl;WuuIY{c@T&2ThkuV%a?AU0CWataLA_5}44By8N+ zxD@4nh)NBM=B23E-WU~~L0)6qE?L9ZuUc4FGauht(8zth^?nf z(RjMaMf1%zIOV;UK`XHSAv@F*O-QiNP68eZuFnKv5-G*GNMpR*OZnNH;x5ewC$h$+ z=n8M*l^W*-@rpauJRSW?X41{ly=_CM$Hu2}fM=!+Prlu=B{>sY7?M-89g3N$7Jg~F zQIz|tA+~?5U8AnUYeJl9SE2SBM}=aw_Bze@b$ZJNM^6p^7Sr}qS7tUmZ*o%JbK!ZL zC*?gKp7+5?c`uwm?vUGHPbk7v>I5R3RKKb83F7I*l4nl9Je1__35CU0Jnw^)&T3v#y* z2~}!@D>OW%!xR|;+5LJ~Ui=E>#s710@hg@V^)%VpWyPr!6~3C_qy>eK)Ga1_r9|^! z>xeSqxRRlT$fr^zwBUOx6+#QXr&1lX;Cm{SK?}bBmsLS?_oq}5v|xNX)j$izr&9?8 zlbd7H0fQVUM#1-lHex6TSpnUk5sMfw-tcj>^O~NqAlK*F#Tz;x9u)wRvWJfnRQwOP zSNuN}EEC{oYKC4^EBLs()J4zY|HSRM|L$hi|1Ite(e|_ae;C;RuKi;9KmMBkO&LCH zI3)Ro#($;fgN?fQuee`4{D*&|@B7-UtWpF8*ux3@3y?b25bx8YJh*b74s$hlXL6UT z7Q>Re>>Q{MrHZR&jE-)tJz3sb&E9*kd3X5%nD9S;`KLen_}Y)@g{|c$*jH#gQ?gx55(_Go>3b9F7dzKZCPo#kyAkyoENuWdV*_+Q(pzzmX4002+Z Bp=|&F literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json new file mode 100644 index 00000000..a17872af --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json @@ -0,0 +1,44 @@ +{ + "n_steps": 1, + "cum_reward": 1.0, + "cum_raw_reward": 0, + "err_msg": null, + "stack_trace": null, + "stats.cum_steps": 2, + "stats.cum_n_token_goal": 10, + "stats.max_n_token_goal": 10, + "stats.cum_n_token_url": 23, + "stats.max_n_token_url": 23, + "stats.cum_n_token_focused_element_bid": 1, + "stats.max_n_token_focused_element_bid": 1, + "stats.cum_n_token_last_action": 0, + "stats.max_n_token_last_action": 0, + "stats.cum_n_token_last_action_error": 0, + "stats.max_n_token_last_action_error": 0, + "stats.cum_n_token_dom_txt": 1250, + "stats.max_n_token_dom_txt": 1250, + "stats.cum_n_token_axtree_txt": 71, + "stats.max_n_token_axtree_txt": 71, + "stats.cum_n_token_pruned_html": 651, + "stats.max_n_token_pruned_html": 651, + "stats.cum_n_retry_llm": 1, + "stats.max_n_retry_llm": 1, + "stats.cum_n_retry": 0.0, + "stats.max_n_retry": 0.0, + "stats.cum_busted_retry": 0, + "stats.max_busted_retry": 0, + "stats.cum_input_tokens": 1589, + "stats.max_input_tokens": 1589, + "stats.cum_output_tokens": 63, + "stats.max_output_tokens": 63, + "stats.cum_cost": 0.00027614999999999996, + "stats.max_cost": 0.00027614999999999996, + "stats.cum_n_token_agent_messages": 1641, + "stats.max_n_token_agent_messages": 1641, + "stats.cum_step_elapsed": 5.891982078552246, + "stats.max_step_elapsed": 5.891982078552246, + "stats.cum_agent_elapsed": 3.4504799842834473, + "stats.max_agent_elapsed": 3.4504799842834473, + "terminated": true, + "truncated": false +} \ No newline at end of file diff --git a/tests/data/error_analysis/error_report_trial_1_of_3.md b/tests/data/error_analysis/error_report_trial_1_of_3.md new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/error_analysis/result_df_trial_1_of_3.csv b/tests/data/error_analysis/result_df_trial_1_of_3.csv new file mode 100644 index 00000000..4095252c --- /dev/null +++ b/tests/data/error_analysis/result_df_trial_1_of_3.csv @@ -0,0 +1,5 @@ +env.task_name,agent.agent_name,env.benchmark,index,exp_dir,agent.chat_model.model_name,agent.chat_model.max_total_tokens,agent.chat_model.max_input_tokens,agent.chat_model.max_new_tokens,agent.chat_model.temperature,agent.chat_model.vision_support,agent.chat_model.deployment_name,agent.flags.obs.use_html,agent.flags.obs.use_ax_tree,agent.flags.obs.use_tabs,agent.flags.obs.use_focused_element,agent.flags.obs.use_error_logs,agent.flags.obs.use_history,agent.flags.obs.use_past_error_logs,agent.flags.obs.use_action_history,agent.flags.obs.use_think_history,agent.flags.obs.use_diff,agent.flags.obs.html_type,agent.flags.obs.use_screenshot,agent.flags.obs.use_som,agent.flags.obs.extract_visible_tag,agent.flags.obs.extract_clickable_tag,agent.flags.obs.extract_coords,agent.flags.obs.filter_visible_elements_only,agent.flags.obs.openai_vision_detail,agent.flags.obs.filter_with_bid_only,agent.flags.obs.filter_som_only,agent.flags.action.action_set.subsets,agent.flags.action.action_set.multiaction,agent.flags.action.action_set.strict,agent.flags.action.action_set.retry_with_force,agent.flags.action.action_set.demo_mode,agent.flags.action.long_description,agent.flags.action.individual_examples,agent.flags.action.multi_actions,agent.flags.action.is_strict,agent.flags.use_plan,agent.flags.use_criticise,agent.flags.use_thinking,agent.flags.use_memory,agent.flags.use_concrete_example,agent.flags.use_abstract_example,agent.flags.use_hints,agent.flags.enable_chat,agent.flags.max_prompt_tokens,agent.flags.be_cautious,agent.flags.extra_instructions,agent.flags.add_missparsed_messages,agent.flags.max_trunc_itr,agent.flags.flag_group,agent.max_retry,env.task_seed,env.max_steps,env.headless,env.record_video,env.wait_for_user_message,env.viewport,env.slow_mo,env.storage_state,env.task_kwargs,exp_name,enable_debug,err_msg,stack_trace,order,logging_level,logging_level_stdout,exp_id,depends_on,save_screenshot,save_som,n_steps,cum_reward,cum_raw_reward,stats.cum_steps,stats.cum_n_token_goal,stats.max_n_token_goal,stats.cum_n_token_url,stats.max_n_token_url,stats.cum_n_token_focused_element_bid,stats.max_n_token_focused_element_bid,stats.cum_n_token_last_action,stats.max_n_token_last_action,stats.cum_n_token_last_action_error,stats.max_n_token_last_action_error,stats.cum_n_token_dom_txt,stats.max_n_token_dom_txt,stats.cum_n_token_axtree_txt,stats.max_n_token_axtree_txt,stats.cum_n_token_pruned_html,stats.max_n_token_pruned_html,stats.cum_n_retry_llm,stats.max_n_retry_llm,stats.cum_n_retry,stats.max_n_retry,stats.cum_busted_retry,stats.max_busted_retry,stats.cum_input_tokens,stats.max_input_tokens,stats.cum_output_tokens,stats.max_output_tokens,stats.cum_cost,stats.max_cost,stats.cum_n_token_agent_messages,stats.max_n_token_agent_messages,stats.cum_step_elapsed,stats.max_step_elapsed,stats.cum_agent_elapsed,stats.max_agent_elapsed,terminated,truncated,err_key +miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,1,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,7,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,True,,,2,10,30,dd9e91e0-75ef-4bb4-9db1-f91f06848dcb,(),True,False,2,1.0,0,3,12,6,48,24,2,1,4,4,0,0,1902,952,400,201,650,326,2,1,0.0,0.0,0,0,2789,1404,128,65,0.00049515,0.00024839999999999997,2902,1459,6.860883951187134,5.8696064949035645,3.769465684890747,2.946484327316284,True,False, +miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,2,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,20,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,True,,,3,10,30,187f0f01-a240-419c-a65e-0058a14f639d,(),True,False,3,1.0,0,4,27,9,72,24,3,1,8,4,0,0,2892,966,667,223,1014,340,3,1,0.0,0.0,0,0,4339,1464,225,84,0.00078585,0.0002646,4512,1517,3.0203144550323486,1.3659462928771973,3.8209800720214844,1.8219048976898193,True,False, +miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,0,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,28,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,True,,,0,10,30,b403cfca-4647-48fb-98f2-57e94306a38a,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1250,1250,71,71,651,651,1,1,0.0,0.0,0,0,1589,1589,63,63,0.00027614999999999996,0.00027614999999999996,1641,1641,5.891982078552246,5.891982078552246,3.4504799842834473,3.4504799842834473,True,False, +miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,3,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,14,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,True,,,1,10,30,4c89cb70-0bf8-42c2-be39-a9c1a39ffe8d,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1257,1257,75,75,658,658,1,1,0.0,0.0,0,0,1594,1594,64,64,0.00027749999999999997,0.00027749999999999997,1653,1653,5.879024505615234,5.879024505615234,3.029170036315918,3.029170036315918,True,False, diff --git a/tests/data/error_analysis/study.pkl.gz b/tests/data/error_analysis/study.pkl.gz new file mode 100644 index 0000000000000000000000000000000000000000..8611c7d37be0f3527cb0356f3d2250c33f769e5f GIT binary patch literal 3761 zcmV;i4o>kOiwFqO7LjKH|8sP8WO*)dYit1R8eMZ7NAb>CcV^(6Dsc(R9wuB$V=iZI)#An5XB-V-F4B|Ip@pRf&J_HpgfrY8d*Zf;74!)SY zU%}Ta$fC&CC2bmu6InW~>4n+?*V@cwu{dDbB-Gp7qD~@&IGT#!IPHq8CLU1kzw&0} z(`BSAbd|JDx|+;!KiDSgddNfKK)FiYAbzFv?{X=@GuMm4JbpjoUE1kkmrKuy$&j`| zjUEl9T?9fKBNA8O5s&jg#?&yiX-^k;3fb|rw+w21Id0wQ{U&l?r>nMc-Slmx!;9X z+0G+Dn5Ehjhh64NZAuX0uuyx z1Dl6HJ0>x_jl$FQ(1+f1oI%6EK{cVfqdm0gFmsjkEB2=qock8&cJw-N0_gMM26IB{ zr+kzm0-d`K$~w(G>Jp|aOR;E37zj5)qL9a8TD9)5(AEuR^;#WJco0^hz+Cnu+KKJQ z#pD(FSeUp8rM`fIeLP|Kyc1Br*=^@yfI(f;ZWDjCVh|j_v6ooWzCqW(qh#ybwBW^K zL1X|&81p3BaEBW@ah&))e2<)vDXBKao*)E!L`5I;Y3O&8Q8l=4qUnrVw9TcH$6%f? zcQB%Z4tJZN6PP9DDK1BtYq2$EMc}V=owPlNqIpcozLbFkab1kFKvx~sft9og&mqWS z3ZH>x$V>)9+!Xs#D?*#!5EJ;`26?18O-pj_f~kbG*C$X(JlJ|N@qPSwV$YyVG;Ww( zoOg^-!0QkOkHo#w`0?Bi#iRigFy)L#*yymdCI~*jJ4#1%oLH9LWC z8%)U<2G(urEcD~rw3xuyp*MXVd2;;YNFmi8*G3|Pi1BoyWBa-oznZ?K6FZUws6)M3 zg@Dl2LmGhboU*|vbO)_M?;BMTob8vA_IAx)w(Fbr%}x8(hW)C&0#fXQNl7LOf&{4J zxQdJcIJ^&51MC7yEjc9Wu1SWv1C9;r&ROx$5R-K{_hK>jg1eTt%<+%}t18S+t5H;> z=V}%s5LaSFf_=udhZ7|QA@zc|HY)bolvrS0D9t{fn&8;rrWWNO_|XnwA=<7ElkNAW z0%Ok_qa7LS>);sG;LPThXDZhy0Ge=A2SPUJ^r+SFV%_&V4G@EE%%9`**1PH#g>#a@G$V5*i} z{l`?7F%eU83~dNm76}Phibq_F1QpN}tu%b8_wnbU>biXH8iU%={#s#3|xZQuH1Z zRvGG&LX;Jwaw!HGVD2JVDj<@;I0G;e6u!2*Hc~62?6qB3+8^3KvVVeOwtt!*@z2CA zb}2i~X9kV)>c4j9MZw>$0_>5D>;Oj<+@s)_f_oJl-vLhS04EjPSAbIr?k~Ut3LY%L zLkb>N@Q8xb3Lf179#imm0iICsWC5O1@F4{sR`3x8Pb>JSf-?%9DbV%c7YcNbDfqa8 zPkhesi}wkiRPd~VYUfgVssNu>@JkA63RV<6ui&hL)dHMTa9+WM0=!Uw7ZqGo@REW{ z3SL(5%LVw1f?rYas|EP1g3l>ysqGj3cjS^%LVvK0oDrehJwoq zVpOp|h5$0UQwSO9g^-(>5Muu_A3&bP0D_DA$=*^%0u$C6NW?UV%Z6MC(#6|=e9a6s z;x-}E5=T-r=c`)7ft&<#lZOUiH5a#k82t~YwNY1t1aK8|;gpeSa~@F5!ZkXxxc0Fc zB>`|bHK?jhEf%#i0%R*w0g8Re>_!|-vCKCes(}h%aodu%7*QBP7Azhagw_K3wFuI< zoan{8K%5>{r8$s`&@H?(1Oab=sAq=OL(b|34HWxifrdK{@#F?5tDa(=HW!P>wyEta z@yu|l0gCR?aDQkXcx(_FUjbtBIc6?|^$<4Nnnrxqphp4*H|!ksJ2(Q^Yc<%hWzB*q(PalQLfgfZzNL1p`8NDZ1$g4BbN*oJm8@f8a6{$6SW)&Grd4o*o$Y61JC-ZoQ zF|#+wLLF>Z`oT8aOJ|_?9dv`Ns%RUuqA^zv|I`4i23AbNtn+6pVH?$vwsU%$TFrGb z(P1!w7Dljv?MJXI6>Hh$7NKPt@f$sbce_fJ3hZhkWRP^?`oYn zmPdkp1|5#-C{TxVJ?kxfB+mA!0cdrcx0MmYompx(4dz;4VU#59n8&x7<#C2uJCws0 zB$VfTMG5;zl(5g2`|ZqHb$lQmTMVh0&(z4XPr1HI^UclTU1hcU)$U9jL_mR12^l8A}q*99nNHS9RQoY5rPAziDbQ~Rk&6!m|iEr8( zb(=Ccr4tgcrmGRK@8kqJTY(358gZJ`@JU8ry$YX&cAJLiLFv!}ZbEJ#7c`QY$1GYS zxj2!l3V7}?DhN5I3dEObXlhpAr0__bj4tGGfknu(HY>5oq6jXMC1)l~0Q~W6cah6X zNGR00n+gx+r{Q_C$#LB1D>K8Y`U%OgEQw66IyR+Ffo;DcmrP+*;DD=SSMokGw>nF- z$6Pn(-fJv~K&!+8mB@r@(qJMt(N+Y{H*KI#K1hBmA1+;1&US9q=GJHI(Vmio?OY%fb~ zKyeQYZsv=j{BC?U(fXWg+fK@jdz)!did(mol}Wq9m!x<#U);K%;z63-`hW31TD>$v>@ZOFXn~fx*iC;8upwZO`ELFu_;uQSkdJ?n{ZE zv9|}AsFep$_R6-iy0btOj{~e;FQ3XI#_adNUaqB^kT5s#L$4TvBqvD`r#jSV zMr}R$vJJ$L5i$q!!LItGDh~F)Me2d)urQXISgl8Qq2Crk@lFmJ4J2!L$~LirFe5)S z6{EOU4C5O3GGvBcT)Tr06duZwEyY5N;B|&rjN^EFU)lO!;ogYFC<#N@$=`{^9zD52 z0*}WbD+P&lD*|b8(7Hnm>ICryj50FBZi_*FHRh2JHaiBeyu|~yehXefzO$ly9y+#V z_F6`b^sbKRkkrG51hy1c=4R(EXtPyqZceXOwb^-XZb?^nQ%LbrkH%e1&47mAm6Mb- z#Nq7;qDF(QtL1YfikKC*V)^{~>n}gw`-~`d9kE9Go=fHh6M(rO84)*@NwS&1MDx*>Bu_{u<oaE` ztIphBzBTh&L}%)JZRWzv?Cj;a>g5X;W?p~o+6>+ZIv0!mUN?m8SEi$R>}U~kY7f$j zj@&?HYO9s`B^0ASPyM>J*~)wcGVDWX9V*2pfgOvpON|TjRnuIWpKVkxE-qNs;>C;e z7cVVdT3DQ~RxixY&CSxe`HRr}k+iw)aFaNAcY`(1wB&sSu{blce0e7D&GS9NL%h&`O~t*x3plBvxIzGzkFtp$rTW{v7adSPLH$y%x|ROuySi5T;g zEX*!6s*NSE>_;;U28(`PiryzXj~;v;hf>5*cMV03#1!o0VZ+f(E=9z(2{E0+=s{?~ zMn0~cZhZuHR2N5JTM8U>i+U8IfNL^&kL4g)YsxDKI-b(V7SGtPZi*?o?&YsT*r6DE zVcpAJd9X+A39+|#b0W4oVhr1%Yu5n}k23nz#XfZ{CD{UP0GaRW?VyP_6Cengt2*9l zLCR5i7637+^=pv3HqrV7!D+Dvxq(~M#Y8d$ywVh(69)pamb+KA0X`&Ac(E!nTEB(% zeQ8_2hgbXCtv{erT7Q(~pP<}m{aHT$MV5cXKFfItT7Q#Oe@Aw$Pi2#TKsl_7yZ6$? bt_SL3^Ip2xeXm{6$f*1mc0^=Rvnc=o6lFOi literal 0 HcmV?d00001 diff --git a/tests/data/error_analysis/summary_df_trial_1_of_3.csv b/tests/data/error_analysis/summary_df_trial_1_of_3.csv new file mode 100644 index 00000000..545cfc29 --- /dev/null +++ b/tests/data/error_analysis/summary_df_trial_1_of_3.csv @@ -0,0 +1,2 @@ +agent.agent_name,env.benchmark,avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost +GenericAgent-gpt-4o-mini,miniwob,1.0,0.0,1.75,4/4,0,0.0018 From 000893d0845efcb1ce03f742e05671183715bc5c Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Wed, 22 Jan 2025 12:46:12 -0500 Subject: [PATCH 07/25] quick parsing to run from cligit push --- .../analyze/error_analysis/pipeline.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index 53021297..f3e19923 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -72,3 +72,23 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T raise FileExistsError(f"{analysis_path} already exists") with analysis_path.open("w") as f: json.dump(error_analysis, f) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("exp_dir", type=str) + + args = parser.parse_args() + exp_dir = Path(args.exp_dir) + + pipeline = ErrorAnalysisPipeline( + exp_dir=exp_dir, + filter=None, + episode_summarizer=EpisodeSummarizer(), + step_summarizer=ChangeSummarizer(), + analyzer=Analyzer("prompt"), + ) + + pipeline.run_analysis() From 4727a9e18755890e1f7cb726d3ec8642739d214d Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Wed, 22 Jan 2025 12:49:33 -0500 Subject: [PATCH 08/25] even more parsing and making imports absolute --- src/agentlab/analyze/error_analysis/pipeline.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index f3e19923..4a961b76 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -6,10 +6,9 @@ from bgym import ExpResult +from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer, EpisodeSummarizer from agentlab.analyze.inspect_results import yield_all_exp_results -from .summarizer import ChangeSummarizer, EpisodeSummarizer - @dataclass class Analyzer: @@ -78,7 +77,8 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T import argparse parser = argparse.ArgumentParser() - parser.add_argument("exp_dir", type=str) + parser.add_argument("-e", "--exp_dir", type=str) + parser.add_argument("-f", "--filter", type=str, default=None) args = parser.parse_args() exp_dir = Path(args.exp_dir) From 42f0362282bd7da087ff4b8b02d95a3b244a9f23 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Fri, 24 Jan 2025 14:40:10 -0500 Subject: [PATCH 09/25] . --- .../analyze/error_analysis/base_idea.py | 287 ++++++++++++++++++ .../analyze/error_analysis/pipeline.py | 33 +- .../analyze/error_analysis/summarizer.py | 250 +++------------ .../error_analysis/summarizer_prompts.py | 202 ++++++++++++ .../analyze/error_analysis/test_summarizer.py | 22 ++ 5 files changed, 559 insertions(+), 235 deletions(-) create mode 100644 src/agentlab/analyze/error_analysis/base_idea.py create mode 100644 src/agentlab/analyze/error_analysis/summarizer_prompts.py create mode 100644 tests/analyze/error_analysis/test_summarizer.py diff --git a/src/agentlab/analyze/error_analysis/base_idea.py b/src/agentlab/analyze/error_analysis/base_idea.py new file mode 100644 index 00000000..5d4827d4 --- /dev/null +++ b/src/agentlab/analyze/error_analysis/base_idea.py @@ -0,0 +1,287 @@ +from dataclasses import dataclass + +from bgym import ExpResult, StepInfo + +CHANGE_SUMMARIZER_PROMPT = """ +You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, +you will receive the following pieces of information: + +1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'"). +2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries. +3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet). +4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'"). +5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available. + +YOUR TASK (each step): +A) SUMMARIZE THE CHANGE + - Describe what visibly changed between the previous observation (or diff) and the current observation. + For example, did a new panel open, did the form reset, did nothing happen, etc.? + +B) ASSESS THE ACTION + - Decide whether the agent's action seems helpful or correct given the user's main goal, + or if it appears incorrect/unhelpful. + - Briefly explain why. + +OUTPUT FORMAT (per step): +Return your analysis as a JSON-like structure, for example: + +{ + "changeSummary": "A new search results panel appeared on the right side.", + "actionAssessment": "Correct", + "explanation": "Clicking 'Search' was appropriate to display the results." +} + +Or for an incorrect action: + +{ + "changeSummary": "The page reloaded but the date fields were reset to defaults.", + "actionAssessment": "Incorrect", + "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.", + "suggestion": "Correct the date format or check for error messages." +} + +Please follow this structure at every step. Keep your responses concise and clear. Below are the details. + +Goal: {goal} + +LLM Plan: {plan} + +Previous Observation: {past_observation} + +Current Observation: {current_observation} + +Past summaries: {past_summaries} + +Action: {action} +""" + +ERROR_CLASSIFICATION_PROMPT = """ +You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. +Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors), +followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), +a set of labeled examples for reference (few-shot), and finally the classification task you must complete. + +-------------------------------------------------------------------------------- +TAXONOMY DEFINITIONS +-------------------------------------------------------------------------------- + +1. AGENT ERRORS +These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation. + + - Navigation & Planning Errors + The agent cannot construct or execute a correct sequence of actions to reach its goal + (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). + + - Interaction Execution Errors + The agent enters data in the wrong format, forgets to click "Submit" after typing, + repeats the same failing action without adaptation, or loses track of the changing webpage state. + + - Information Processing Errors + The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), + misconstrues relationships between pieces of information, or fails to validate data against task requirements. + + - Observation & Action Errors + The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) + or misaligns its actions (clicks the wrong element or stale link). + +2. LANGUAGE MODEL ERRORS +These errors result from the model's inability to correctly interpret or reason about the task at a higher level, +independent of the low-level web interactions. + + - Task Understanding Errors + The agent misreads or misunderstands the user's objective (goal interpretation), + loses crucial context (context loss), or performs actions beyond or short of the intended scope. + + - Reasoning Failures + The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, + or fails to prioritize important subtasks when handling complex goals. + +3. BENCHMARK & ENVIRONMENT ERRORS +These errors are external to the agent's logic and the language model's reasoning, +arising from flaws in the system, network, or evaluation framework itself. + + - System Errors + Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts). + + - Benchmark Design Errors + Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), + or inflexible evaluation systems that fail to account for valid alternative solutions. + +-------------------------------------------------------------------------------- +INPUT DESCRIPTION +-------------------------------------------------------------------------------- + +You will receive the following for each scenario: +1. User Goal + - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). + +2. Planning / Thought History + - The internal reasoning or plan the agent considered. May include branches of logic or key decision points. + +3. Current Observation (HTML / AX Tree Snippet) + - The webpage structure or state that the agent sees at a given point in time. + +4. Historical change summaries + - A list of summaries of changes in the observation that the agent has seen during the course of actions. + +5. Action History + - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) + along with immediate outcomes or errors. + +Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories. + +-------------------------------------------------------------------------------- +FEW-SHOT CLASSIFICATION EXAMPLES +-------------------------------------------------------------------------------- + +1) EXAMPLE A (Benchmark Error - Benchmark Design Error) + • Context: The agent correctly finds a cheaper product meeting the user's criteria, + but the benchmark expects a more expensive product and marks the solution as wrong. + • Classification: ["Benchmark Design Error"] + • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid + and does not allow an alternative correct solution. + +2) EXAMPLE B (Agent Error - Interaction Execution) + • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. + Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. + • Classification: ["Agent Error - Interaction Execution"] + • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action + without adaptation ("Action Repetition"). + +3) EXAMPLE C (Benchmark Error - Benchmark Design Error) + • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" + The query is ambiguous because "Upitts" is not a standard location. + The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region. + • Classification: ["Benchmark Design Error"] + • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), + leading the agent astray due to unclear context. + +4) EXAMPLE D (Language Model Error - Task Understanding) + • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' + that are older than 30 days and add a comment saying 'I can help fix this.'" + The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue + with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. + • Classification: ["Language Model Error - Task Understanding"] + • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, + it focused on creating a new issue. This is a misinterpretation of the instructions, + not a mechanical error in clicking or input format. + +-------------------------------------------------------------------------------- +CLASSIFICATION TASK +-------------------------------------------------------------------------------- + +1. Read through: + - The planning and thought history + - The action history + - The current HTML or AX Tree observation + - The user goal + +2. Decide if the failure is: + - An Agent Error (which subcategory/subcategories), + - A Language Model Error (which subcategory/subcategories), + - A Benchmark/Environment Error (which subcategory/subcategories), + - Or a combination thereof (multi-label if needed). + +3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. + +4. If the agent succeeds (no error), label the errorCategory accordingly as "Success". + +Output Format Example: +{ + "errorCategory": ["Agent Error - Navigation & Planning"], + "explanation": "The agent opened the wrong GitLab page and never recovered..." +} + +Please follow this structure at every step. Keep your responses concise and clear. Below are the details. + +Overall goal: {goal} + +LLM Plan and thought history: {plan} + +Current Observation: {current_observation} + +Historical change summaries: {historical_summaries} + +Action history: {action_history} +""" + + +def _diff(past_obs, current_obs): + """TODO: Implement the diff function. + + Returns a diff version of current_obs compares to past_obs, unless there is too many changes. + """ + raise ValueError("Not implemented yet.") + + +@dataclass +class ChangeSummarizer: + + llm: callable # language model + obs_formatter: callable + use_diff: bool = False + + def summarize( + self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str] + ) -> str: + """Produces, a summary of the effect of an action.""" + past_obs_message = self.obs_formatter(past_obs) + current_obs_message = self.obs_formatter(current_obs) + + goal = past_obs["goal"] # Use goal object from agentlab + # Outsource everything to formatter + plan = past_obs["plan"] + if self.use_diff: + current_obs_message = _diff(past_obs_message, current_obs_message) + + return self.llm( + self.make_prompt( + past_obs_message, action, current_obs_message, past_summaries, goal, plan + ) + ) + + def make_prompt( + self, past_obs_message, action, current_obs_message, past_summaries, goal, plan + ): + """TODO: Implement the prompt.""" + return CHANGE_SUMMARIZER_PROMPT.format( + goal=goal, + plan=plan, + past_observation=past_obs_message, + current_observation=current_obs_message, + past_summaries=past_summaries, + action=action, + ) + + +@dataclass +class EpisodeAnalysis: + analysis: str # complete analysis of the episode + summary: str # short summary of the analysis + categories: dict[str, float] # score for each category e.g. type of error or difficulty levels + + +@dataclass +class EpisodeSummarizer: + + change_summarizer: ChangeSummarizer = None + + def summarize(exp_results: list[ExpResult], change_summaries: list[str]) -> EpisodeAnalysis: + """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" + pass + + +@dataclass +class EpisodeErrorSummarizer(EpisodeSummarizer): + + change_summarizer: ChangeSummarizer = None + + def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan): + """TODO: Implement the prompt.""" + return ERROR_CLASSIFICATION_PROMPT.format( + goal=goal, + plan=plan, + current_observation=current_observation, + historical_summaries=historical_summaries, + action_history=action_history, + ) diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index 4a961b76..305d00b4 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -23,7 +23,6 @@ def __call__(self, *args, **kwds): class ErrorAnalysisPipeline: exp_dir: Path filter: str = None - step_summarizer: ChangeSummarizer = None episode_summarizer: EpisodeSummarizer = None analyzer: Analyzer = None @@ -38,26 +37,10 @@ def run_analysis(self): filtered_results = self.filter_exp_results() for exp_result in filtered_results: - step_analysis = self.analyze_step(exp_result) - episode_analysis = self.analyze_episode(exp_result, step_analysis) - error_analysis = self.analyze_errors(exp_result, episode_analysis, step_analysis) + episode_summary = self.episode_summarizer(exp_result) + error_analysis = self.analyze_errors(exp_result, episode_summary) self.save_analysis(exp_result, error_analysis) - def analyze_step(self, exp_result: ExpResult) -> list[str]: - step_summaries = [] # type: list[str] - # this assumes that there is always an extra step at the end of the episode - # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info - # TODO:(thibault) make some checks - for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]): - step_summaries.append( - self.step_summarizer.summarize(step, step.action, next_step, step_summaries) - ) - return step_summaries - - def analyze_episode(self, exp_result: ExpResult, step_analysis: list[str]) -> str: - episode_summary = self.episode_summarizer.summarize(exp_result, step_analysis) - return episode_summary - def analyze_errors( self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str] ) -> str: @@ -82,10 +65,20 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T args = parser.parse_args() exp_dir = Path(args.exp_dir) + filter = args.filter + + import openai + + from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT + + llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model() + + step_summarizer = ChangeSummarizer(llm, lambda x: x) + episode_summarizer = EpisodeSummarizer() pipeline = ErrorAnalysisPipeline( exp_dir=exp_dir, - filter=None, + filter=filter, episode_summarizer=EpisodeSummarizer(), step_summarizer=ChangeSummarizer(), analyzer=Analyzer("prompt"), diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index b3760216..7c5f9b03 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -1,209 +1,12 @@ from dataclasses import dataclass -from bgym import StepInfo +from bgym import ExpResult, StepInfo -CHANGE_SUMMARIZER_PROMPT = """ -You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, -you will receive the following pieces of information: - -1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'"). -2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries. -3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet). -4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'"). -5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available. - -YOUR TASK (each step): -A) SUMMARIZE THE CHANGE - - Describe what visibly changed between the previous observation (or diff) and the current observation. - For example, did a new panel open, did the form reset, did nothing happen, etc.? - -B) ASSESS THE ACTION - - Decide whether the agent's action seems helpful or correct given the user's main goal, - or if it appears incorrect/unhelpful. - - Briefly explain why. - -OUTPUT FORMAT (per step): -Return your analysis as a JSON-like structure, for example: - -{ - "changeSummary": "A new search results panel appeared on the right side.", - "actionAssessment": "Correct", - "explanation": "Clicking 'Search' was appropriate to display the results." -} - -Or for an incorrect action: - -{ - "changeSummary": "The page reloaded but the date fields were reset to defaults.", - "actionAssessment": "Incorrect", - "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.", - "suggestion": "Correct the date format or check for error messages." -} - -Please follow this structure at every step. Keep your responses concise and clear. Below are the details. - -Goal: {goal} - -LLM Plan: {plan} - -Previous Observation: {past_observation} - -Current Observation: {current_observation} - -Past summaries: {past_summaries} - -Action: {action} -""" - -ERROR_CLASSIFICATION_PROMPT = """ -You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. -Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors), -followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), -a set of labeled examples for reference (few-shot), and finally the classification task you must complete. - --------------------------------------------------------------------------------- -TAXONOMY DEFINITIONS --------------------------------------------------------------------------------- - -1. AGENT ERRORS -These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation. - - - Navigation & Planning Errors - The agent cannot construct or execute a correct sequence of actions to reach its goal - (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). - - - Interaction Execution Errors - The agent enters data in the wrong format, forgets to click "Submit" after typing, - repeats the same failing action without adaptation, or loses track of the changing webpage state. - - - Information Processing Errors - The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), - misconstrues relationships between pieces of information, or fails to validate data against task requirements. - - - Observation & Action Errors - The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) - or misaligns its actions (clicks the wrong element or stale link). - -2. LANGUAGE MODEL ERRORS -These errors result from the model's inability to correctly interpret or reason about the task at a higher level, -independent of the low-level web interactions. - - - Task Understanding Errors - The agent misreads or misunderstands the user's objective (goal interpretation), - loses crucial context (context loss), or performs actions beyond or short of the intended scope. - - - Reasoning Failures - The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, - or fails to prioritize important subtasks when handling complex goals. - -3. BENCHMARK & ENVIRONMENT ERRORS -These errors are external to the agent's logic and the language model's reasoning, -arising from flaws in the system, network, or evaluation framework itself. - - - System Errors - Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts). - - - Benchmark Design Errors - Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), - or inflexible evaluation systems that fail to account for valid alternative solutions. - --------------------------------------------------------------------------------- -INPUT DESCRIPTION --------------------------------------------------------------------------------- - -You will receive the following for each scenario: -1. User Goal - - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). - -2. Planning / Thought History - - The internal reasoning or plan the agent considered. May include branches of logic or key decision points. - -3. Current Observation (HTML / AX Tree Snippet) - - The webpage structure or state that the agent sees at a given point in time. - -4. Historical change summaries - - A list of summaries of changes in the observation that the agent has seen during the course of actions. - -5. Action History - - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) - along with immediate outcomes or errors. - -Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories. - --------------------------------------------------------------------------------- -FEW-SHOT CLASSIFICATION EXAMPLES --------------------------------------------------------------------------------- - -1) EXAMPLE A (Benchmark Error - Benchmark Design Error) - • Context: The agent correctly finds a cheaper product meeting the user's criteria, - but the benchmark expects a more expensive product and marks the solution as wrong. - • Classification: ["Benchmark Design Error"] - • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid - and does not allow an alternative correct solution. - -2) EXAMPLE B (Agent Error - Interaction Execution) - • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. - Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. - • Classification: ["Agent Error - Interaction Execution"] - • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action - without adaptation ("Action Repetition"). - -3) EXAMPLE C (Benchmark Error - Benchmark Design Error) - • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" - The query is ambiguous because "Upitts" is not a standard location. - The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region. - • Classification: ["Benchmark Design Error"] - • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), - leading the agent astray due to unclear context. - -4) EXAMPLE D (Language Model Error - Task Understanding) - • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' - that are older than 30 days and add a comment saying 'I can help fix this.'" - The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue - with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. - • Classification: ["Language Model Error - Task Understanding"] - • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, - it focused on creating a new issue. This is a misinterpretation of the instructions, - not a mechanical error in clicking or input format. - --------------------------------------------------------------------------------- -CLASSIFICATION TASK --------------------------------------------------------------------------------- - -1. Read through: - - The planning and thought history - - The action history - - The current HTML or AX Tree observation - - The user goal - -2. Decide if the failure is: - - An Agent Error (which subcategory/subcategories), - - A Language Model Error (which subcategory/subcategories), - - A Benchmark/Environment Error (which subcategory/subcategories), - - Or a combination thereof (multi-label if needed). - -3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. - -4. If the agent succeeds (no error), label the errorCategory accordingly as "Success". - -Output Format Example: -{ - "errorCategory": ["Agent Error - Navigation & Planning"], - "explanation": "The agent opened the wrong GitLab page and never recovered..." -} - -Please follow this structure at every step. Keep your responses concise and clear. Below are the details. - -Overall goal: {goal} - -LLM Plan and thought history: {plan} - -Current Observation: {current_observation} - -Historical change summaries: {historical_summaries} - -Action history: {action_history} -""" +from agentlab.analyze.error_analysis.summarizer_prompts import ( + CHANGE_SUMMARIZER_PROMPT, + ERROR_CLASSIFICATION_PROMPT, +) +from agentlab.analyze.inspect_results import summarize def _diff(past_obs, current_obs): @@ -218,25 +21,31 @@ def _diff(past_obs, current_obs): class ChangeSummarizer: llm: callable # language model - obs_formatter: callable + obs_formatter: callable = lambda x: x.get("axtree_txt", "No AXTREE available") use_diff: bool = False - def summarize( - self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str] - ) -> str: + def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str: """Produces, a summary of the effect of an action.""" - past_obs_message = self.obs_formatter(past_obs) - current_obs_message = self.obs_formatter(current_obs) + obs_message = self.obs_formatter(obs.obs) + next_obs_message = self.obs_formatter(next_obs.obs) - goal = past_obs["goal"] # Use goal object from agentlab + action = obs.action + + goal = obs.obs["goal"] # Use goal object from agentlab + # TODO(thibault): switch to 'goal_object' # Outsource everything to formatter - plan = past_obs["plan"] + if self.use_diff: - current_obs_message = _diff(past_obs_message, current_obs_message) + next_obs_message = _diff(obs_message, next_obs_message) return self.llm( self.make_prompt( - past_obs_message, action, current_obs_message, past_summaries, goal, plan + obs_message, + action, + next_obs_message, + past_summaries, + goal, + obs.obs.get("plan", "No plan available"), ) ) @@ -266,9 +75,20 @@ class EpisodeSummarizer: change_summarizer: ChangeSummarizer = None - def summarize(episode: list[StepInfo]) -> EpisodeAnalysis: + def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ... + + def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis: """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" - pass + summaries = self.make_change_summaries(exp_results) + + def make_change_summaries(self, exp_result: ExpResult) -> list[str]: + summaries = [] # type: list[str] + # this assumes that there is always an extra step at the end of the episode + # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info + # TODO:(thibault) make some checks or w/e + for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]): + summaries.append(self.change_summarizer.summarize(step, next_step, summaries)) + return summaries @dataclass diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py new file mode 100644 index 00000000..382c2805 --- /dev/null +++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py @@ -0,0 +1,202 @@ +CHANGE_SUMMARIZER_PROMPT = """ +You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, +you will receive the following pieces of information: + +1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'"). +2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries. +3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet). +4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'"). +5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available. + +YOUR TASK (each step): +A) SUMMARIZE THE CHANGE + - Describe what visibly changed between the previous observation (or diff) and the current observation. + For example, did a new panel open, did the form reset, did nothing happen, etc.? + +B) ASSESS THE ACTION + - Decide whether the agent's action seems helpful or correct given the user's main goal, + or if it appears incorrect/unhelpful. + - Briefly explain why. + +OUTPUT FORMAT (per step): +Return your analysis as a JSON-like structure, for example: + +{{ + "changeSummary": "A new search results panel appeared on the right side.", + "actionAssessment": "Correct", + "explanation": "Clicking 'Search' was appropriate to display the results." +}} + +Or for an incorrect action: + +{{ + "changeSummary": "The page reloaded but the date fields were reset to defaults.", + "actionAssessment": "Incorrect", + "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.", + "suggestion": "Correct the date format or check for error messages." +}} + +Please follow this structure at every step. Keep your responses concise and clear. Below are the details. + +Goal: {goal} + +LLM Plan: {plan} + +Current Observation: {past_observation} + +Next Observation: {current_observation} + +Past summaries: {past_summaries} + +Action: {action} +""" + +ERROR_CLASSIFICATION_PROMPT = """ +You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. +Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors), +followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), +a set of labeled examples for reference (few-shot), and finally the classification task you must complete. + +-------------------------------------------------------------------------------- +TAXONOMY DEFINITIONS +-------------------------------------------------------------------------------- + +1. AGENT ERRORS +These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation. + + - Navigation & Planning Errors + The agent cannot construct or execute a correct sequence of actions to reach its goal + (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). + + - Interaction Execution Errors + The agent enters data in the wrong format, forgets to click "Submit" after typing, + repeats the same failing action without adaptation, or loses track of the changing webpage state. + + - Information Processing Errors + The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), + misconstrues relationships between pieces of information, or fails to validate data against task requirements. + + - Observation & Action Errors + The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) + or misaligns its actions (clicks the wrong element or stale link). + +2. LANGUAGE MODEL ERRORS +These errors result from the model's inability to correctly interpret or reason about the task at a higher level, +independent of the low-level web interactions. + + - Task Understanding Errors + The agent misreads or misunderstands the user's objective (goal interpretation), + loses crucial context (context loss), or performs actions beyond or short of the intended scope. + + - Reasoning Failures + The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, + or fails to prioritize important subtasks when handling complex goals. + +3. BENCHMARK & ENVIRONMENT ERRORS +These errors are external to the agent's logic and the language model's reasoning, +arising from flaws in the system, network, or evaluation framework itself. + + - System Errors + Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts). + + - Benchmark Design Errors + Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), + or inflexible evaluation systems that fail to account for valid alternative solutions. + +-------------------------------------------------------------------------------- +INPUT DESCRIPTION +-------------------------------------------------------------------------------- + +You will receive the following for each scenario: +1. User Goal + - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). + +2. Planning / Thought History + - The internal reasoning or plan the agent considered. May include branches of logic or key decision points. + +3. Current Observation (HTML / AX Tree Snippet) + - The webpage structure or state that the agent sees at a given point in time. + +4. Historical change summaries + - A list of summaries of changes in the observation that the agent has seen during the course of actions. + +5. Action History + - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) + along with immediate outcomes or errors. + +Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories. + +-------------------------------------------------------------------------------- +FEW-SHOT CLASSIFICATION EXAMPLES +-------------------------------------------------------------------------------- + +1) EXAMPLE A (Benchmark Error - Benchmark Design Error) + • Context: The agent correctly finds a cheaper product meeting the user's criteria, + but the benchmark expects a more expensive product and marks the solution as wrong. + • Classification: ["Benchmark Design Error"] + • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid + and does not allow an alternative correct solution. + +2) EXAMPLE B (Agent Error - Interaction Execution) + • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. + Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. + • Classification: ["Agent Error - Interaction Execution"] + • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action + without adaptation ("Action Repetition"). + +3) EXAMPLE C (Benchmark Error - Benchmark Design Error) + • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" + The query is ambiguous because "Upitts" is not a standard location. + The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region. + • Classification: ["Benchmark Design Error"] + • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), + leading the agent astray due to unclear context. + +4) EXAMPLE D (Language Model Error - Task Understanding) + • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' + that are older than 30 days and add a comment saying 'I can help fix this.'" + The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue + with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. + • Classification: ["Language Model Error - Task Understanding"] + • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, + it focused on creating a new issue. This is a misinterpretation of the instructions, + not a mechanical error in clicking or input format. + +-------------------------------------------------------------------------------- +CLASSIFICATION TASK +-------------------------------------------------------------------------------- + +1. Read through: + - The planning and thought history + - The action history + - The current HTML or AX Tree observation + - The user goal + +2. Decide if the failure is: + - An Agent Error (which subcategory/subcategories), + - A Language Model Error (which subcategory/subcategories), + - A Benchmark/Environment Error (which subcategory/subcategories), + - Or a combination thereof (multi-label if needed). + +3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. + +4. If the agent succeeds (no error), label the errorCategory accordingly as "Success". + +Output Format Example: +{{ + "errorCategory": ["Agent Error - Navigation & Planning"], + "explanation": "The agent opened the wrong GitLab page and never recovered..." +}} + +Please follow this structure at every step. Keep your responses concise and clear. Below are the details. + +Overall goal: {goal} + +LLM Plan and thought history: {plan} + +Current Observation: {current_observation} + +Historical change summaries: {historical_summaries} + +Action history: {action_history} +""" diff --git a/tests/analyze/error_analysis/test_summarizer.py b/tests/analyze/error_analysis/test_summarizer.py new file mode 100644 index 00000000..e9fe0ecc --- /dev/null +++ b/tests/analyze/error_analysis/test_summarizer.py @@ -0,0 +1,22 @@ +from pathlib import Path + +import pytest +from bgym import ExpResult, StepInfo + +from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer +from agentlab.analyze.inspect_results import yield_all_exp_results + + +@pytest.fixture(scope="module") +def exp_results() -> list[ExpResult]: + exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis" + return list(yield_all_exp_results(exp_dir)) + + +def test_change_summarizer(exp_results: list[ExpResult]): + summarizer = ChangeSummarizer(llm=lambda x: x) + step = exp_results[0].steps_info[0] + next_step = exp_results[0].steps_info[1] + past_summaries = [] + summary = summarizer.summarize(step, next_step, past_summaries) + assert isinstance(summary, str) From 394999bac9ffde98432e62e776fbb489d5e1e432 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Fri, 24 Jan 2025 14:41:10 -0500 Subject: [PATCH 10/25] chat_models can take str as input --- src/agentlab/llm/chat_api.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index 7392e666..fb65c3dd 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -4,7 +4,7 @@ import time from dataclasses import dataclass from functools import partial -from typing import Optional +from typing import Optional, Union import openai from huggingface_hub import InferenceClient @@ -13,7 +13,7 @@ import agentlab.llm.tracking as tracking from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs from agentlab.llm.huggingface_utils import HFBaseChatModel -from agentlab.llm.llm_utils import AIMessage, Discussion +from agentlab.llm.llm_utils import AIMessage, Discussion, HumanMessage def make_system_message(content: str) -> dict: @@ -261,7 +261,13 @@ def __init__( **client_args, ) - def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float = None) -> dict: + def __call__( + self, messages: Union[str, list[dict]], n_samples: int = 1, temperature: float = None + ) -> dict: + + if isinstance(messages, str): + messages = [HumanMessage(messages)] + # Initialize retry tracking attributes self.retries = 0 self.success = False From e0e786cfbc9530866a17ea662a41add384af4e51 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Fri, 24 Jan 2025 14:41:19 -0500 Subject: [PATCH 11/25] typing --- src/agentlab/llm/llm_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py index ec608686..16920336 100644 --- a/src/agentlab/llm/llm_configs.py +++ b/src/agentlab/llm/llm_configs.py @@ -16,7 +16,7 @@ "test", ] -CHAT_MODEL_ARGS_DICT = { +CHAT_MODEL_ARGS_DICT = { # type: dict[str, Union[AzureModelArgs, OpenAIModelArgs, SelfHostedModelArgs, OpenRouterModelArgs]] "openai/gpt-4o-mini-2024-07-18": OpenAIModelArgs( model_name="gpt-4o-mini-2024-07-18", max_total_tokens=128_000, From 46d2c8c10646dc034584c80a68829e1d46dd40ae Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Tue, 28 Jan 2025 11:09:27 -0500 Subject: [PATCH 12/25] keep this here bc it's going to pop back up --- .../summary_df.csv | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv new file mode 100644 index 00000000..85b34311 --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv @@ -0,0 +1,2 @@ +avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost +1.0,0.0,1.0,1/1,0,0.0003 From 8a882ad58d23438319952b4f7802568473964bd8 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Tue, 28 Jan 2025 11:09:53 -0500 Subject: [PATCH 13/25] pipeline mvp --- .../analyze/error_analysis/pipeline.py | 24 ++++------ .../analyze/error_analysis/summarizer.py | 44 ++++++++++++++++--- .../error_analysis/summarizer_prompts.py | 14 +----- 3 files changed, 47 insertions(+), 35 deletions(-) diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index 305d00b4..62c313aa 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -6,7 +6,11 @@ from bgym import ExpResult -from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer, EpisodeSummarizer +from agentlab.analyze.error_analysis.summarizer import ( + ChangeSummarizer, + EpisodeErrorSummarizer, + EpisodeSummarizer, +) from agentlab.analyze.inspect_results import yield_all_exp_results @@ -24,7 +28,6 @@ class ErrorAnalysisPipeline: exp_dir: Path filter: str = None episode_summarizer: EpisodeSummarizer = None - analyzer: Analyzer = None def filter_exp_results(self) -> Generator[ExpResult, None, None]: # TODO:(thibault) improve filtering @@ -37,23 +40,16 @@ def run_analysis(self): filtered_results = self.filter_exp_results() for exp_result in filtered_results: - episode_summary = self.episode_summarizer(exp_result) - error_analysis = self.analyze_errors(exp_result, episode_summary) + error_analysis = self.episode_summarizer(exp_result) self.save_analysis(exp_result, error_analysis) - def analyze_errors( - self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str] - ) -> str: - error_analysis = self.analyzer(exp_result, episode_analysis, step_analysis) - return error_analysis - def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True): """Save the analysis to json""" analysis_path = exp_result.exp_dir / "error_analysis.json" if not exists_ok and analysis_path.exists(): raise FileExistsError(f"{analysis_path} already exists") with analysis_path.open("w") as f: - json.dump(error_analysis, f) + json.dump(error_analysis, f, indent=4) if __name__ == "__main__": @@ -67,8 +63,6 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T exp_dir = Path(args.exp_dir) filter = args.filter - import openai - from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model() @@ -79,9 +73,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T pipeline = ErrorAnalysisPipeline( exp_dir=exp_dir, filter=filter, - episode_summarizer=EpisodeSummarizer(), - step_summarizer=ChangeSummarizer(), - analyzer=Analyzer("prompt"), + episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm), llm), ) pipeline.run_analysis() diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index 7c5f9b03..5c1fc343 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -7,6 +7,7 @@ ERROR_CLASSIFICATION_PROMPT, ) from agentlab.analyze.inspect_results import summarize +from agentlab.llm.llm_utils import json_parser def _diff(past_obs, current_obs): @@ -21,7 +22,7 @@ def _diff(past_obs, current_obs): class ChangeSummarizer: llm: callable # language model - obs_formatter: callable = lambda x: x.get("axtree_txt", "No AXTREE available") + obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available") use_diff: bool = False def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str: @@ -74,12 +75,25 @@ class EpisodeAnalysis: class EpisodeSummarizer: change_summarizer: ChangeSummarizer = None + llm: callable = None + parser: callable = lambda x: json_parser(x)[0] def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ... def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis: """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" + + if exp_results.steps_info[-1].reward == 1: + return {"analysis": "Success", "summaries": {}} + summaries = self.make_change_summaries(exp_results) + prompt = self.make_prompt(exp_results, summaries) + raw_analysis = self.llm(prompt)["content"] + analysis = self.parser(raw_analysis) + return { + "analysis": analysis, + "summaries": {i: self.parser(a) for i, a in enumerate(summaries)}, + } def make_change_summaries(self, exp_result: ExpResult) -> list[str]: summaries = [] # type: list[str] @@ -87,7 +101,9 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]: # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info # TODO:(thibault) make some checks or w/e for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]): - summaries.append(self.change_summarizer.summarize(step, next_step, summaries)) + summaries.append( + self.change_summarizer.summarize(step, next_step, summaries)["content"] + ) return summaries @@ -96,12 +112,26 @@ class EpisodeErrorSummarizer(EpisodeSummarizer): change_summarizer: ChangeSummarizer = None - def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan): + def make_prompt(self, exp_results: ExpResult, summaries: list[str]): """TODO: Implement the prompt.""" + goal = exp_results.steps_info[0].obs["goal"] + + txt_summaries = "\n".join(summaries) + + thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]] + actions = [step.action for step in exp_results.steps_info[:-1]] + action_errors = "\n".join( + [step.obs["last_action_error"] for step in exp_results.steps_info[1:]] + ) + + txt_actions = "\n".join( + [ + f"Thoughts: {thought}\nAction: {action}\nAction Error: {action_error}" + for action, thought, action_error in zip(actions, thoughts, action_errors) + ] + ) return ERROR_CLASSIFICATION_PROMPT.format( goal=goal, - plan=plan, - current_observation=current_observation, - historical_summaries=historical_summaries, - action_history=action_history, + historical_summaries=txt_summaries, + action_history=txt_actions, ) diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py index 382c2805..a37be0a9 100644 --- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py +++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py @@ -110,17 +110,11 @@ You will receive the following for each scenario: 1. User Goal - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). - -2. Planning / Thought History - - The internal reasoning or plan the agent considered. May include branches of logic or key decision points. -3. Current Observation (HTML / AX Tree Snippet) - - The webpage structure or state that the agent sees at a given point in time. - -4. Historical change summaries +2. Historical change summaries - A list of summaries of changes in the observation that the agent has seen during the course of actions. -5. Action History +3. Action History - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) along with immediate outcomes or errors. @@ -192,10 +186,6 @@ Overall goal: {goal} -LLM Plan and thought history: {plan} - -Current Observation: {current_observation} - Historical change summaries: {historical_summaries} Action history: {action_history} From 3fab5b4fc1d75a899f30a8b1d8263021b41dad8b Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Tue, 28 Jan 2025 11:10:21 -0500 Subject: [PATCH 14/25] added a specific tab and viz for it in xray --- src/agentlab/analyze/agent_xray.py | 46 ++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 9764898c..7466db87 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -1,4 +1,5 @@ import base64 +import json import os import traceback from copy import deepcopy @@ -30,6 +31,32 @@ TASK_SEED_KEY = "env.task_seed" +def dict_to_markdown(data, level=1): + """ + Convert a nested dictionary to a Markdown string with hierarchical headers. + + Parameters: + data (dict): The dictionary to convert. + level (int): The current header level (default is 1). + + Returns: + str: The formatted Markdown string. + """ + markdown = "" + + for key, value in data.items(): + if isinstance(value, dict): + # Add a header for the key and recursively process the dictionary + markdown += f"{'#' * level} {key}\n" + markdown += dict_to_markdown(value, level + 1) + else: + # Add the key-value pair with indentation + markdown += f"{'#' * level} {key}\n" + markdown += f" {value}\n" + + return markdown + + def display_table(df: pd.DataFrame): df = df.copy() df.columns = clean_column_names(df.columns) @@ -358,6 +385,9 @@ def run_gradio(results_dir: Path): with gr.Tab("Task Error") as tab_error: task_error = gr.Markdown() + with gr.Tab("Error Analysis") as tab_error_analysis: + error_analysis = gr.Markdown() + with gr.Tab("Logs") as tab_logs: logs = gr.Code(language=None, **code_args) @@ -485,6 +515,7 @@ def run_gradio(results_dir: Path): tab_axtree.select(fn=update_axtree, outputs=axtree_code) tab_chat.select(fn=update_chat_messages, outputs=chat_messages) tab_error.select(fn=update_task_error, outputs=task_error) + tab_error_analysis.select(fn=update_error_analysis, outputs=error_analysis) tab_logs.select(fn=update_logs, outputs=logs) tab_stats.select(fn=update_stats, outputs=stats) tab_agent_info_html.select(fn=update_agent_info_html, outputs=agent_info_html) @@ -612,6 +643,20 @@ def update_task_error(): return "No Task Error" +def update_error_analysis(): + global info + try: + error_analysis = info.exp_result.exp_dir / "error_analysis.json" + if not error_analysis.exists(): + return "No Error Analysis Found" + with error_analysis.open("r") as f: + json_data = json.load(f) + res = dict_to_markdown(json_data) + return res + except FileNotFoundError: + return "No Error Analysis" + + def update_logs(): global info try: @@ -1200,3 +1245,4 @@ def main(): if __name__ == "__main__": main() + main() From 2be23e5270ac3eb7ef99d675c3a9303548d4c733 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Tue, 28 Jan 2025 11:12:37 -0500 Subject: [PATCH 15/25] added formatting options --- src/agentlab/analyze/error_analysis/pipeline.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index 62c313aa..4330cce2 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -52,6 +52,10 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T json.dump(error_analysis, f, indent=4) +AXTREE_FORMATTER = lambda x: x.get("axtree_txt", "No AXTREE available") +HTML_FORMATTER = lambda x: x.get("pruned_html", "No HTML available") + + if __name__ == "__main__": import argparse @@ -73,7 +77,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T pipeline = ErrorAnalysisPipeline( exp_dir=exp_dir, filter=filter, - episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm), llm), + episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, HTML_FORMATTER), llm), ) pipeline.run_analysis() From 41f8f69f1719954f934a49dc03a79bc0a1db0b1b Mon Sep 17 00:00:00 2001 From: Megh Thakkar Date: Wed, 29 Jan 2025 13:22:53 -0500 Subject: [PATCH 16/25] Update summarizer_prompts.py --- .../error_analysis/summarizer_prompts.py | 106 +++++++----------- 1 file changed, 39 insertions(+), 67 deletions(-) diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py index a37be0a9..2b893d6e 100644 --- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py +++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py @@ -53,55 +53,37 @@ ERROR_CLASSIFICATION_PROMPT = """ You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. -Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors), -followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), +Below are the high-level definitions of each category, +followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.), a set of labeled examples for reference (few-shot), and finally the classification task you must complete. -------------------------------------------------------------------------------- TAXONOMY DEFINITIONS -------------------------------------------------------------------------------- -1. AGENT ERRORS -These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation. +1. Navigation & Planning Errors + The agent cannot construct or execute a correct sequence of actions to reach its goal + (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). - - Navigation & Planning Errors - The agent cannot construct or execute a correct sequence of actions to reach its goal - (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). +2. Interaction Execution Errors + The agent enters data in the wrong format, forgets to click "Submit" after typing, + repeats the same failing action without adaptation, or loses track of the changing webpage state. - - Interaction Execution Errors - The agent enters data in the wrong format, forgets to click "Submit" after typing, - repeats the same failing action without adaptation, or loses track of the changing webpage state. +3. Information Processing Errors + The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), + misconstrues relationships between pieces of information, or fails to validate data against task requirements. - - Information Processing Errors - The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), - misconstrues relationships between pieces of information, or fails to validate data against task requirements. +4. Observation & Action Errors + The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) + or misaligns its actions (clicks the wrong element or stale link). - - Observation & Action Errors - The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) - or misaligns its actions (clicks the wrong element or stale link). +5. Task Understanding Errors + The agent misreads or misunderstands the user's objective (goal interpretation), + loses crucial context (context loss), or performs actions beyond or short of the intended scope. -2. LANGUAGE MODEL ERRORS -These errors result from the model's inability to correctly interpret or reason about the task at a higher level, -independent of the low-level web interactions. - - - Task Understanding Errors - The agent misreads or misunderstands the user's objective (goal interpretation), - loses crucial context (context loss), or performs actions beyond or short of the intended scope. - - - Reasoning Failures - The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, - or fails to prioritize important subtasks when handling complex goals. - -3. BENCHMARK & ENVIRONMENT ERRORS -These errors are external to the agent's logic and the language model's reasoning, -arising from flaws in the system, network, or evaluation framework itself. - - - System Errors - Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts). - - - Benchmark Design Errors - Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), - or inflexible evaluation systems that fail to account for valid alternative solutions. +6. Reasoning Failures + The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, + or fails to prioritize important subtasks when handling complex goals. -------------------------------------------------------------------------------- INPUT DESCRIPTION @@ -124,34 +106,19 @@ FEW-SHOT CLASSIFICATION EXAMPLES -------------------------------------------------------------------------------- -1) EXAMPLE A (Benchmark Error - Benchmark Design Error) - • Context: The agent correctly finds a cheaper product meeting the user's criteria, - but the benchmark expects a more expensive product and marks the solution as wrong. - • Classification: ["Benchmark Design Error"] - • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid - and does not allow an alternative correct solution. - -2) EXAMPLE B (Agent Error - Interaction Execution) +1) EXAMPLE A (Interaction Execution) • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. - • Classification: ["Agent Error - Interaction Execution"] + • Classification: ["Interaction Execution"] • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action without adaptation ("Action Repetition"). -3) EXAMPLE C (Benchmark Error - Benchmark Design Error) - • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" - The query is ambiguous because "Upitts" is not a standard location. - The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region. - • Classification: ["Benchmark Design Error"] - • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), - leading the agent astray due to unclear context. - -4) EXAMPLE D (Language Model Error - Task Understanding) +2) EXAMPLE B (Task Understanding) • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' that are older than 30 days and add a comment saying 'I can help fix this.'" The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. - • Classification: ["Language Model Error - Task Understanding"] + • Classification: ["Task Understanding"] • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, it focused on creating a new issue. This is a misinterpretation of the instructions, not a mechanical error in clicking or input format. @@ -166,23 +133,28 @@ - The current HTML or AX Tree observation - The user goal -2. Decide if the failure is: - - An Agent Error (which subcategory/subcategories), - - A Language Model Error (which subcategory/subcategories), - - A Benchmark/Environment Error (which subcategory/subcategories), - - Or a combination thereof (multi-label if needed). +2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies. + If the task is successful, you can keep the error category as blank. 3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. -4. If the agent succeeds (no error), label the errorCategory accordingly as "Success". +Output format example for an unsuccessful interaction: +{{ + "explanation": "The agent opened the wrong GitLab page and never recovered...", + "success": False, + "errorCategory": ["Navigation & Planning"], +}} -Output Format Example: +Output format example for a successful interaction: {{ - "errorCategory": ["Agent Error - Navigation & Planning"], - "explanation": "The agent opened the wrong GitLab page and never recovered..." + "explanation": "The agent opened the correct GitLab page and ...", + "success": True, + "errorCategory": [], }} -Please follow this structure at every step. Keep your responses concise and clear. Below are the details. +Please follow this structure at every step. Keep your responses concise and clear. + +Below are the details for the interaction. Overall goal: {goal} From 6163b47a7caf527bd08ee820f33a4cabb6d94b9f Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Wed, 29 Jan 2025 19:30:10 +0000 Subject: [PATCH 17/25] xml parsing --- .../analyze/error_analysis/pipeline.py | 4 +- .../analyze/error_analysis/summarizer.py | 29 ++++++++++---- .../error_analysis/summarizer_prompts.py | 39 +++++++++---------- 3 files changed, 41 insertions(+), 31 deletions(-) diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index 4330cce2..887a0ba3 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -69,7 +69,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT - llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model() + llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model() step_summarizer = ChangeSummarizer(llm, lambda x: x) episode_summarizer = EpisodeSummarizer() @@ -77,7 +77,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T pipeline = ErrorAnalysisPipeline( exp_dir=exp_dir, filter=filter, - episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, HTML_FORMATTER), llm), + episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm), ) pipeline.run_analysis() diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index 5c1fc343..7df5e754 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -7,7 +7,7 @@ ERROR_CLASSIFICATION_PROMPT, ) from agentlab.analyze.inspect_results import summarize -from agentlab.llm.llm_utils import json_parser +from agentlab.llm.llm_utils import json_parser, parse_html_tags def _diff(past_obs, current_obs): @@ -39,7 +39,7 @@ def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str] if self.use_diff: next_obs_message = _diff(obs_message, next_obs_message) - return self.llm( + return self.parse(self.llm( self.make_prompt( obs_message, action, @@ -48,7 +48,7 @@ def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str] goal, obs.obs.get("plan", "No plan available"), ) - ) + )['content']) def make_prompt( self, past_obs_message, action, current_obs_message, past_summaries, goal, plan @@ -63,6 +63,10 @@ def make_prompt( action=action, ) + def parse(self, raw_output: str) -> dict: + parsed_result = parse_html_tags(raw_output, keys=["changeSummary", "actionAssessment", "explanation", "suggestion"])[0] + return parsed_result + @dataclass class EpisodeAnalysis: @@ -83,13 +87,13 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ... def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis: """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" - if exp_results.steps_info[-1].reward == 1: - return {"analysis": "Success", "summaries": {}} + # if exp_results.steps_info[-1].reward == 1: + # return {"analysis": "Success", "summaries": {}} summaries = self.make_change_summaries(exp_results) prompt = self.make_prompt(exp_results, summaries) raw_analysis = self.llm(prompt)["content"] - analysis = self.parser(raw_analysis) + analysis = self.parse(raw_analysis) return { "analysis": analysis, "summaries": {i: self.parser(a) for i, a in enumerate(summaries)}, @@ -102,10 +106,13 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]: # TODO:(thibault) make some checks or w/e for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]): summaries.append( - self.change_summarizer.summarize(step, next_step, summaries)["content"] + self.change_summarizer.summarize(step, next_step, summaries) ) return summaries + def parse(self, raw_output: str) -> dict: + parsed_result = parse_html_tags(raw_output, keys=["explanation", "success", "errorCategory"])[0] + return parsed_result @dataclass class EpisodeErrorSummarizer(EpisodeSummarizer): @@ -116,7 +123,13 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]): """TODO: Implement the prompt.""" goal = exp_results.steps_info[0].obs["goal"] - txt_summaries = "\n".join(summaries) + def format_summary(summary): + res = '' + for key, value in summary.items(): + res += f"{key}: {value}\n" + return res + + txt_summaries = "\n".join([format_summary(summary) for summary in summaries]) thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]] actions = [step.action for step in exp_results.steps_info[:-1]] diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py index 2b893d6e..807f1a2c 100644 --- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py +++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py @@ -21,20 +21,19 @@ OUTPUT FORMAT (per step): Return your analysis as a JSON-like structure, for example: -{{ - "changeSummary": "A new search results panel appeared on the right side.", - "actionAssessment": "Correct", - "explanation": "Clicking 'Search' was appropriate to display the results." -}} +A new search results panel appeared on the right side. +Correct +Clicking 'Search' was appropriate to display the results. Or for an incorrect action: -{{ - "changeSummary": "The page reloaded but the date fields were reset to defaults.", - "actionAssessment": "Incorrect", - "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.", - "suggestion": "Correct the date format or check for error messages." -}} +The page reloaded but the date fields were reset to defaults. +Incorrect +The agent should have fixed the date format first instead of re-clicking 'Show report'. +Correct the date format or check for error messages. + + +Please use single quotes '' to quote elements from the page, so as not to create parsing issues. Please follow this structure at every step. Keep your responses concise and clear. Below are the details. @@ -139,19 +138,17 @@ 3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. Output format example for an unsuccessful interaction: -{{ - "explanation": "The agent opened the wrong GitLab page and never recovered...", - "success": False, - "errorCategory": ["Navigation & Planning"], -}} + +The agent opened the wrong GitLab page and never recovered... +False +["Navigation & Planning"] Output format example for a successful interaction: -{{ - "explanation": "The agent opened the correct GitLab page and ...", - "success": True, - "errorCategory": [], -}} +The agent opened the correct GitLab page and ... +True +[] + Please follow this structure at every step. Keep your responses concise and clear. Below are the details for the interaction. From a1f3416275ac8c5c616ba506db536ac6a5af598c Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Wed, 29 Jan 2025 19:33:49 +0000 Subject: [PATCH 18/25] fix --- src/agentlab/analyze/error_analysis/summarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index 7df5e754..fb2b47fe 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -96,7 +96,7 @@ def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis: analysis = self.parse(raw_analysis) return { "analysis": analysis, - "summaries": {i: self.parser(a) for i, a in enumerate(summaries)}, + "summaries": {i: a for i, a in enumerate(summaries)}, } def make_change_summaries(self, exp_result: ExpResult) -> list[str]: From a455d0d431b92491486f4b20591a455eff40f888 Mon Sep 17 00:00:00 2001 From: Leo Boisvert Date: Tue, 4 Feb 2025 20:50:40 +0000 Subject: [PATCH 19/25] add error analysis prediction validation script --- .../validate_analysis_predictions.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 src/agentlab/analyze/error_analysis/validate_analysis_predictions.py diff --git a/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py new file mode 100644 index 00000000..af5613bb --- /dev/null +++ b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py @@ -0,0 +1,31 @@ +from pathlib import Path +from agentlab.analyze.inspect_results import ( + load_result_df, +) +import json + + +def get_aggregate_statistics(exp_dir: Path): + """Get aggregate statistics for the experiment results.""" + results = load_result_df(exp_dir, filter=filter) + + +if __name__ == "__main__": + path = Path( + "/mnt/colab_public/data/ui_copilot/thibault/tmlr_exps/2024-10-23_14-17-47_5_agents_on_workarena_l1" + ) + results = load_result_df(path).reset_index() + results = results.loc[results["agent.chat_model.model_name"].str.contains("anthropic")] + success_predictions = [] + for dir in results["exp_dir"]: + error_analysis = Path(dir) / "error_analysis.json" + if error_analysis.exists(): + with open(error_analysis, "r") as f: + error_analysis = json.load(f) + task_success_prediction_str = error_analysis["analysis"]["success"] + task_success_prediction = True if task_success_prediction_str == "True" else False + success_predictions.append(task_success_prediction) + else: + success_predictions.append(None) + results["success_predictions"] = success_predictions + a = 1 From 82dbabad0289b56a02f6d5085ade2d1dbe9119aa Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 6 Feb 2025 11:01:40 -0500 Subject: [PATCH 20/25] black version update --- requirements.txt | 2 +- .../analyze/error_analysis/summarizer.py | 37 +++++++++++-------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/requirements.txt b/requirements.txt index c598b342..a59d4a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -black[jupyter]>=24.2.0 +black[jupyter]>=24.2.0,<25 blacken-docs pre-commit pytest==7.3.2 diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index fb2b47fe..14ab10ba 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -39,16 +39,18 @@ def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str] if self.use_diff: next_obs_message = _diff(obs_message, next_obs_message) - return self.parse(self.llm( - self.make_prompt( - obs_message, - action, - next_obs_message, - past_summaries, - goal, - obs.obs.get("plan", "No plan available"), - ) - )['content']) + return self.parse( + self.llm( + self.make_prompt( + obs_message, + action, + next_obs_message, + past_summaries, + goal, + obs.obs.get("plan", "No plan available"), + ) + )["content"] + ) def make_prompt( self, past_obs_message, action, current_obs_message, past_summaries, goal, plan @@ -64,7 +66,9 @@ def make_prompt( ) def parse(self, raw_output: str) -> dict: - parsed_result = parse_html_tags(raw_output, keys=["changeSummary", "actionAssessment", "explanation", "suggestion"])[0] + parsed_result = parse_html_tags( + raw_output, keys=["changeSummary", "actionAssessment", "explanation", "suggestion"] + )[0] return parsed_result @@ -105,15 +109,16 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]: # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info # TODO:(thibault) make some checks or w/e for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]): - summaries.append( - self.change_summarizer.summarize(step, next_step, summaries) - ) + summaries.append(self.change_summarizer.summarize(step, next_step, summaries)) return summaries def parse(self, raw_output: str) -> dict: - parsed_result = parse_html_tags(raw_output, keys=["explanation", "success", "errorCategory"])[0] + parsed_result = parse_html_tags( + raw_output, keys=["explanation", "success", "errorCategory"] + )[0] return parsed_result + @dataclass class EpisodeErrorSummarizer(EpisodeSummarizer): @@ -124,7 +129,7 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]): goal = exp_results.steps_info[0].obs["goal"] def format_summary(summary): - res = '' + res = "" for key, value in summary.items(): res += f"{key}: {value}\n" return res From 5fbbe57ae52e58d1f1f7ea1734cbcd95c25417a5 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 20 Feb 2025 15:41:53 -0500 Subject: [PATCH 21/25] phony command, joblib stuff, took think out of prompt --- pyproject.toml | 1 + .../analyze/error_analysis/pipeline.py | 41 +++++++++++++++---- .../analyze/error_analysis/summarizer.py | 19 +++++---- 3 files changed, 45 insertions(+), 16 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2a1e06c3..782b1f26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,3 +57,4 @@ exclude = ''' [project.scripts] agentlab-assistant = "agentlab.ui_assistant:main" agentlab-xray = "agentlab.analyze.agent_xray:main" +agentlab-analyze = "agentlab.analyze.error_analysis.pipeline:main" diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index 887a0ba3..f726891f 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -23,6 +23,11 @@ def __call__(self, *args, **kwds): return "analysis" +def analyze(exp_result, episode_summarizer, save_analysis_func): + error_analysis = episode_summarizer(exp_result) + save_analysis_func(exp_result, error_analysis) + + @dataclass class ErrorAnalysisPipeline: exp_dir: Path @@ -36,12 +41,21 @@ def filter_exp_results(self) -> Generator[ExpResult, None, None]: if self.filter is None or self.filter in str(exp_result.exp_dir): yield exp_result - def run_analysis(self): + def run_analysis(self, parallel=False, jobs=-1): filtered_results = self.filter_exp_results() - for exp_result in filtered_results: - error_analysis = self.episode_summarizer(exp_result) - self.save_analysis(exp_result, error_analysis) + if parallel: + import joblib + + joblib.Parallel(n_jobs=jobs, backend="threading")( + joblib.delayed(analyze)(exp_result, self.episode_summarizer, self.save_analysis) + for exp_result in filtered_results + ) + + else: + for exp_result in filtered_results: + error_analysis = self.episode_summarizer(exp_result) + self.save_analysis(exp_result, error_analysis) def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True): """Save the analysis to json""" @@ -56,28 +70,37 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T HTML_FORMATTER = lambda x: x.get("pruned_html", "No HTML available") -if __name__ == "__main__": +def main(): import argparse parser = argparse.ArgumentParser() parser.add_argument("-e", "--exp_dir", type=str) parser.add_argument("-f", "--filter", type=str, default=None) + parser.add_argument("-p", "--parallel", action="store_true") + parser.add_argument("-j", "--jobs", type=int, default=-1) args = parser.parse_args() + + assert args.exp_dir is not None, "Please provide an exp_dir, e.g., -e /path/to/exp_dir" + exp_dir = Path(args.exp_dir) filter = args.filter + parallel = args.parallel + jobs = args.jobs from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model() - step_summarizer = ChangeSummarizer(llm, lambda x: x) - episode_summarizer = EpisodeSummarizer() - pipeline = ErrorAnalysisPipeline( exp_dir=exp_dir, filter=filter, episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm), ) - pipeline.run_analysis() + pipeline.run_analysis(parallel=parallel, jobs=jobs) + + +if __name__ == "__main__": + + main() diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index 14ab10ba..0b667cd8 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -6,8 +6,8 @@ CHANGE_SUMMARIZER_PROMPT, ERROR_CLASSIFICATION_PROMPT, ) -from agentlab.analyze.inspect_results import summarize from agentlab.llm.llm_utils import json_parser, parse_html_tags +from agentlab.llm.tracking import set_tracker def _diff(past_obs, current_obs): @@ -94,14 +94,20 @@ def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis: # if exp_results.steps_info[-1].reward == 1: # return {"analysis": "Success", "summaries": {}} - summaries = self.make_change_summaries(exp_results) + with set_tracker("summary") as summaries_tracker: + summaries = self.make_change_summaries(exp_results) prompt = self.make_prompt(exp_results, summaries) - raw_analysis = self.llm(prompt)["content"] + + with set_tracker("analysis") as analysis_tracker: + raw_analysis = self.llm(prompt)["content"] analysis = self.parse(raw_analysis) - return { + res = { "analysis": analysis, "summaries": {i: a for i, a in enumerate(summaries)}, } + res.update(analysis_tracker.stats) + res.update(summaries_tracker.stats) + return res def make_change_summaries(self, exp_result: ExpResult) -> list[str]: summaries = [] # type: list[str] @@ -136,7 +142,6 @@ def format_summary(summary): txt_summaries = "\n".join([format_summary(summary) for summary in summaries]) - thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]] actions = [step.action for step in exp_results.steps_info[:-1]] action_errors = "\n".join( [step.obs["last_action_error"] for step in exp_results.steps_info[1:]] @@ -144,8 +149,8 @@ def format_summary(summary): txt_actions = "\n".join( [ - f"Thoughts: {thought}\nAction: {action}\nAction Error: {action_error}" - for action, thought, action_error in zip(actions, thoughts, action_errors) + f"Action: {action}\nAction Error: {action_error}" + for action, action_error in zip(actions, action_errors) ] ) return ERROR_CLASSIFICATION_PROMPT.format( From 3a3d602f83f363a3606ddee7de29b41b031c6d87 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 20 Feb 2025 16:05:12 -0500 Subject: [PATCH 22/25] task_info --- .../analyze/error_analysis/summarizer.py | 12 +++++++----- .../analyze/error_analysis/summarizer_prompts.py | 16 +++++----------- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index 0b667cd8..00a17e09 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -91,8 +91,8 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ... def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis: """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" - # if exp_results.steps_info[-1].reward == 1: - # return {"analysis": "Success", "summaries": {}} + if exp_results.steps_info[-1].reward == 1: + return {"analysis": "Success", "summaries": {}} with set_tracker("summary") as summaries_tracker: summaries = self.make_change_summaries(exp_results) @@ -119,9 +119,7 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]: return summaries def parse(self, raw_output: str) -> dict: - parsed_result = parse_html_tags( - raw_output, keys=["explanation", "success", "errorCategory"] - )[0] + parsed_result = parse_html_tags(raw_output, keys=["explanation", "errorCategory"])[0] return parsed_result @@ -153,8 +151,12 @@ def format_summary(summary): for action, action_error in zip(actions, action_errors) ] ) + + extra_info = exp_results.steps_info[-1].task_info + return ERROR_CLASSIFICATION_PROMPT.format( goal=goal, historical_summaries=txt_summaries, action_history=txt_actions, + extra_info=extra_info, ) diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py index 807f1a2c..91ea8d9c 100644 --- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py +++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py @@ -132,30 +132,24 @@ - The current HTML or AX Tree observation - The user goal -2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies. - If the task is successful, you can keep the error category as blank. +2. Decide the error category, or a combination thereof, under which the reason for failure lies. 3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. -Output format example for an unsuccessful interaction: +Output format example for an interaction: The agent opened the wrong GitLab page and never recovered... -False ["Navigation & Planning"] -Output format example for a successful interaction: - -The agent opened the correct GitLab page and ... -True -[] - Please follow this structure at every step. Keep your responses concise and clear. -Below are the details for the interaction. +Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant. Overall goal: {goal} Historical change summaries: {historical_summaries} Action history: {action_history} + +Extra information: {extra_info} """ From 5bf1bac08a99840843f213e14703b2b412be0d43 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 20 Feb 2025 16:14:12 -0500 Subject: [PATCH 23/25] added flag to oracle success or no --- .../analyze/error_analysis/pipeline.py | 6 +- .../analyze/error_analysis/summarizer.py | 15 ++- .../error_analysis/summarizer_prompts.py | 113 ++++++++++++++++++ 3 files changed, 130 insertions(+), 4 deletions(-) diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py index f726891f..32e5e9df 100644 --- a/src/agentlab/analyze/error_analysis/pipeline.py +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -78,6 +78,7 @@ def main(): parser.add_argument("-f", "--filter", type=str, default=None) parser.add_argument("-p", "--parallel", action="store_true") parser.add_argument("-j", "--jobs", type=int, default=-1) + parser.add_argument("-g", "--guess_success", action="store_true") args = parser.parse_args() @@ -87,6 +88,7 @@ def main(): filter = args.filter parallel = args.parallel jobs = args.jobs + guess_success = args.guess_success from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT @@ -95,7 +97,9 @@ def main(): pipeline = ErrorAnalysisPipeline( exp_dir=exp_dir, filter=filter, - episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm), + episode_summarizer=EpisodeErrorSummarizer( + ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=guess_success + ), ) pipeline.run_analysis(parallel=parallel, jobs=jobs) diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index 00a17e09..e184583d 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -5,6 +5,7 @@ from agentlab.analyze.error_analysis.summarizer_prompts import ( CHANGE_SUMMARIZER_PROMPT, ERROR_CLASSIFICATION_PROMPT, + ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT, ) from agentlab.llm.llm_utils import json_parser, parse_html_tags from agentlab.llm.tracking import set_tracker @@ -85,14 +86,16 @@ class EpisodeSummarizer: change_summarizer: ChangeSummarizer = None llm: callable = None parser: callable = lambda x: json_parser(x)[0] + guess_success: bool = False def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ... def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis: """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" - if exp_results.steps_info[-1].reward == 1: - return {"analysis": "Success", "summaries": {}} + if not self.guess_success: + if exp_results.steps_info[-1].reward == 1: + return {"analysis": "Success", "summaries": {}} with set_tracker("summary") as summaries_tracker: summaries = self.make_change_summaries(exp_results) @@ -154,7 +157,13 @@ def format_summary(summary): extra_info = exp_results.steps_info[-1].task_info - return ERROR_CLASSIFICATION_PROMPT.format( + prompt = ( + ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT + if self.guess_success + else ERROR_CLASSIFICATION_PROMPT + ) + + return prompt.format( goal=goal, historical_summaries=txt_summaries, action_history=txt_actions, diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py index 91ea8d9c..a0df9fc9 100644 --- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py +++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py @@ -50,6 +50,119 @@ Action: {action} """ +ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT = """ +You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. +Below are the high-level definitions of each category, +followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.), +a set of labeled examples for reference (few-shot), and finally the classification task you must complete. + +-------------------------------------------------------------------------------- +TAXONOMY DEFINITIONS +-------------------------------------------------------------------------------- + +1. Navigation & Planning Errors + The agent cannot construct or execute a correct sequence of actions to reach its goal + (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). + +2. Interaction Execution Errors + The agent enters data in the wrong format, forgets to click "Submit" after typing, + repeats the same failing action without adaptation, or loses track of the changing webpage state. + +3. Information Processing Errors + The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), + misconstrues relationships between pieces of information, or fails to validate data against task requirements. + +4. Observation & Action Errors + The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) + or misaligns its actions (clicks the wrong element or stale link). + +5. Task Understanding Errors + The agent misreads or misunderstands the user's objective (goal interpretation), + loses crucial context (context loss), or performs actions beyond or short of the intended scope. + +6. Reasoning Failures + The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, + or fails to prioritize important subtasks when handling complex goals. + +-------------------------------------------------------------------------------- +INPUT DESCRIPTION +-------------------------------------------------------------------------------- + +You will receive the following for each scenario: +1. User Goal + - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). + +2. Historical change summaries + - A list of summaries of changes in the observation that the agent has seen during the course of actions. + +3. Action History + - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) + along with immediate outcomes or errors. + +Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories. + +-------------------------------------------------------------------------------- +FEW-SHOT CLASSIFICATION EXAMPLES +-------------------------------------------------------------------------------- + +1) EXAMPLE A (Interaction Execution) + • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. + Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. + • Classification: ["Interaction Execution"] + • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action + without adaptation ("Action Repetition"). + +2) EXAMPLE B (Task Understanding) + • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' + that are older than 30 days and add a comment saying 'I can help fix this.'" + The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue + with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. + • Classification: ["Task Understanding"] + • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, + it focused on creating a new issue. This is a misinterpretation of the instructions, + not a mechanical error in clicking or input format. + +-------------------------------------------------------------------------------- +CLASSIFICATION TASK +-------------------------------------------------------------------------------- + +1. Read through: + - The planning and thought history + - The action history + - The current HTML or AX Tree observation + - The user goal + +2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies. + If the task is successful, you can keep the error category as blank. + +3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. + +Output format example for an unsuccessful interaction: + +The agent opened the wrong GitLab page and never recovered... +False +["Navigation & Planning"] + +Output format example for a successful interaction: + +The agent opened the correct GitLab page and ... +True +[] + +Please follow this structure at every step. Keep your responses concise and clear. + +Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant. + +Overall goal: {goal} + +Historical change summaries: {historical_summaries} + +Action history: {action_history} + +Extra information: {extra_info} +""" + + ERROR_CLASSIFICATION_PROMPT = """ You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. Below are the high-level definitions of each category, From 097213029f9575b88568ccf8eb127b9c4145b82e Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 20 Feb 2025 16:31:47 -0500 Subject: [PATCH 24/25] darglint --- .../analyze/error_analysis/base_idea.py | 287 ------------------ .../analyze/error_analysis/summarizer.py | 7 + 2 files changed, 7 insertions(+), 287 deletions(-) delete mode 100644 src/agentlab/analyze/error_analysis/base_idea.py diff --git a/src/agentlab/analyze/error_analysis/base_idea.py b/src/agentlab/analyze/error_analysis/base_idea.py deleted file mode 100644 index 5d4827d4..00000000 --- a/src/agentlab/analyze/error_analysis/base_idea.py +++ /dev/null @@ -1,287 +0,0 @@ -from dataclasses import dataclass - -from bgym import ExpResult, StepInfo - -CHANGE_SUMMARIZER_PROMPT = """ -You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, -you will receive the following pieces of information: - -1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'"). -2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries. -3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet). -4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'"). -5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available. - -YOUR TASK (each step): -A) SUMMARIZE THE CHANGE - - Describe what visibly changed between the previous observation (or diff) and the current observation. - For example, did a new panel open, did the form reset, did nothing happen, etc.? - -B) ASSESS THE ACTION - - Decide whether the agent's action seems helpful or correct given the user's main goal, - or if it appears incorrect/unhelpful. - - Briefly explain why. - -OUTPUT FORMAT (per step): -Return your analysis as a JSON-like structure, for example: - -{ - "changeSummary": "A new search results panel appeared on the right side.", - "actionAssessment": "Correct", - "explanation": "Clicking 'Search' was appropriate to display the results." -} - -Or for an incorrect action: - -{ - "changeSummary": "The page reloaded but the date fields were reset to defaults.", - "actionAssessment": "Incorrect", - "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.", - "suggestion": "Correct the date format or check for error messages." -} - -Please follow this structure at every step. Keep your responses concise and clear. Below are the details. - -Goal: {goal} - -LLM Plan: {plan} - -Previous Observation: {past_observation} - -Current Observation: {current_observation} - -Past summaries: {past_summaries} - -Action: {action} -""" - -ERROR_CLASSIFICATION_PROMPT = """ -You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. -Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors), -followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), -a set of labeled examples for reference (few-shot), and finally the classification task you must complete. - --------------------------------------------------------------------------------- -TAXONOMY DEFINITIONS --------------------------------------------------------------------------------- - -1. AGENT ERRORS -These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation. - - - Navigation & Planning Errors - The agent cannot construct or execute a correct sequence of actions to reach its goal - (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). - - - Interaction Execution Errors - The agent enters data in the wrong format, forgets to click "Submit" after typing, - repeats the same failing action without adaptation, or loses track of the changing webpage state. - - - Information Processing Errors - The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), - misconstrues relationships between pieces of information, or fails to validate data against task requirements. - - - Observation & Action Errors - The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) - or misaligns its actions (clicks the wrong element or stale link). - -2. LANGUAGE MODEL ERRORS -These errors result from the model's inability to correctly interpret or reason about the task at a higher level, -independent of the low-level web interactions. - - - Task Understanding Errors - The agent misreads or misunderstands the user's objective (goal interpretation), - loses crucial context (context loss), or performs actions beyond or short of the intended scope. - - - Reasoning Failures - The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, - or fails to prioritize important subtasks when handling complex goals. - -3. BENCHMARK & ENVIRONMENT ERRORS -These errors are external to the agent's logic and the language model's reasoning, -arising from flaws in the system, network, or evaluation framework itself. - - - System Errors - Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts). - - - Benchmark Design Errors - Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), - or inflexible evaluation systems that fail to account for valid alternative solutions. - --------------------------------------------------------------------------------- -INPUT DESCRIPTION --------------------------------------------------------------------------------- - -You will receive the following for each scenario: -1. User Goal - - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). - -2. Planning / Thought History - - The internal reasoning or plan the agent considered. May include branches of logic or key decision points. - -3. Current Observation (HTML / AX Tree Snippet) - - The webpage structure or state that the agent sees at a given point in time. - -4. Historical change summaries - - A list of summaries of changes in the observation that the agent has seen during the course of actions. - -5. Action History - - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) - along with immediate outcomes or errors. - -Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories. - --------------------------------------------------------------------------------- -FEW-SHOT CLASSIFICATION EXAMPLES --------------------------------------------------------------------------------- - -1) EXAMPLE A (Benchmark Error - Benchmark Design Error) - • Context: The agent correctly finds a cheaper product meeting the user's criteria, - but the benchmark expects a more expensive product and marks the solution as wrong. - • Classification: ["Benchmark Design Error"] - • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid - and does not allow an alternative correct solution. - -2) EXAMPLE B (Agent Error - Interaction Execution) - • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. - Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. - • Classification: ["Agent Error - Interaction Execution"] - • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action - without adaptation ("Action Repetition"). - -3) EXAMPLE C (Benchmark Error - Benchmark Design Error) - • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" - The query is ambiguous because "Upitts" is not a standard location. - The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region. - • Classification: ["Benchmark Design Error"] - • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), - leading the agent astray due to unclear context. - -4) EXAMPLE D (Language Model Error - Task Understanding) - • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' - that are older than 30 days and add a comment saying 'I can help fix this.'" - The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue - with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. - • Classification: ["Language Model Error - Task Understanding"] - • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, - it focused on creating a new issue. This is a misinterpretation of the instructions, - not a mechanical error in clicking or input format. - --------------------------------------------------------------------------------- -CLASSIFICATION TASK --------------------------------------------------------------------------------- - -1. Read through: - - The planning and thought history - - The action history - - The current HTML or AX Tree observation - - The user goal - -2. Decide if the failure is: - - An Agent Error (which subcategory/subcategories), - - A Language Model Error (which subcategory/subcategories), - - A Benchmark/Environment Error (which subcategory/subcategories), - - Or a combination thereof (multi-label if needed). - -3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. - -4. If the agent succeeds (no error), label the errorCategory accordingly as "Success". - -Output Format Example: -{ - "errorCategory": ["Agent Error - Navigation & Planning"], - "explanation": "The agent opened the wrong GitLab page and never recovered..." -} - -Please follow this structure at every step. Keep your responses concise and clear. Below are the details. - -Overall goal: {goal} - -LLM Plan and thought history: {plan} - -Current Observation: {current_observation} - -Historical change summaries: {historical_summaries} - -Action history: {action_history} -""" - - -def _diff(past_obs, current_obs): - """TODO: Implement the diff function. - - Returns a diff version of current_obs compares to past_obs, unless there is too many changes. - """ - raise ValueError("Not implemented yet.") - - -@dataclass -class ChangeSummarizer: - - llm: callable # language model - obs_formatter: callable - use_diff: bool = False - - def summarize( - self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str] - ) -> str: - """Produces, a summary of the effect of an action.""" - past_obs_message = self.obs_formatter(past_obs) - current_obs_message = self.obs_formatter(current_obs) - - goal = past_obs["goal"] # Use goal object from agentlab - # Outsource everything to formatter - plan = past_obs["plan"] - if self.use_diff: - current_obs_message = _diff(past_obs_message, current_obs_message) - - return self.llm( - self.make_prompt( - past_obs_message, action, current_obs_message, past_summaries, goal, plan - ) - ) - - def make_prompt( - self, past_obs_message, action, current_obs_message, past_summaries, goal, plan - ): - """TODO: Implement the prompt.""" - return CHANGE_SUMMARIZER_PROMPT.format( - goal=goal, - plan=plan, - past_observation=past_obs_message, - current_observation=current_obs_message, - past_summaries=past_summaries, - action=action, - ) - - -@dataclass -class EpisodeAnalysis: - analysis: str # complete analysis of the episode - summary: str # short summary of the analysis - categories: dict[str, float] # score for each category e.g. type of error or difficulty levels - - -@dataclass -class EpisodeSummarizer: - - change_summarizer: ChangeSummarizer = None - - def summarize(exp_results: list[ExpResult], change_summaries: list[str]) -> EpisodeAnalysis: - """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" - pass - - -@dataclass -class EpisodeErrorSummarizer(EpisodeSummarizer): - - change_summarizer: ChangeSummarizer = None - - def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan): - """TODO: Implement the prompt.""" - return ERROR_CLASSIFICATION_PROMPT.format( - goal=goal, - plan=plan, - current_observation=current_observation, - historical_summaries=historical_summaries, - action_history=action_history, - ) diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py index e184583d..2919f052 100644 --- a/src/agentlab/analyze/error_analysis/summarizer.py +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -15,6 +15,13 @@ def _diff(past_obs, current_obs): """TODO: Implement the diff function. Returns a diff version of current_obs compares to past_obs, unless there is too many changes. + + Args: + past_obs: The past observation. + current_obs: The current observation. + + Raises: + ValueError: Not implemented yet. """ raise ValueError("Not implemented yet.") From 46d10754c3d9220b48d07319982f90ab7f5e2042 Mon Sep 17 00:00:00 2001 From: ThibaultLSDC Date: Thu, 20 Feb 2025 17:00:22 -0500 Subject: [PATCH 25/25] tests --- tests/analyze/error_analysis/test_pipeline.py | 26 ++----------------- .../analyze/error_analysis/test_summarizer.py | 12 +++++++-- 2 files changed, 12 insertions(+), 26 deletions(-) diff --git a/tests/analyze/error_analysis/test_pipeline.py b/tests/analyze/error_analysis/test_pipeline.py index f9570c2b..a2f6295d 100644 --- a/tests/analyze/error_analysis/test_pipeline.py +++ b/tests/analyze/error_analysis/test_pipeline.py @@ -16,7 +16,7 @@ def summarize( class MockEpisodeSummarizer: - def summarize(self, exp_result: ExpResult, step_analysis: list[str]) -> str: + def __call__(self, exp_result: ExpResult) -> str: return f"Agent did actions {', '.join(step.action for step in exp_result.steps_info if step.action)}" @@ -33,8 +33,6 @@ def pipeline() -> ErrorAnalysisPipeline: exp_dir=exp_dir, filter=None, episode_summarizer=MockEpisodeSummarizer(), - step_summarizer=MockStepSummarizer(), - analyzer=MockAnalyzer(), ) @@ -49,30 +47,10 @@ def test_yield_with_filter(pipeline: ErrorAnalysisPipeline): pipeline.filter = None -def test_analyze_step(pipeline: ErrorAnalysisPipeline): - exp_result = next(pipeline.filter_exp_results()) - step_analysis = pipeline.analyze_step(exp_result) - - assert len(exp_result.steps_info) == len(step_analysis) + 1 - assert step_analysis[0] == f"Agent took action {exp_result.steps_info[0].action} at step 0" - - -def test_analyze_episode(pipeline: ErrorAnalysisPipeline): - exp_result = next(pipeline.filter_exp_results()) - step_analysis = pipeline.analyze_step(exp_result) - episode_analysis = pipeline.analyze_episode(exp_result, step_analysis) - - for step_info in exp_result.steps_info: - if step_info.action: - assert step_info.action in episode_analysis - - def test_save_analysis(pipeline: ErrorAnalysisPipeline): exp_result = next(pipeline.filter_exp_results()) - step_analysis = pipeline.analyze_step(exp_result) - episode_analysis = pipeline.analyze_episode(exp_result, step_analysis) - error_analysis = pipeline.analyze_errors(exp_result, episode_analysis, step_analysis) + error_analysis = pipeline.episode_summarizer(exp_result) pipeline.save_analysis(exp_result, error_analysis, exists_ok=False) assert (exp_result.exp_dir / "error_analysis.json").exists() diff --git a/tests/analyze/error_analysis/test_summarizer.py b/tests/analyze/error_analysis/test_summarizer.py index e9fe0ecc..83418496 100644 --- a/tests/analyze/error_analysis/test_summarizer.py +++ b/tests/analyze/error_analysis/test_summarizer.py @@ -13,10 +13,18 @@ def exp_results() -> list[ExpResult]: return list(yield_all_exp_results(exp_dir)) +@pytest.mark.pricy def test_change_summarizer(exp_results: list[ExpResult]): - summarizer = ChangeSummarizer(llm=lambda x: x) + summarizer = ChangeSummarizer(llm=lambda x: {"content": x}) step = exp_results[0].steps_info[0] next_step = exp_results[0].steps_info[1] past_summaries = [] summary = summarizer.summarize(step, next_step, past_summaries) - assert isinstance(summary, str) + assert isinstance(summary, dict) + + +if __name__ == "__main__": + exp_res = list( + yield_all_exp_results(Path(__file__).parent.parent.parent / "data/error_analysis") + ) + test_change_summarizer(exp_res)