From 048a622ba4e3b072a29b151eaaf12c2377e88e21 Mon Sep 17 00:00:00 2001
From: recursix <alex.lacoste.shmu@gmail.com>
Date: Fri, 17 Jan 2025 15:29:52 -0500
Subject: [PATCH 01/25] Add initial implementation of ChangeSummarizer and
 EpisodeAnalysis classes

---
 src/agentlab/analyze/error_analysis.py | 50 ++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 src/agentlab/analyze/error_analysis.py

diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py
new file mode 100644
index 00000000..a0fbdb43
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis.py
@@ -0,0 +1,50 @@
+from dataclasses import dataclass
+from bgym import StepInfo
+
+
+def _diff(past_obs, current_obs):
+    """TODO: Implement the diff function.
+
+    Returns a diff version of current_obs compares to past_obs, unless there is too many changes.
+    """
+    raise ValueError("Not implemented yet.")
+
+
+@dataclass
+class ChangeSummarizer:
+
+    llm: callable  # language model
+    obs_formatter: callable
+    use_diff: bool = False
+
+    def summarize(
+        self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str]
+    ) -> str:
+        """Produces, a summary of the effect of an action."""
+        past_obs_message = self.obs_formatter(past_obs)
+        current_obs_message = self.obs_formatter(current_obs)
+        if self.use_diff:
+            current_obs_message = _diff(past_obs_message, current_obs_message)
+
+        return self.llm(self.make_prompt(past_obs_message, current_obs_message, action))
+
+    def make_prompt(self, past_obs_message, action, current_obs_message, past_summaries):
+        """TODO: Implement the prompt."""
+        return f"{past_obs_message} {action} {current_obs_message}"
+
+
+@dataclass
+class EpisodeAnalysis:
+    analysis: str  # complete analysis of the episode
+    summary: str  # short summary of the analysis
+    categories: dict[str, float]  # score for each category e.g. type of error or difficulty levels
+
+
+@dataclass
+class EpisodeSummarizer:
+
+    cange_summarizer: ChangeSummarizer = None
+
+    def summarize(episode: list[StepInfo]) -> EpisodeAnalysis:
+        """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
+        pass

From fd8fd95d74d023b90d7803f555e8a4f3fc7f285b Mon Sep 17 00:00:00 2001
From: Megh Thakkar <Megh-Thakkar@users.noreply.github.com>
Date: Tue, 21 Jan 2025 00:04:37 -0500
Subject: [PATCH 02/25] Added change summarizer prompt

---
 src/agentlab/analyze/error_analysis.py | 74 ++++++++++++++++++++++++--
 1 file changed, 71 insertions(+), 3 deletions(-)

diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py
index a0fbdb43..0a869522 100644
--- a/src/agentlab/analyze/error_analysis.py
+++ b/src/agentlab/analyze/error_analysis.py
@@ -1,6 +1,59 @@
 from dataclasses import dataclass
 from bgym import StepInfo
 
+CHANGE_SUMMARIZER_PROMPT = """
+You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, 
+you will receive the following pieces of information:
+
+1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
+2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
+3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
+4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
+5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
+
+YOUR TASK (each step):
+A) SUMMARIZE THE CHANGE
+   - Describe what visibly changed between the previous observation (or diff) and the current observation. 
+     For example, did a new panel open, did the form reset, did nothing happen, etc.?
+
+B) ASSESS THE ACTION
+   - Decide whether the agent's action seems helpful or correct given the user's main goal, 
+     or if it appears incorrect/unhelpful. 
+   - Briefly explain why.
+
+OUTPUT FORMAT (per step):
+Return your analysis as a JSON-like structure, for example:
+
+{
+  "changeSummary": "A new search results panel appeared on the right side.",
+  "actionAssessment": "Correct",
+  "explanation": "Clicking 'Search' was appropriate to display the results."
+}
+
+Or for an incorrect action:
+
+{
+  "changeSummary": "The page reloaded but the date fields were reset to defaults.",
+  "actionAssessment": "Incorrect",
+  "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
+  "suggestion": "Correct the date format or check for error messages."
+}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Goal: {goal}
+
+LLM Plan: {plan}
+
+Previous Observation: {past_observation}
+
+Current Observation: {current_observation}
+
+Past summaries: {past_summaries}
+
+Action: {action}
+"""
+
 
 def _diff(past_obs, current_obs):
     """TODO: Implement the diff function.
@@ -23,14 +76,29 @@ def summarize(
         """Produces, a summary of the effect of an action."""
         past_obs_message = self.obs_formatter(past_obs)
         current_obs_message = self.obs_formatter(current_obs)
+        goal = past_obs["goal"]
+        plan = past_obs["plan"]
         if self.use_diff:
             current_obs_message = _diff(past_obs_message, current_obs_message)
 
-        return self.llm(self.make_prompt(past_obs_message, current_obs_message, action))
+        return self.llm(
+            self.make_prompt(
+                past_obs_message, action, current_obs_message, past_summaries, goal, plan
+            )
+        )
 
-    def make_prompt(self, past_obs_message, action, current_obs_message, past_summaries):
+    def make_prompt(
+        self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
+    ):
         """TODO: Implement the prompt."""
-        return f"{past_obs_message} {action} {current_obs_message}"
+        return CHANGE_SUMMARIZER_PROMPT.format(
+            goal=goal,
+            plan=plan,
+            past_observation=past_obs_message,
+            current_observation=current_obs_message,
+            past_summaries=past_summaries,
+            action=action,
+        )
 
 
 @dataclass

From b8c85b101c2acaf2912a5b063a228f1e1f4cd334 Mon Sep 17 00:00:00 2001
From: Megh Thakkar <Megh-Thakkar@users.noreply.github.com>
Date: Tue, 21 Jan 2025 01:02:17 -0500
Subject: [PATCH 03/25] Added error classification prompt

---
 src/agentlab/analyze/error_analysis.py | 168 ++++++++++++++++++++++++-
 1 file changed, 167 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py
index 0a869522..07406a43 100644
--- a/src/agentlab/analyze/error_analysis.py
+++ b/src/agentlab/analyze/error_analysis.py
@@ -54,6 +54,156 @@
 Action: {action}
 """
 
+ERROR_CLASSIFICATION_PROMPT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
+Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
+followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), 
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. AGENT ERRORS
+These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
+
+   - Navigation & Planning Errors
+     The agent cannot construct or execute a correct sequence of actions to reach its goal 
+     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+   - Interaction Execution Errors
+     The agent enters data in the wrong format, forgets to click "Submit" after typing, 
+     repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+   - Information Processing Errors
+     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
+     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+   - Observation & Action Errors
+     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+     or misaligns its actions (clicks the wrong element or stale link).
+
+2. LANGUAGE MODEL ERRORS
+These errors result from the model's inability to correctly interpret or reason about the task at a higher level, 
+independent of the low-level web interactions.
+
+   - Task Understanding Errors
+     The agent misreads or misunderstands the user's objective (goal interpretation), 
+     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+   - Reasoning Failures
+     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
+     or fails to prioritize important subtasks when handling complex goals.
+
+3. BENCHMARK & ENVIRONMENT ERRORS
+These errors are external to the agent's logic and the language model's reasoning, 
+arising from flaws in the system, network, or evaluation framework itself.
+
+   - System Errors
+     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
+
+   - Benchmark Design Errors
+     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), 
+     or inflexible evaluation systems that fail to account for valid alternative solutions.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+   
+2. Planning / Thought History
+   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
+
+3. Current Observation (HTML / AX Tree Snippet)
+   - The webpage structure or state that the agent sees at a given point in time.
+
+4. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+5. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Benchmarl Error - Benchmark Design Error)
+   • Context: The agent correctly finds a cheaper product meeting the user's criteria, 
+     but the benchmark expects a more expensive product and marks the solution as wrong.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid 
+     and does not allow an alternative correct solution.
+
+2) EXAMPLE B (Agent Error - Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Agent Error - Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
+     without adaptation ("Action Repetition").
+
+3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
+   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" 
+     The query is ambiguous because "Upitts" is not a standard location. 
+     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), 
+     leading the agent astray due to unclear context.
+
+4) EXAMPLE D (Language Model Error - Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
+     that are older than 30 days and add a comment saying 'I can help fix this.'" 
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Language Model Error - Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
+     it focused on creating a new issue. This is a misinterpretation of the instructions, 
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The planning and thought history
+   - The action history
+   - The current HTML or AX Tree observation
+   - The user goal
+
+2. Decide if the failure is:
+   - An Agent Error (which subcategory/subcategories),
+   - A Language Model Error (which subcategory/subcategories),
+   - A Benchmark/Environment Error (which subcategory/subcategories),
+   - Or a combination thereof (multi-label if needed).
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
+
+Output Format Example:
+{
+  "errorCategory": ["Agent Error - Navigation & Planning"],
+  "explanation": "The agent opened the wrong GitLab page and never recovered..."
+}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Overall goal: {goal}
+
+LLM Plan and thought history: {plan}
+
+Current Observation: {current_observation}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+"""
+
 
 def _diff(past_obs, current_obs):
     """TODO: Implement the diff function.
@@ -111,8 +261,24 @@ class EpisodeAnalysis:
 @dataclass
 class EpisodeSummarizer:
 
-    cange_summarizer: ChangeSummarizer = None
+    change_summarizer: ChangeSummarizer = None
 
     def summarize(episode: list[StepInfo]) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
         pass
+
+
+@dataclass
+class EpisodeErrorSummarizer(EpisodeSummarizer):
+
+    change_summarizer: ChangeSummarizer = None
+
+    def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan):
+        """TODO: Implement the prompt."""
+        return ERROR_CLASSIFICATION_PROMPT.format(
+            goal=goal,
+            plan=plan,
+            current_observation=current_observation,
+            historical_summaries=historical_summaries,
+            action_history=action_history,
+        )

From 5cb6cc210d28871121ae70ff76d3fa0026abfdbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A9o=20Boisvert?= <leo.boisvert@hotmail.ca>
Date: Tue, 21 Jan 2025 10:52:43 -0500
Subject: [PATCH 04/25] Fix typo

---
 src/agentlab/analyze/error_analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py
index 07406a43..5a36db58 100644
--- a/src/agentlab/analyze/error_analysis.py
+++ b/src/agentlab/analyze/error_analysis.py
@@ -133,7 +133,7 @@
 FEW-SHOT CLASSIFICATION EXAMPLES
 --------------------------------------------------------------------------------
 
-1) EXAMPLE A (Benchmarl Error - Benchmark Design Error)
+1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
    • Context: The agent correctly finds a cheaper product meeting the user's criteria, 
      but the benchmark expects a more expensive product and marks the solution as wrong.
    • Classification: ["Benchmark Design Error"]

From 9f531cc930f1674dd766368784731153220e2362 Mon Sep 17 00:00:00 2001
From: Megh Thakkar <Megh-Thakkar@users.noreply.github.com>
Date: Tue, 21 Jan 2025 15:09:20 -0500
Subject: [PATCH 05/25] Update error_analysis.py: add notes on goal handling

---
 src/agentlab/analyze/error_analysis.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis.py
index 5a36db58..8b7b0154 100644
--- a/src/agentlab/analyze/error_analysis.py
+++ b/src/agentlab/analyze/error_analysis.py
@@ -226,7 +226,9 @@ def summarize(
         """Produces, a summary of the effect of an action."""
         past_obs_message = self.obs_formatter(past_obs)
         current_obs_message = self.obs_formatter(current_obs)
-        goal = past_obs["goal"]
+
+        goal = past_obs["goal"]    # Use goal object from agentlab
+        # Outsource everything to formatter
         plan = past_obs["plan"]
         if self.use_diff:
             current_obs_message = _diff(past_obs_message, current_obs_message)

From 31e5bf55c49f04bc618f30e4f3792b56c3a985fa Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Wed, 22 Jan 2025 12:41:51 -0500
Subject: [PATCH 06/25] added pipeline and tests

---
 .../agents/generic_agent/agent_configs.py     |   2 +-
 .../analyze/error_analysis/__init__.py        |   0
 .../analyze/error_analysis/pipeline.py        |  74 +++++
 .../summarizer.py}                            |   3 +-
 tests/analyze/error_analysis/test_pipeline.py |  85 ++++++
 .../exp_args.pkl                              | Bin 0 -> 2288 bytes
 .../goal_object.pkl.gz                        | Bin 0 -> 102 bytes
 .../package_versions.txt                      | 287 ++++++++++++++++++
 .../step_0.pkl.gz                             | Bin 0 -> 7793 bytes
 .../step_1.pkl.gz                             | Bin 0 -> 7916 bytes
 .../step_2.pkl.gz                             | Bin 0 -> 7953 bytes
 .../step_3.pkl.gz                             | Bin 0 -> 5672 bytes
 .../summary_info.json                         |  44 +++
 .../exp_args.pkl                              | Bin 0 -> 2286 bytes
 .../goal_object.pkl.gz                        | Bin 0 -> 97 bytes
 .../package_versions.txt                      | 287 ++++++++++++++++++
 .../step_0.pkl.gz                             | Bin 0 -> 7728 bytes
 .../step_1.pkl.gz                             | Bin 0 -> 7861 bytes
 .../step_2.pkl.gz                             | Bin 0 -> 5613 bytes
 .../summary_info.json                         |  44 +++
 .../exp_args.pkl                              | Bin 0 -> 2276 bytes
 .../goal_object.pkl.gz                        | Bin 0 -> 106 bytes
 .../package_versions.txt                      | 287 ++++++++++++++++++
 .../step_0.pkl.gz                             | Bin 0 -> 8014 bytes
 .../step_1.pkl.gz                             | Bin 0 -> 4893 bytes
 .../summary_info.json                         |  44 +++
 .../exp_args.pkl                              | Bin 0 -> 2276 bytes
 .../goal_object.pkl.gz                        | Bin 0 -> 106 bytes
 .../package_versions.txt                      | 287 ++++++++++++++++++
 .../step_0.pkl.gz                             | Bin 0 -> 8000 bytes
 .../step_1.pkl.gz                             | Bin 0 -> 4879 bytes
 .../summary_info.json                         |  44 +++
 .../error_report_trial_1_of_3.md              |   0
 .../error_analysis/result_df_trial_1_of_3.csv |   5 +
 tests/data/error_analysis/study.pkl.gz        | Bin 0 -> 3761 bytes
 .../summary_df_trial_1_of_3.csv               |   2 +
 36 files changed, 1493 insertions(+), 2 deletions(-)
 create mode 100644 src/agentlab/analyze/error_analysis/__init__.py
 create mode 100644 src/agentlab/analyze/error_analysis/pipeline.py
 rename src/agentlab/analyze/{error_analysis.py => error_analysis/summarizer.py} (99%)
 create mode 100644 tests/analyze/error_analysis/test_pipeline.py
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json
 create mode 100644 tests/data/error_analysis/error_report_trial_1_of_3.md
 create mode 100644 tests/data/error_analysis/result_df_trial_1_of_3.csv
 create mode 100644 tests/data/error_analysis/study.pkl.gz
 create mode 100644 tests/data/error_analysis/summary_df_trial_1_of_3.csv

diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
index 86f617da..9089fcaf 100644
--- a/src/agentlab/agents/generic_agent/agent_configs.py
+++ b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -257,7 +257,7 @@
 )
 
 AGENT_4o_MINI = GenericAgentArgs(
-    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],
+    chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],
     flags=FLAGS_GPT_4o,
 )
 
diff --git a/src/agentlab/analyze/error_analysis/__init__.py b/src/agentlab/analyze/error_analysis/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
new file mode 100644
index 00000000..53021297
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -0,0 +1,74 @@
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Generator
+
+from bgym import ExpResult
+
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+from .summarizer import ChangeSummarizer, EpisodeSummarizer
+
+
+@dataclass
+class Analyzer:
+    prompt: str
+    llm = None
+
+    def __call__(self, *args, **kwds):
+        return "analysis"
+
+
+@dataclass
+class ErrorAnalysisPipeline:
+    exp_dir: Path
+    filter: str = None
+    step_summarizer: ChangeSummarizer = None
+    episode_summarizer: EpisodeSummarizer = None
+    analyzer: Analyzer = None
+
+    def filter_exp_results(self) -> Generator[ExpResult, None, None]:
+        # TODO:(thibault) improve filtering
+        exp_results = yield_all_exp_results(self.exp_dir)
+        for exp_result in exp_results:
+            if self.filter is None or self.filter in str(exp_result.exp_dir):
+                yield exp_result
+
+    def run_analysis(self):
+        filtered_results = self.filter_exp_results()
+
+        for exp_result in filtered_results:
+            step_analysis = self.analyze_step(exp_result)
+            episode_analysis = self.analyze_episode(exp_result, step_analysis)
+            error_analysis = self.analyze_errors(exp_result, episode_analysis, step_analysis)
+            self.save_analysis(exp_result, error_analysis)
+
+    def analyze_step(self, exp_result: ExpResult) -> list[str]:
+        step_summaries = []  # type: list[str]
+        # this assumes that there is always an extra step at the end of the episode
+        # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
+        # TODO:(thibault) make some checks
+        for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
+            step_summaries.append(
+                self.step_summarizer.summarize(step, step.action, next_step, step_summaries)
+            )
+        return step_summaries
+
+    def analyze_episode(self, exp_result: ExpResult, step_analysis: list[str]) -> str:
+        episode_summary = self.episode_summarizer.summarize(exp_result, step_analysis)
+        return episode_summary
+
+    def analyze_errors(
+        self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
+    ) -> str:
+        error_analysis = self.analyzer(exp_result, episode_analysis, step_analysis)
+        return error_analysis
+
+    def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
+        """Save the analysis to json"""
+        analysis_path = exp_result.exp_dir / "error_analysis.json"
+        if not exists_ok and analysis_path.exists():
+            raise FileExistsError(f"{analysis_path} already exists")
+        with analysis_path.open("w") as f:
+            json.dump(error_analysis, f)
diff --git a/src/agentlab/analyze/error_analysis.py b/src/agentlab/analyze/error_analysis/summarizer.py
similarity index 99%
rename from src/agentlab/analyze/error_analysis.py
rename to src/agentlab/analyze/error_analysis/summarizer.py
index 8b7b0154..b3760216 100644
--- a/src/agentlab/analyze/error_analysis.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -1,4 +1,5 @@
 from dataclasses import dataclass
+
 from bgym import StepInfo
 
 CHANGE_SUMMARIZER_PROMPT = """
@@ -227,7 +228,7 @@ def summarize(
         past_obs_message = self.obs_formatter(past_obs)
         current_obs_message = self.obs_formatter(current_obs)
 
-        goal = past_obs["goal"]    # Use goal object from agentlab
+        goal = past_obs["goal"]  # Use goal object from agentlab
         # Outsource everything to formatter
         plan = past_obs["plan"]
         if self.use_diff:
diff --git a/tests/analyze/error_analysis/test_pipeline.py b/tests/analyze/error_analysis/test_pipeline.py
new file mode 100644
index 00000000..f9570c2b
--- /dev/null
+++ b/tests/analyze/error_analysis/test_pipeline.py
@@ -0,0 +1,85 @@
+from pathlib import Path
+
+import pytest
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.pipeline import ErrorAnalysisPipeline
+
+exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis"
+
+
+class MockStepSummarizer:
+    def summarize(
+        self, step: StepInfo, action: str, next_step: StepInfo, step_summaries: list[str]
+    ) -> str:
+        return f"Agent took action {action} at step {len(step_summaries)}"
+
+
+class MockEpisodeSummarizer:
+    def summarize(self, exp_result: ExpResult, step_analysis: list[str]) -> str:
+        return f"Agent did actions {', '.join(step.action for step in exp_result.steps_info if step.action)}"
+
+
+class MockAnalyzer:
+    def __call__(
+        self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
+    ) -> str:
+        return {"error": "analysis", "episode": episode_analysis}
+
+
+@pytest.fixture(scope="module")
+def pipeline() -> ErrorAnalysisPipeline:
+    return ErrorAnalysisPipeline(
+        exp_dir=exp_dir,
+        filter=None,
+        episode_summarizer=MockEpisodeSummarizer(),
+        step_summarizer=MockStepSummarizer(),
+        analyzer=MockAnalyzer(),
+    )
+
+
+def test_yield_no_filter(pipeline: ErrorAnalysisPipeline):
+    assert len(list(pipeline.filter_exp_results())) == 4
+
+
+def test_yield_with_filter(pipeline: ErrorAnalysisPipeline):
+    pattern = "click-dialog"
+    pipeline.filter = pattern
+    assert len(list(pipeline.filter_exp_results())) == 2
+    pipeline.filter = None
+
+
+def test_analyze_step(pipeline: ErrorAnalysisPipeline):
+    exp_result = next(pipeline.filter_exp_results())
+    step_analysis = pipeline.analyze_step(exp_result)
+
+    assert len(exp_result.steps_info) == len(step_analysis) + 1
+    assert step_analysis[0] == f"Agent took action {exp_result.steps_info[0].action} at step 0"
+
+
+def test_analyze_episode(pipeline: ErrorAnalysisPipeline):
+    exp_result = next(pipeline.filter_exp_results())
+    step_analysis = pipeline.analyze_step(exp_result)
+    episode_analysis = pipeline.analyze_episode(exp_result, step_analysis)
+
+    for step_info in exp_result.steps_info:
+        if step_info.action:
+            assert step_info.action in episode_analysis
+
+
+def test_save_analysis(pipeline: ErrorAnalysisPipeline):
+    exp_result = next(pipeline.filter_exp_results())
+    step_analysis = pipeline.analyze_step(exp_result)
+    episode_analysis = pipeline.analyze_episode(exp_result, step_analysis)
+    error_analysis = pipeline.analyze_errors(exp_result, episode_analysis, step_analysis)
+
+    pipeline.save_analysis(exp_result, error_analysis, exists_ok=False)
+
+    assert (exp_result.exp_dir / "error_analysis.json").exists()
+
+    # remove the file
+    (exp_result.exp_dir / "error_analysis.json").unlink()
+
+
+if __name__ == "__main__":
+    test_yield_with_filter()
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..b2856641cf90decf74c96ba05bcd59b7a81c860a
GIT binary patch
literal 2288
zcmbVO%WfP+6t(3zW6#8o1m_{~kPsjxLOsk(oY#t2hy-C`qT~Zg)!jAIrRnZ!RaMUe
z2?-WRu~fSh3lKlTXYdg$*z*CLTRk2>NLb*7-F55MeV%)+f4KDXpBpRT$GsbCH8nzy
z=0#V`DxqZ|N^82gQk8eDFK5+(9vT1I-}%vh?{B*{AK{*H61q>rA?I;7e3&loU?E}Q
zc>d(w?$hS>fGoVxYRL=X-L12F(WtV~zKRN2O7C`(j9XTyROE{gt}jE#^P(HBc?Hew
zgYRlBo{{DSSIz6jRp*@b-Ga}URhH-YJr<?$U$6YVy5fRdmQ`)f%eP6t6w?#wvz5oL
zV?}`~atrVN>F2-yLO;K{?TIu}mCV#trL^_0+)gU0T+ON4tO@4EnP8$pZ?GTr9z^|z
zzK-4cFy~b8!7|(M@t*}}l~zS%y}P!k=ksuW5-ctIxu#;+{qUmQbdH$N$i{{&N8W9G
z8=DJ{H)<nTW{Vv8bm*Jvr-i^VTv-m|$L<}v8ma_;QYLaiIYVWe#6oMOS+0=X!bT>I
zReFwNcvo>_&uSgJYr#`ush+3GS4nC!S&o;&<0gt!4u^<hlLBRSUIAjAO4lWV39&bU
zm;`~BCQ}x=Eiz_QLBWfe)jY9`IvD5h!AFSg`iVHnWisZc7u(AVrF4qgu0P?qfzS7b
zGPgo6?JNpmm@4x*9POz}lw5{;WR?ocW$xW2UR&ke-s1IC+Kk0AJ$bdWkR#ZI2ynTK
zcu^gnedIa^;qyY{+KmD;0{HmkEff%oGRX?A$K9Ay0yn;rqwE_o!9)d{FU5r^ZkalU
zMdaL}ah<A|@jUlG(Yem8Tts#3E*lG$phE9zVfCDatcFS_1f9)P6e<k4cWVm0Lq@x~
zDo3aZx+JTR8#*J)R8C}CV_wCK7ga6@ZCk+?Yn&N><Tj;Y3x`KA6_8owyd+)fGfY~U
zNThh|t}IbRn1AKkbS@AN0Erd_byWf?7SL^cLMzCD^#&-X@=ekQBvBH88{H+C5J$n*
zRU(gOyq5g&_&3*yp@Ei5Rnwzu0ZmMnuw93$Q${y=nzBL~Q*li*RS07+!SrA&a2U{?
zFln_v+;eT>4mQ%Nt~}+xNgAQ-{_qmjLzI(-OtTdbemObMmz$QG@q(dlE!x@*ppTMF
zB;#1k;Mg^4Mu?Pr1|P<X%J{=|w~+~+qB#`b6)h6X1yG&}g)&^9a%n*hnxR(uBokb0
zOr)3wt~_!rldCBZfz-E%9FQwI%B_H)6XF|B!$zX~T3BN&l^WZt;x@}=jOd;#BWKTX
z#3sA#RzJgBWr~)~Dh6Qpg#Az!1E3<E1Mu&?sMn7M1J>_Hy}Qxi5yRf1?H07p?xjeT
zVLt)zk(K2<vckaZ;hSu{ZMR`~(a|t8|Eew<^t|;iK*5w+vn(F`UqkN70*9%HYl3(S
z6wC@UqFSP;$ru#QF^ETQ9gPye(3;x<L60yIET{4Jhi$iaPJ(Wvs<!^{19zEXl&N<g
z_a8nO_J*h-AME#{{r;mQ;`i@~sMovqkoWh8_wPP}S{r*%lqu1mzvFfcpP;v=T#QZ9
qhyodh0;JS<w?Tg_{#!Zw`PzTcZvC}-qqFw@>K`}3JTE{V-G2ZBD2>?w

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..482f9b3d0d7d72ae173eff5c188db4215133df8c
GIT binary patch
literal 102
zcmV-s0Ga<EiwFqm7LjKH|7UMuY+r9;YGq?|E^upX0Bc~GYQq2lu~TZNX!Nj@R2HO8
z0aB?IB~vn(dlZ9Hb5fH_6oLb@opcmR(-ji)QWTPNGLy3vf=iQfGfVWQlqTu{0N&8w
Ijz<6h0R6@&FaQ7m

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..00267af037282ad54f4e4e24768579021c12c0e1
GIT binary patch
literal 7793
zcmV-%9**H3iwFqm7LjKH|8sO@a9=PkaBFM;?L7%>9M^RuMT+E*I_=m_9LHlS_AVoG
zNiJ`S)JaOBMx-2(mSw2KVs`hFJImRb^&F%&j?koOU<sH4Mm<H5rU(iYj!_^mjKD<@
zCoOuQKzawg(4<Wpq$yeyC{Pqmi=qhH_x>?EGrJ_0v{KnMwOa1X|L?v3{`>Fx|IhK6
z)?fJY&ot40-s4%z+_Z_cRw<>(?J}{n5-}V*rJH8i_d1pwQl2+fP5)KD_p|;TKk2pF
zP~<C3-T^af(~?8Eg6d>S#J1HnV*4w8lEPW0PJFLZwQbFIRKtN|2Z+OQsq9}m_njZk
zy?y)(1+SeW`reJp+Eenptd``8>00u0a42y5pggA<GVr%*S|xdtWMxOS*KHY25j-V7
z4@^*)dAUIJG8W34no~IAyTDGN%WHL^n||RCYx<!R?-UMuPc0Fh<Q#eFlk4aEWOq$g
zjl7)GwcNVA<Yr5nlk#2SwXT_}?iY@GU3ic&vqix17ri59nHZTe_B!KQI(FT=va0E1
zYG7cXV3x>$Gf>hDZPU#5mvz_1uP7KOI3+zep-uMZ3M99lHE$C;g|)uw9SgPKXpT<B
z>wV|~<?uoPE-!;D5M0eU+6JKr8O_L(+W>61rE(>eGcA%Txw@mNmZesFuRCLroN3sO
z<w9p(^}RL<=zHx(o`Jjyqx^aQLUY!C*}v{5m1d>+^M2t;uPyIX$^=nbv<d&-T~>R*
zYo7KO7Z<Pm5B#HllqSc|-n-Dz#Q*tl>Ky<<`GrS4DQ}j9BvD;D^JdPass}y9f-X_;
z-1V!dZf&&xzS8WqXH}c<(j(9X2Y{}s#+r*-L|gW{iIFq&nz6R1mH?;4>nOWf5ZZj6
z19aFG(Bo2Y-o9!9B%DG&+6*&Km}+;GRST%kQ&ECq^M1jnML7VUHu|@b&qwIrC*a!x
zpQqq+f`1b5=~X@fpI-Q+;Df(u_~7pteDHS)KD`tcKj+|kLV1dK9mwl44hL|jbSe^#
z5;W3Go9p05&9US7SNyG4+uY!9m>PVDci1kdd2{o+X*!falxLg9CF!+osJe?>yQ~j%
zps)|EAk>4(5#^Zj5PXj*kHGga{>9^`>wL#4<kR%;G37LT&qcnURF>iQr*`~h-_LS7
z$B5U7tli+8CabC)Qu5>Mr&&3!JStAakMi+&Nxh#j_A@08W&kk?aWV3<5NF|5ydWPh
z(BlP8yx`Me+T!PB<s$HMmw1P=YHpnv`9%;17zvJ33ye~$(hhosTGyp?D~FWBFm6Xd
za>rrp9#%dC<Mk+v+lN8<A6GsK%KtItNks<L?*ZlgIB3b!$}^zINvi%Ql|H2(lz%`O
zRE9wLhm{d!6qJ8lnNTJ{IH#0pkj!c2jB*x6^t^Hb^ki0<0}d|{Pf{Jn(z33@;?Ih|
zLU|P29abd5&LueS;OEtnhl3J<8RnV6JQ>2m+9#kNtfnV|ouqwYHOUapGtfs?dNNYl
z&r4I0(m`H26e%6%r6V;xm<aJ8%1*`jV$wJh!bK^v(%DGqC0=?tQhJ4#Dv{DfUV1Gg
zWjwzgoIkES2XHrN`SZ$6sCxnGI!KtSl^Av=hOLIMvo+WxhP@WU)<W3D8thUG+l*nq
z7{az{u-zE;W(>O$!v0hZ_MON`V#!ZOr28|<=aipi6qreNDxVLC;0Jt^LvB0FM`5CK
zam6oOMz<<yM%j&4BtiEEK#RteQv`vMz!m=x*XyPTG^-$aKtT-KJ8YI6%`{k|6e@q1
zN8VW;rdf=Pa!|I3n>U5`uT~so74vz((^fS|%OcfXnoTntHLDZeUJ^ns`Uvz7=o`>4
zpiekQZ0~5!wJ=IALDra~i9v)~r)g<xnh`H|y@R?6Ng_`_(9s8Bp2l~K&`<N&+?aDZ
z!yFUw4%$vd54)nhet=Zgbd5YGg^V>VywkjJwwYAVls~I`qWntaIL(jKaQt(GdGHbC
zmz4}ZK6UBrl~-t<rgYJwgV4K&l|%IS39O!yL)DkgDjfQhA~0H%#Y<<O{c)7Yl@JlA
zp8ZsW$S3cx)5Q=G6^h@E5?Kimf$F(;B1EoFu~Rif!~r55Q6l9K5vZPhCCbTNcIt$P
zynN~Gg`-g-p9>Lz>SxYIh|F^6mqSEe4IO{!(%I+#o`F2ciF_Hbev=g)35E+k1m+!R
z*;^$ZTmDgQ@w9>G0+BO4a^^(NK3&Z_An+6KC``R_>ySiWKuf>v*dbeN2D^jgSTme!
zR-RZkx0>d?e~8y*uCCg|xek6NFrE9X<6yLL<qnISD=|dFxc1gUsSqXO>RAirglLtw
zgnAibdtAZdP*IF8L4<sgFc;Bf=U}d_*Ol8=o?;8ZH3fphV<EUW8#^=jUWU4*JT3$W
z({6|x`+{pk<65eMb6L)D9+w{2G?_)xYW&2c8>a&}6~Ji#OQ)Y8g&*)Z)>uGkEMhbk
z5*qXQyLaj_4^F%`aGRDBmS+3|QC%m-np5Bt-eySfs^1ps4-bNr4>7~OM&05~T<(aJ
zynrR$JPmtZ%L6@7RqqflxlFXRg5!fCyQ<f2!~DotqiTL%*QmByjPklSVXkj(+WkZ6
z(X{W$BNKDuLqoF{N8#`A`N6^Qi|~74bo@eke01*o_^j`Bo)AkuzSqi#w_aXexax-&
zDKi_Itz|V`b1E52h0N6VnumPvKvv7c(V*`=tXi7dkBg6a0@BiG_Ue00zIT+?L`okG
z5eA87%+*XuwbpHHdG6e-U-X-sn!L6XG!T2ulJB)G&o5t{gX1)>>^PA#ON{A5GRyl4
zggCYgG_KAsD!`jXpes_f4r=HMn%9AJ2XZ#HSFp$`@R&*s;8Gp^lcF`v!@Q*vqOSO&
zORP%2t5uIuFiISJgoC%rSC)W@gG(2#&p*2iY}i)L_l`swC?dirM|fCN<%d%n!;~Ve
zALI2OqV=VcX$-hp#&tB^W&q<H;AAa>&_7}ss?Lk}4?~;aXgxdii$YIw=oSe_VQKjV
z9EFylL0<eAo9@Uj$(v{iQ*wH+eEXDqM?#<>26XecTbr7s4b_r!7Ev7%(%Bh#)irXc
z6v@1{(fg7l%M^}7%&LwGFfYlDNzGtNe(6phTPVhbD|<@5wIbg+B>`F(B?XhF5)!Av
zwp1gpCDVQKc<(6yTs5sEwxP`op3<gk(N1Y6Pxi{liQvV6rhr%={54__umQ3}Z5t?0
zP{IscSF=P1Xy{H#EtiRrPbO$tqL;$Z<DiBLFVUOAb(CZRwKLHtCj?VLg_dW<iIW;{
zijuiCxB~xEngRTv#mXmVIJ*oe%tO{n^in~^M8ze0OIyLgP<5N*YOBdWdaKF7zM)=u
zS{_V|3ejvw#<5&ZB?eU;xm4M;3rWT-@6sI!zwpN*j%yha%u^E7pm;&{&sru-P)lLm
zVN78?bQy&|oU-Y}Ihi;?6)i#AKzX2pOD4E1>gZZrMX6o}z%PP7N1!b_RWvmti9<!3
z0}&#8iUl2cW@bhn1WpB!gkd@bP@u#q+VwEc)C^(t8MAz|Bq5(<s3+mDX2XQ{C``q6
zvsuRgG9aM@1qRltGOMaMax=&VVK>&nkTpA#V`~>N!83K`I?6$WR#>R2Va%DCE%oV)
zJd_%gr{sS4scMSMj2m=&uB_Q+o+K@KppF;lgSJkC+!6CmFV!O!Awep-spf0dh|;CW
zgrH2(AXc2eHv0nlH4%dj%+BAy-53agt(%rw#ySx-4ge)^m~<quJO~deC>Rz9_uoc4
z1E5shutqrkwyWpP&t1h5;pyAv7oU9&VoHbTCxn+=VK7+M_gXNzwhRdu--;V-P;mY_
zmIkgoExb1%LlAFCb4Ts9U!9%2cy&3CW$6HVxC))d8qt;Z8_O3a)~;i>)1v|OSQUC0
zYXtA@&fIgA>t$poJrO`pR-s3rW>|36;;!MTV60_WuygSIbIZ%u79qeg^@hIZbQL4;
zQ5!eb@+t%{HOKOdmSMsBL1rc(F9`w}CVA>@$0DS<7bf<f_&(}EgRQw2U!lxvYX+!A
z-oM8;Sm{<P%O<*EHco@>b-ME^hN261V4f%0b2#3vC*=wjq<z;pa`C^AY!tQl-%zH!
z;(D#<6{}E(A^Jm@AbA#W7~GZ#V7pD%<3vX|3CM-Ot>x#yBxugm8LZqxZbB0@8c`s7
z%U0;lEyD|e6+2p>K&rwv3I-{dTPBtbiDBNq>u$*t?bhBp?RwpYid(R3Ez%demfLX1
zA&N)g<gQC_)zmV9O-HMWt_cA<S0OvL>6&Pi!{b>KMol%YS<k@)Y*jO$dlcfX>jv2E
z6kF6ZgiFx8H9bF1mmiB?gi_Tz%6EchuPyM+Af=_cd)h#nPg(jA(4qv3|AB3Jhq+o`
zC#r24d<&Q{e#kYbauVAK#2>KV&e;!mR*U}`u4gvFhOE^v4H${ykHY}<6u-&!I{3aX
zH@o7~zzK-pH55TH0VhgpwI8pEe%#x-@phW4X}Lz)zv<GRZ?A#Wn-p6qUt60OQ?Z|b
zT?xG=Zt-y@o3R!#q+D39SA})JI~1B|VVJ_$Rh8iD`w|oqe61?MWbrF_@&^1NOpebK
zuOcc2_p0!|vM)R#yw^kOM+KG?0vibl?2Uba)e%^-_$How75}_dRk=?H366#&_~yO@
zg#_OSNsul_(B%f|->98Z-J{~82{p~3X<|(CU@qi7A>^@;kiWWxkn0H0>%0n)`#dCg
z>y3!J)R4HpP!)Fo*}>Nq?M9L$N&!OhmI7m)KhHZq9F7QYHK1V!VRRhGt{doaXCplh
za`XQ)C99$P2-UPW*si1=^upam(#c1tnryaf223Bcc)Hl;n2`MOhT7;VjUP%f_mrZN
z8oi(sL!LUc3GE1lJhtrDuxX$)mvI5N0Yxdie?8Hg5l@sJVsmHiVfuxK8IOo@$p3%n
zVcv{;m_gC`$w=qZ`#Q9P4h^$a7Oq1>T#R6xVKEMWAarO%bUq#F{K&oz?Vv-L_@u{#
z4vh*iM&0@72SSG?MCV5%ogdrRp&fK+oUPSz9hwwk9E*r?`~#sw)1vbek<L%->(CB5
zG|5)=xem<;F-B8@$q$4MofSFtP`LBy#&c?E$x$6G*QgSGX)m*{cRl-wN#U)y6lTMH
zub&8qXPecnt_FI1xS<}iU9kr@+iXCbj767COuH#X++Zxj{LMQS-s%XhGv|YbyH-~<
z+wr|_-s~dZ)j}9HC*c>kibhno?v$6!QQseP*=n17Lpg@q0j%3#J7mjdrO*h+a08JR
zzjr`b2%p@2|3G!DR57Jy)8w$qr^G}k5<JrB2W$8dGuWfq^S;N7zE%PQ;eHQ42%`_S
zw%^lsA8=XM^o!p;(2!ydF~zJB+c)t&ns_})23DSq>Udh@Hqr1t@?fnzuJ?O1^r{4E
zZ9OK3;c)LqABe4s3sc_Pwl27QSGGRNH8dc$Z|i%s^?H&FY<(oE<72|sM<TX9_JD0Y
zruTcb^*{o(wjPtic)0iD55(5RWo7SeTNhluD_fuB8X6GWxAi^RdOb-7wmufs@hM^J
zkv*LB!~?eVnBMQz)&mLD+Imb5r^3CTd=R#d7lpp3%^VqgR~A0Q^peuqH|{+ecT9+Z
zT~9>y`LtM$i|nfnKj7uKkdE)sq-novtvS~FGcK{c$AEQ9%@C&Ina~xm*v8MZwmhkN
z@0Wk+%)jv`ShZ%+wj=jydY!r8`2qL5?#P{8_gs2OC(^yr*3@)KJe{bhz}aGPp$$d8
zh<{yrIb%bBuQdwzdZU0}X%z5{MgiY!6!5E!0>0HK;MW=j{CcB+-wXo^RiJ<oMTjE(
z)=o(D=H4jI?>2ywj^X^?4mj-g^eE1^w#5;*u}5+KU<aJYBONH}U4i=TZBfPD@gbbM
zZt>5z!-?O%8|CybcHlJf+=wWrf3*XrSL$&3*V}U%y@fc+>34SEH1cqYD5rn31E(u>
zIQ`!CoJMbBj&k~UJ8&9#0!Eb6@9)5Axelj4*g>X|TdAX*{=*KOMjo>f<@6tS;Pi8K
zIQ^&XIgQ@V9p&_&ci{BZIynEj15V_b9}!Mn?;(CyMiBY3;^KI`r`~ol{01U+@6fSO
zQHEHSY4M6~$Pmj&1|P8K7d}>a3ZL9pI1zqsU(#z?4Lz-|(6{BY`u_ImZ=Ipf>TAbm
z^f{1J+<c|Gsc9N-0+fNlnV!KOnTBGW%=Gk}mgMOIQS+>72+!qdn)|VmG%b<q-Y|op
z0~D5G>6)?5kcN9?ydip~Cw@P4j|}PXOi%S@>K+;IewyhKw^H{E#7K?ANo{lcbS-s8
zjomWN=#BQswv~%<R<v2a2N+}x(pU^Be2ew=nB!s0_|4bbqfW#cZQ!=-?Fmk9NicrH
zb{#q>Q?4U}OeMCsW1Au(bWQt1OuOX$Te$ac;oiT6d;b>h_v<a(5!X09fYB2VvSz-*
z9bGyWVdZJmMpg&05r1(>Ck_<`8)0B{{A=L`05Fa*$}sT8@EU|L+!%wSI$aA0v6uQS
z1`FXUn2Q5QEwnt%!T<xGh=F5at}rNqyciRpgV&qP^q5AN<ba@wR|9Zn1}HazwXDS(
znyiNx<~%*PE6&qHbvYl}E$4iDb6e)q=^dFL-W~HJb(v4^mU+Hkvpw^pJ2F4EJLbph
zGC#aq=J^8j_RLT0$o%B4m>(Xj%lyc0nRnN=<$HLj4p!9%gaGTQ)-=1(ibGJ<Qf(X3
z<k)i#>U4$0jez?);$Sg{GsGH-dMI%KwcsC5OVjkRRHr51llY?(=oIB^Elo|KCmqO%
zx)TnJT5QTCb>12pUy*~WVdX@kugZU)U{vY^xQUe@8psLsEp<WrWFCTM5;XZq0B&_<
zmLo#5>A}XDO%H{e<&I%bEel+1e7M2JN5YMB!`@Tl0v8(}Yq0V0aO2!C_tdz+#l|Na
zYkYVx+_<~8r*;7inj8jUbK6NUJ+Dg)Ci`W}HDLZupL%xlMk~&?qf_eI8TA$u>R>)C
z@`{EshsN@Tz!V$G85+$Pc9k#O|7=0z3Srl|t3=@brvafPU`wuscf-|viSME4z9m;<
zyW#4-1nyAS-jb`yU2t`OA~K7|A%?2q>?apYgHRd@q`Y1J<GbELi)^Y^9-kd4S?Xpc
zbm|bRq#3x^?BDY`Vdib*_{l->AfqnIR5CF%k?6$$yoQUG8onUL`{)*LlA$oZ*LiM@
zJ|#T-E}1r*w66O%o12@O=4FGx?9DM{HtUm}0+Fjvo1`y#=|NlVp%c~~cdbXx5-251
zP%!(YC_PLHJ>2o8<O}TS;CiJ`&cZA?OcT7844q`2JSES|o0_i6tD0pyd`28*BFuto
zV|$uo<2cGFjy}p3z#D?jRuqu)82<2JBIOr4yho_8KxtCCUcxV1CIU&%YBr=THVy(8
zpVtw7<xcU}+VEjHOZ2rS3;dNj6cXQkbCA96#P{0y4z7Rm5Xk%m)1@*{4VgA9J646t
z5eC83H=rVT(~gWUUCGO!xU@rzifpdZQ+(eHc3;9*?8q8_sSNOHr(|pcU&})`qCt)5
zOL-z#GH8nq<H#gv!!%qLJwr|q6zQhqIW-5EpjP;DAIYZUOnIkmfi#)ivjmAa0+*{L
zCC*+71TYrK5sh}xAe)qvl<zVbc3*Z&su34q;aKslHgWxec;Xhjf1&tIsJM9m<at7#
z5A|KTPBt{4P6fu4P^K3d#Kk77z!9h>l^wp{NTvM&{#P}oQ_v^dk;<k`?HzcA9E0~7
zpwt}hzw4S^U~f*sw-iyqW2fy+AS>iT2cc{cuxg1(p5$=fUd~oP6lKd?v(!?bTvFFT
zhAwCZ@?h$^$$G(fbExWu2U(I%oRBX;t5XuYgy9<?5h@n?r*OPZ;Z1U+sYyB^U(3Sa
zY_J}KT&jX1<ta)RS0ksS{d{OYAKK4{_Vc0pn-5V3bMEG{MaY!2XgXv{z6Fe|$ePXK
zj-i3wgMTg;)PBP);RB7JY4O?;S#`K~UegQ%-6D7}T{C#PLNH$Oon3I46qGfVlF#Xz
zYQ@IDMl%5Lvldweig3U&Y($~{Tn*eTV^M@u)6grixXL9E(14_d#hM;o1H-1SVfPFZ
z0*Xa2Qua}ZfKgf(P~s7vhDb@;Ut!r_VR;{}uuwa{$kb9O`jmt&Coz;xpc{}cYqn!r
z6|`VsQkaqi*(xT%n2^ZlScP3ybAi)XE`Zf=44XKZ&_U8k(@ikwsEJaOCdP(lnFjKa
zk{3){q|zk`^YL=UDF7w0nZ(mH^e<&yt!!HOV1LNL^WaC8(KcbK1&&e%kAh^VBe7T>
zFNK{+kaJS+W|{6<fjXbff(DKX8@9_Nhn_7T_#U&Go}mMhhKA(4wh9f<3>zYVjZJb7
z#U>LQs-~m#0o*Xna&lAEoJ`4{l2>&T9OjHXG&nfeE0s(bjZDehAj#Xju1~J;BT>z!
zo3-_QeAK7ps~QSdzm?F*Dh__4q~-HE-TS6;O5iRss}w}@S^XzN*W5JjH%+&E|0uGT
zM<nw^chpRkDm$-Ap2-FjV+yixYfo*#6vE8Y5DCK$6BFkas0wIgDxh&rrzG&zLcmn$
z@S3$i92E=1?@~bnk}F94)=ICmPAXYbwelH^UUm{rDW9PVRLdkz&3MHs%zi@sq3Hsz
zx`rWCm2ox>v5s3X#?~yFmAL`2K95n}G;&~UbGKEPXMxr8Rge*!_A~L!RHYGgedd<n
zy_bh>=@asjSt9)1wI#4uXlD(t3DLE6HW$P~i@fM+Z1z<<ZKir5!(329W+gDx+n^cX
z4ooXgS8u>hMKkCrffXIdQlC*!an9=kT{>eVNCA8ZuDO6$Kn(+rMS*bg3}OV?tddPB
zS5T1&=o94;EgEtoN7LC<6enxgDC8Pp0^qYKcv%Gi&@<CWIG7l%5hu3J!x^<~2gzIr
zx`NY87y_K+FWtCA-3z@NLSqepx5p)dvW!ZHo9za%kibHC{u)ZLc;;5ArdXS}Uc(k+
z@aihymP&74L;xtKf``hi3E>KQVVPw#bS4RuWj~iCVWPH${y2~<u2=db8-{|u`Irhv
zta2e+OadA9hP^)Y!<cZ<g@F19sL2v!mTYM$7z#`_QH+q4<-|m~nzYrMlHbIORWX=l
z!3<EnxN1w%WpM2nAaGqpRBph)hsXwt3Lu!!>r_N_b#X|d889TAsnC#MpMo_7Xo#&N
zxVZiUrUZR}7?&dvN<rL|Skn-Pr|Ht+X=xdMr~={|2316%hC*<3H6y3$Iag<K4EVr!
zXNPSDq;G+5qA^&80RU^CrCrN1RtUtroFh5-mT4Lo!TW4zV2Z+N6x8bK7+=Gvvr3kN
znFr@wnr4`%!3mL=2Li64U9mcoie~(%2&NIh0z)cQ7o;sn)*bWL*qDQ(&EXPIoiRt2
zSsh_@$a?`t%H)ebd`3ctAtfzru~6g?Kt}teBgby%Ks~TSBX`j<OEprQN2?Lbz|1#L
zca3~hh)U;2J?X)8x~ETKW*f61R3Wg4yh&{IHh9Sz!=kHS!73Igi6i{4D&;GNT7sFK
zu<bsvgzH!p;T@>O1x^?pN{>2>3khh2x@oTS$)E&LsSHX6?O+GD9e5r_kPd|B@aPe~
z7}<4RwMordWH()lyqSja6w>^_FQ0ECU)-#yN*9D{XX#tI9RBt)BLgaLRWiC>qVE^y
z=YDVpir1ZW!L{Uhxzfbnm71Zqs@eX6-{u`MT_;qeEPJgv$m{(}XNI0Xd-K<S^&75t
z2=72b{h}{u=htp`;+}7~Kw|eo(M^>*zIOy-FQm_q?CTlTT4O9r?3})y!RuNpJN+YO
zKfb8T_l}m-+rWgQ>VlOkP46@}vGNGlto$&R8)P%+OVe4e3+)%LO9jh@4t)I!`0tmz
z!#JBVjg0M<%ckY{cfBKd@YZI9Ub@bhc_hS4KN#vr|KNB(K5xc$v&G+UV>fbj8?3bq
zu2lO<Ck_ECz_-IIk9m)a^Lp3E!w@|r>cbbw!?1j~PC36)9$ywOYQ=T!3%_%~J1z=C
z7n3O8tF{o{nO}Iex%h|B2c^0AN0B>}{um3Jihlw>pbN!6#na{Dx9J@LeE7Xbf`;g(
zS+IGt@Rw~~HwLZ@zlt9u{-f{y%V$vU4l&qj#lO$}B38IwCpcBe2JmjX0u163;4fp!
z;DFn<t2=C+FwxAg7t&oi_njZkef?Wsyx|=af`Rx_Q+YI6X_i^t+j4u8_mNN?h<6o|
zBznsMzD2hR)xp|~V0N*Ngc^HW{!~V6AclaoSAN6IdYzacGZR>Jv#I|D$IJMZv4Q{q
D2|yQR

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..52c5209d9359c3ce77071a75d525ddca3a9945e0
GIT binary patch
literal 7916
zcmV<I9uwgoiwFqo7LjKH|8sO@a9=SlaBFM;?LAv;9M^RuMT+E*dfKs_IF83uY%U{m
zNiLtFL`jrLQPhZ}L(;ZfMcvNsUUFwSJF}Swsg0o`t?OC>rhrjTKHL;Rio*4e;TnN~
zB1X|RO$!A5X`3bu(i92O21U~VML&$b(4;-*K4xcTm*kR`D?6rE%bmIRoO91T_q^`C
z^Z0!87s}5!(0|^;8Oz+XiM3WPCdthbv9uyF96PC-X36*3mK{=>GgeLiWxwZF{9As)
zYqp`tR~o$oX2zx^hq8IqNf(K2t82veZ}<rcXPG+jy>`{MHQP}Q2aX*e4#&lkKRfl^
zAI-dd{FS`d$`O6<>C4(<@|>&|<+ABo@>6gqaC@OVs~R%!w`y8Nd6Q&hN43{&8BP&A
zDL(^DP?$M6PxKNN%A1;#Kj*u^PQJrycA=Yo{t#>Wz7y}{4||U-6P;uodHK`p7kXuP
zO;(MZoYl4Ly1eXWikg%3UE(#bnX2yRk9!?>kTx>~!0{KnBW8&h=@Rxj?OHl^-Mg}?
z>13k6zdvslNx#!y)C_IY%=DFX*T%0X=+8SvJvgCF_GR-VyPh#`5<7{tzUmzdwcu!u
zPQ~ke<RazpVgN3$fGiMP%{tl!p$KWs$dQ`>Y`DczIhi#rk}SHqqp6mqmVK`?ZIP^L
z*pB5wXI}Qb77FNltwxT4yzDCtFZdT5GyaSIlAll-mBttR{G(n=&MB7&qBLpa{+-*b
z_JG%T#$Q-ixbna7kN!~_96xjCVp{|M=fkOY00iadAM~W0Srn2)b!pF;S(mCF^biX=
zM8Q)_SAh|)h4vp6uQj9EgqI$HE;s;mRW;UJ)FRrl*GY`5nbVB51+@q`O<r5c&4AG6
zavY$|E`uHygY(u^3n1YX`q5&TIl@%Cqoi6ueU6F}6r1-8K26F2__WZ!^?W`?|2_iW
zCipxCpA-Dk4WAz6Q}F45PZB=(n}QGij=~3jC*adVVevBs-xJDX#A`!dS8zChJEdKb
zaFn2tM%r8(KWdB}$G_rlt=h&qf5X(^L%hRwUd@@COQz{i22q|Z8keNkvZ3lOa_zD{
z(1F4}G=oqNDo2!K%6;%XraS=OhxiwdAFuHpqmYl&zsHob@STc$Kdr35?~NUQ+4omD
zonypnN7kO^oF*!&9a8e+?59yVt~@AC#E<gHcuB3FQT8(-4kiIH3UMj&GaqN+Gw}iv
zFDS+f+<3vQFm3U3Te$?h+#%kfjGA31Ms5Mb0Y-x3)C8l{th9n&q1JUMoysBQFpS$#
zklb+?yZe=oz<50f<MvTd{)d&1gYthuc~p@>^}9fMKM7j$xbg%ja)PS=Nu^im1Lf~m
z29!Zi{vl;p83E-VQ^u83Ae__68Ia6b<(zUJM)ZPm5%gqQnE?)Gi6^O!V`&-JVe#jN
ze}nQ^;Z7p#T!Qm9eqJfLKPVBHVV)VxqaiG;eFXZ!YPurW3EC%ClL+BF0exhpCnKeO
zyfhgp9pI&dk<uYvI$YI*@emK9>~xGTCXI6;T$Ca!osN{w^3uzZ(kr}FiIgtz(#4RJ
z@q8&be^_}6;GU-C&nVZR?pdg7Q+|Oe83{A@i!ma<6eE%e5y>Kv2=;0WJ0HU?gs|7E
zu#FgYDTZx@u<a`BjTrXL820Bv*w0mA|8itNvE=g+AwRGDn(_jpz$~*}`SpMZe!x{Z
z<fg-173MpaZut4j=v^hvD7n#!1gKy?DASm7njlaDxZ)q;dYu%3CKn_JD2Sn)RBi9D
zS#mVfU@22jWqbGY7(C4bHj9=~CQ3GObEb#`m5QURVlD@G+NuT#TBN!|vuUoQW^}^a
zOF-~NZ-Jfyy##s)^bS+R_Ks#<3*+@N<d0dJB1EXQo0hhw8S!%0JE*IWF7iYKU49Vo
zX{5(k{Wy=(^|`5Y%taCJpzW0Puv6OW2S{a1*T`{F$XLVtJB{<_8%gC%`DV=%<;#)d
z6hBVE@oy03%ZHWUR?__V^z8X7FVT!m>7Ydip?CKyhv@MmSUn+!s%OtD9Qw2(Fq)Kw
z+4E2SG)m-3hzL|qe=b7g)3?~^LWqb8#cxN6+z1hY>X~;UM3yGlsTv~U0FkyRky3~V
zR8PMY<>WRybwWg5oIQW>Xq3qFAtF%y#Q6x3X%78jh{(&K<1fvgf93}a<WWxK_W<jU
zSkaMSxZp!z27<=DRSL4@ALTPVf#6v|<g6Gub0cTBDj5i*0K_{Ald$YMq|+DC(r-F;
z$QB#H?jTLp3}?~G5zFRQ)42B!@jA@aRhu|V;AaBUxyw2ZMjKb|u*kU*V_b}BZ#9$(
z@iMNS)lg1|R%uJ97a_{W6)X-F#rTqvrW%AUqQlO@tXr=sx2-(I7J_RC1c%2$aB((v
zX7K$CbzON_2o5IT5I6P(SC7UuRRrg<wBrIULa+%li=>tKiRm{^3~*9_69Sf4KSJ{V
zz~fkb0j0i(QC~=?&*$&nsmDAx@mj!bT25G+@ef3GofvCQo=<%nA;+tHo2WlL2vR=8
z411Bf#cR0m5h-~VOFDTP_KcPTdZ4P_AzpHsXlr@L2Ss*OuhoW`lCehB{I0H6ZM7`r
zb#B6}-`uqO22&#`-;;;OXT}Bxr!S4b-=PZw17nxq_u|Od#njly%!RRO-)lc17Jq!N
znGtWkyfS~)4=-1yH#A$zXu9T<)0PUEsqZxo`rd(zmV=`K-+M3w3qneptLdU@t=qnL
zziMe}A1+Yl2*^>Rd9Cj?_})=k6De)JJTo=z7rxNg(BQS4pn=$HlzgvcWp3r_3>>F;
zW!s6YS!7Hfl3CuDC&aO3pmB9>K>^+@0$q_Rbx=ds(!4gL+n=><?65^vfs<si9~bTD
zpCqkm9O5mV5Osymy2L8?xmx8Y38TcphdFq&d}SG!IJkUqY3|7tV8gbuzIP<jKmid(
zIKus+DmRqe7@`zu{V1>h2(2#`O{3q{(ypWFHUk*r04J*%g#HoBP<39!e;C?0N9)?D
zUljTjhi;N^6qZ+>#ZhP)9N@(du^Es2oV<yaFd?S~N;gl-w<H7_WI)%x*4)q_ZK#%<
zwTSAFkj_rZtFDp7p-<$rjh^QuS*CCtVp?@nfO$@KOlk%b@^iO(*^)6XVc8S%^&9f7
z(-NSCQIareDk5<zY)dtAS|ZgekM*1ez*W;qU>n-xz-jGFHQH(I<jEcxIT5@V&=e30
zgug}%0yaRFsci%0aY~qh>uQGR01e$qs-+S!a*1wQ*4;y4=y6a(h1cDa#1)l9H)>~h
zuiPz|3M#Z5EABq2@un!5>jO97e^N7mAGBEc<RoX80fl+USlvBTP%%+)$zIoP;9#h_
z&2hEWL?FG@#6a(0k9<ZRNR9~6Y)8hiTuvnhRT;Tt$+hzd#w_pBEeXHy$0Ckv84=9W
z64ao0LH18tCQML^VclU&VLfyig+G+E>BKqFeS#`lH*EvufetRw&1F$T*WxNl^)dkd
zEckN-+M-hhQ!^4cRJ1t|A;KqF(2*x6C*=X)R1oPlOeYTt)P0(EJ<KyTLl}L=EZ;Wi
zmQOO&lW<tIb;5fTreeF9jAH;9kWe=T2G%MvtEf0~Gr$I6H`c+BH9MVU%NH@hGj-)U
z%0YxySg5LD%*n|u_34~Em>iHN<UaVRXo}2?8+3c7q}gVUBrLhVh8O6Cw$6gw5%X3L
z)gu-mK`Oeb=Bm|*(xu6SpiI#qR$N$|eir?jh(QOY=bpxW83=*Ro0eL_IuSJv03~pk
zbR@An2oEYK7#0Zk-$FYBpj6$kMmYYKt5X+du40Ms^euA>Pd)`PrA_n`!b>(k5G?R}
zO&CL)2FHD`m5Ww1i<@pxbzuqX1E-!6ejLa^IB!aEm+iG)oj!Hx>PimFQUUZ(1v-T_
zqEoF;uUs5oTS5V(Mgr*33iJ@xh+echGf$P5O2|%XJb*q`fgXmMA;DRbyN0KNv8Exx
z&cO>$t*k6AK&WMk4pPYKD#qfYHg2}%R0w5imgN;qLxT5%%wRxr5`;8N_tcw?MM!0T
zOzcVVz14#TTZb>aM48vt3{a4qe}`|n((PB4S9HR>oQB;cx*ID7r2}_mo*|hj9C_E1
zvUv*<z$K1c_-`Z|MJ@aflqoN{UNic~3e;i910Z0K91A@RZp&1#)ut<Rq9dFHWJTcM
zax-8YG<#|fR`VfCp$Qp{Es(8c%5*oE;f25o?M+Z1RbcA{gJjGt5zB@|Fz4TPx8#X-
zYj2%)y-q{LZCSP|>5HArZ8+o*g=27X+a<VuYMH>Mqg6!Lh=A>@kSE)8bu`N1@r((h
zrW%XZQ!o`<)eJN{gt+ax0d_0J7BzL@5;Sy8&&|<=$if$)RP~PX-J$8ldA>oUG*xy~
z>qz)1OaBhED8a&iU|Zf{uGULLwM~O>3p2(Kxdv5EVmpEO1NK`v`vK2t;XlLm%tqLd
zH5;Y@BT@KS7@(HoH@IFK-wWnuS9k(A0TH~4A}A!_L}{+{<5kg*J6kv2O0zaCTTlDf
zT-x*PHIRCfLJQ?<Yx80%_8!=k&}-rrKF(w_)+C0M3+uItunu^KLh~~WSQy1B5`1l6
zf<l6?RwS4xypAVtz#qco_(WkIQ8CO{g!lTs@PzPQ3#lI!SV9PFI3%z)_61f$V2Q$;
zc=8qe^HxRWJ|!eL5|ZGX`w|oqd?O@5x-dZ(9;kn#c1m@RijSt&G@E8AGUmZt$h|_y
zqah)GWeXwK5TMt76(aWqNbuI{5qGgJalcX#cL3SOR~hYkk|at2Lh_aZW1YXiJ3kbT
z2yfM)VFzJ!9LTQg=y7{JJq~j7|2HM8qWd7#v^dzVq#pFb-9^&LN2ro)wrU1UAGCP7
z*y5Ow{PBib=z5JGN;3D9qLM1Tpc6x$I<yGw2!%Yh09dzapfp!-DYp(q$$xM?(VG!Z
zlp17nXYOJ8gohc6h;h*WIrK1Z#y!k{==`Zj=TrMSw1W-}u~inXLxWt5V4Ptw4t*$e
zXjpVU73uu&z7Fl6LzwuaMuiTI2r)+8`N)SthsH(cM<Sgc-PfTVbZCsN)p8v=CB!%y
z5#!j0LWj<X&W}erKfbR+JLu3UwyMu{Xi|tVni8D)Q0UNkky8(bJD;jQr<Rr-)zPx`
zD$$qrGW&Yhv!9q0-ik|MI^6f#iEwzfS=sceqsNEq>M`3DyLYqAI>gCXblJqTlTyU3
z#uCimykp_5j^LtmK4`dWbyc$+-|OVfF7RC~gkf_MeyOWyM0IOUdD$HGgE5zVH`^^0
z++goCg4+h&w!!X#{S3U$A<VH9QjQpEFdBvL9}vdH=Y>B!P#I=b%*NRaJ8UgUF^7u8
zomA@H`izJL?a^cKzQ_DUwFLUZ{T{j({vy~Fe_y+dfXkX*qwu{0bt&c`Q_KpnpDXWB
zpM_U9yB8C$CCR|bQ&AmHiL@x184TZBE062_9u2)BfofZi$zdqm`;mKM>*AW0_qVMJ
zF5i`{k8lkQi0#|@9&NprBm-L?j_UZRu=U}Ht&iSgTaW4eUTr;)K((#M<S-WQ{n$OR
zb#X=7``gw9m+#8fPjL+mi0#|@9&NprBm-L?jq3P>u=U9HPip)g+j>mz_iF2b1gdR4
zCWq7E-k-V`wvN}6zOT(38GKh3KFRcw(%CoeJsNjRh=E;?NA>xvSjmfQ$qn7(mAsIS
z@6n`bzpJe|*86iVvAu_YbxTbXrsC<)#k1JPZ?d*Lsd59EKYQ-q_ye(;Gick98$P}E
zZ150*J6>nxUa~tby&4qhF19o@%!-E{6`q8%g~Cz`ihLRWy7WrOx&U9T7x1-u0bj2d
z@Qr!_->et#EA;}tRWINl)C>64dIA413@B8A0!9=eiu5NtA<;X8qc~r$11A;3`STrc
z*gfu1oNsK4BkqNd;{3%9IFaW<P}JK3^{=)?6}Q-jaBjPWZ*7MYzXv$V>9=>_H1hC?
zD5u}qfzvBBIQ{PSoJQ|Mj&k~UJ8&9#!bOzRzu$q=8#Orn{`Q<k?}d(X`hy)fjXXvp
z%IObx;IveO(|_DSrjh%xqn!R|2Tmi;>WFgsPdjk>d<{<jd3#Qy_k>3|{qYW*zFY(6
zUw6QXJUAr6sq5XxcNl`mmtoh{kNbFIzU`#Ny+<81pe4Eo$Gdxc?^viVO)Sf__>D=O
zkSdmt9zM9y&wnES7(Nj&e<J)mz=YSd8hQ#~zIUr<0G|KGD~;#rGXPuh*?$gX7uUYn
z+0bwXZz+_4)5)%ZE}6z;olJIhot5M>d7|c6)gYeBXJ``0N>a2$u6ff9g7#BbiluAD
zIzt-jlJWNH$*%a_)m<{=#gko?TeG`lyn|}8OWc^<)gL1@94EES&Dhn{88vpBIioky
zCEHdu##zB;{qARwRY;>Tr0@;f+hdM}G2^##Z;v`2YqXA=ytgNKYD<Fg+rMkjL78$L
z8DJ{0#r@wD5uvNvA7t7k@81x<e?$2G4dMGYgnv+P2#>hNGyNDn@gQU7%G}YVVi8t8
zgWAaIAU5JJuI<F3!eGM;jE;XbTt5KDF-8~$-ez8f5QZCNa8#$O0U`EMzr|o7d<Bzo
z0I7zS&#*ASz#(MuaYWz<Ce~;310vpw;Q^9(`O0LMX@sfw3-Wk{0OzZpGA6jpSiH?s
zweZ57NDb_&6RE+PoDc4nbH3%dE%T|=j?543j``u5%%^tCJm16Fp81g-nIGL9^J6uc
zAKESRd{KIP=Erwr{?x9R9~!91{P1p>ch|P%duXr*R>d8J0Bfn%8Ft?mhoGvZ+BRg&
zvBxIV=n4xV0rxe;!NLz`h&2>-R^k9^!Bw7>&d}#wot1n~;!k3rvz0$?YG?>OI6>~N
zIrqQ-#-?FX`}M)G8**@Qt=!$+Tj9S~Fe<eJ-1v<kFv#8LTWW#^q<aH*){U7=H_)i8
z+HwNWd}^S+=2L^==DCB|f`g!8nu*0*7P#2>P@Rnrha2YxzNf|oE;c?|XX9hx#<_v+
zsd0geji0Ko@u7im<L=s?+66RdatMUYZK#{+dre|6^)Fej0aJSVShZ_vGfu#xv+U|g
z_7-#OV2UkLjk=PF`cjF&B<o5e>P;ecl|sCy34}-=!rpXOxxwAf3_^LqmRt?(hO4_0
z@<RcCORh$D!_{4h<)KKvC0D0*!PVUf(kw)W7^+0L_b$E$(KQrXdAt0_cfEra*;K8Z
zAMC<-lBI5@dFg^DIm9Yz25wCIcf59pX-1Zx9E>ko4h`U<rKGOm3a5rIr|~|%#anDB
zjPJEit<mR_hu>q<f)m?y-}~N5o>ZJLzD}kKZ4>%#%c>5(2IfxO@_Xr&c1|`3Oa~p4
z70XT)zPr{XX9ye;CQX<?Q>ZQ`sxI!Rlk!SURA`t#$6PPVAP3fX5*psrbe&JBVJvGm
z9PouNugSN04^SC^MkIB;h+nos1OlAaY{(^Slr5Kv*A{+@PvQ4l@L4;{^i3)A{4G5c
z5?`ltki9_#AM3?-as6wD8XFp(HC-7zG)PLO4a<&Iru{c%+tfFpB6xX^jPG&D1wCO@
zWOJ3C;wx-W+!DUEN7ndzYJgWeDPtS>rXRW)4cbB9^Ao|6K?!t_KN~R{X5g~u8FGT4
zNH;0Zs9C@SZNc{iNj4oH$~$cfWXNQmAxOj#xLn&ParT}efU!uHXtaX{*`%B#eV56w
z^Rio1jkpN&#|m$^h-(<c1H0Jm4~0L4ifaczo+sqFP~W8`vY`QWDln!aGQH3sE;dkI
zT{fWvWrwd!QfYsH|5XiYX(x#7NF~#z#tb||F2Q>ZP-+(U;C0Q;vzIa9i;t+_vD5Y@
zkQH*FgHW;v7^}o2PqMgoFK5aiijrlnS!%IYE~@JwLl-myc`$X|WW8X#;bWYlgDgoW
zPRO&+>V(8DWB3|Kgo=g!$sey#c<pzOG&D#j<i!jO&Iao-$fY7EQktN2aUpU-+RuCT
z^Pc^@XFu<G&+;DXYo@NPScFVS3#LOR<m<r54UF(K1Q{B*0C3f%yxM2DMSL_8G%a3*
zBC8H})N7hypoau!rfUXaZ@nWJy7)#f_)ZGS8cWJk`led8F~(uf(dAlX6)3_1$FLEF
zx^^}2xQs;+T}=bhk7$VH5(sEOPQt=Y7q5X~Q`fM2h6%C7A{aD#DMY|1tqUme;7_Ba
zB<(M*>@Ti-fEQP&y<cKFBoux^Lf3;ATA(|SFKf1AT4l6kVOp4_1PLtW#Vkj`Y(*ZB
zq%#cUtQ5Lr100Mf1)5WghFwy#fuC8)<Mf4B5C_viNLgvJ3O*oe)zk|RV?(n{16fYW
z^QJ8l@uGy;e5vf@fs)vT;z=I5q>`?dH!Xa>0A%nv!saw3n16w9mB9%i8R~m1HVcr#
z{wK&-sYA3(cP&r-)n;A;--Zp_C6YyFm<!yMSxN5DXGw#Da!y-?255c`LBqx-xl?0P
zkquSTai9R)FwIh8Q`MYw(Vmc3brXE-q&zq<Fwi3vO&GXz(cB=3o4l@9F7qQ%%|@ZR
zzLzV=gnU&);p*4Bb+U?j(p}VYIh}5rQ#p0x_A{##L~|May`gJv8h4wfTe^D`**h!}
zIifpix<ZwmS0vA51Bx*LR$=X_Etr6qIT}@A*kM}ZTnAMFjZ6kKrgTaI-*5y>g$@_3
zdE%&8Abywg8j!q!)UV&@k=98$W2#mzjRDV2z$xW3RDo)lz=<8NScSP$s6RBr;8lwl
zbye90-XJ`33&wDqL9;S9AU()oVqh9sFt(YSD$LTr>bY|m9`6?7nXE`7==$V!!Fvyn
z_R^>1WwS{5%XEuiuh7mKUQD8E>ue^8g%<g&tFgIT^`x8Xg$y%E4Vjg|P;Y`}fIBd)
z99_}@I~C2KQw3IZAWMBlUd0)+i;EUG-y}!@e25873%mkq7<eoSgi~)2BgkfzY)aX@
zicCPCD355-kWM+8&L*<xJh4$oPr?MiXHoEy3IL#IrqS(SF11GJ)Sg`y!2B>{)Uq8U
zb1~=&PEuhAaEV~~=~?Pt==~HLYXAZRE-;j2R65+HH;9D<=EJk!P*%p1y>cZh+sqRK
zAkbOjWm&*2mEN2P7En$F50x1cA{+F=GD~RaOcE%|J}yhbL~RTGaUj23FZW6|3<Z4^
zGZl_l%|f=A1k&t9f4%63F-N1z2(=MVlO;$-*`ic16qui)7$K3%ifMQy&#N_|zxKUG
zI(MJSfEl2Aan+Wj%i!ASVw@qXqH+TUK14QHk^sSk-rXXqD@#ifO`0J`O@@X9`xLB5
zKtpU5!o}4iFeT^%#JDVpPzvIv#F~bXeTJ?lo|RVchehv9nJJ=BLm@c27Lrx<tgEv)
z27F+=v%@w6(zn1j(HJbl0D!em)2?M0D+FR*){z{1?KKUI;C(hUFh$`s@@i#mjjw>z
zSYt~<F2Omkq#BC`iFqL48rl_WJIUxI0TsbC0$5;3rRtIl<ta7C{8cvQ;ApeBfK+45
zk!4m#SRL}7$B{Dm(h;A;&|yeQ^II%4IRuc=e(A`u8$wVI?9j+vvdm(Y6z9-t*sx(9
zg|w)<Mn1|!rBkY|)IciL)hjWxjoA>Y5LiUcBsO{*yw;6j(Y3Q+4GfgT5&l<|a%Dp;
z!emg`cCT3cb*!@R4%FfTCyWlIOC7?s2ed-nG}rn3QG%#c0wsfXu!GwUJP#vCCz55-
z2Px5|%#I7HO{!KhJLyX1wN4n%iXHsiE~{@NIozA5$PL71XXtCd9R7MVBMr)ImD9Rj
zq^~mP=b?R(&WsCABge~?2L3+QG`)?@_UB(~@eY};6Dm?>z2+=r?EdVz!RFx$PyFPk
zu6GFUPD0J0?}F!-bGPH3Z@55W_es$`lv}=c1fnV=#*oJAY1LX|EKBU1zV*TDSSvYw
z!)7183C#D77S)@;grn+$l`9SJG&ZpE2-mFqFqRu+Gw4ghd9MR46E9T-n}rU1^%ea0
zTi#)uQkh2Dc1tDGa{SxgksSD8vrI2*XUsehVx|vFaJX+^tPh_>W4oEcU$(HDygCim
zS{hfdeWe|TfED12=#_`OhsAlVYv*Bz?i2Ol3+R<*@1r%!`Ni}2UV2e0E_I*(vjg67
zQ5d?YMEOCbh49Y&{8D4#ub~f0W8s^TyO;h33mXdGf*;U@!ng5sw(z&~ZUH|0-UC5H
zbdN09)S3T&i`R)EDa|kJ2Z{gWiPx##9b&MdisIzwp~Cgr!CgX}$2;-zFo?^5zk<nt
z0}k1)Y^b%v6f(`;csD!s-5<@oU2uQw9TS3q$PumFc+@)@tu#x#Zu7rdy^n?JK)kDv
z{?NM+@HM>^s5aJSfcfRGPk9}zBjLt==g7~*eqjjsJO5I4GhREUzQP0&-l3(LXQq~>
z)3a9>FHBv<p**%U^-Q{|RKP9GOf4@i_zPag^y0#d0J!5xGf&PfFHX;-=cW-{TA5l_
W5P14o|F-L9lK%&ZJVXYBg8%@q5Jois

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..b00c7bcf0327cf7402be13f4ac9ee79c20eadf06
GIT binary patch
literal 7953
zcmV+sAMW5EiwFqr7LjKH|8sO@a9=VmaBFM;?LAv;9M^RuMT+E*dfKs_IIhQ3>|I9W
zl3YGTiIOOhqNtHbhoo(}in^WMz2wevc4j>fQjDQCsq0z-rhr>dKGGIJe;of9t`Qh*
z;}k*CqCwlArg^kU(<Vq76b*_N1!}ZK(X>71K4xcTm*kR`BRi&6%bmIRoO91T_q^`C
z^Y~=*r@!`G1O4Ycn6=Dxn^-Fwr4+eQCYDwrhGVC6(=7X5+oD6tv&ORNzvOp+%D?F+
zy=EJVe5J|TXJ&0$av)bwolJ??wz@)W|GJ-~aF(eP-)mQGTeBV2aNyVh;&5Cl`!kc@
z`O);7M?X{WS~;TcJ$X@kM4pw^lDuKMmiz=93fvwj&#8tC{4JYSNnR&e*-`CPTZU5v
zPsvXK6BK4vE)czph4Q-Q6wdfAuv6&pnqBCoUpT;;zUSCmg@fKBi$o_mM_&Bc>bV}-
zU6EBIFXwbEw<<5X*^=g@e3y95E2gUZg`-{v9%RgH5peuD?~qw0My8Ct&bXG2UH2|6
zYdRV4>+36+CDP~gl{7<JH?zHE-L>&63i=98Ne@nFlfAhD$*pG18^lgwt*?4VLM=F&
zqf_yEA2?4rJRg9|OCSpbS96ZGMkqo?GxFpH02^+pyphV87D<&{-O*IbQa60BGh>mQ
zY1oeCLT6s`y%q}Sd#y&EfxP4^4bS`M8?*ik{(_%W8kNT9{ldduOWxTi6GUm!#{Ap2
zSnWQq@svL|H+Sj(;2-^?G&p|t_W8C3{?CU~ZyyNCFWm1*d9x%WiR#jxH*+pkJ?J48
zbcli{7A^xLUJLENuQYnCS=A=I^bmBx0iesOvErf@(U!eVV&u%cW~|JqC4$P@R(7)>
zwAnldXtOs!k4wRM>#_xqa0>lsG0Z$+s@+jmEucP2MG1<{`vspSWgmQ6=-+xiAEJLB
zf^QRi9)Zs>{z<^6Tlpw_y5W<85B{d%gTEv2!QXNCbW>RTOv3k=@(A(Tkk=&~4&Y8{
zS0o%IXrz%g*T#<;W5@BY_*<*CvCiKxHTV$kpj}Y&=K6wZI+Q__XN$%q>9wq>x{F-9
ztPgacun)~3)cwjK<%n_*e2*yi!S@0F#p6e7d`BtdqxA0)<urUJBj1lHOYnPr+h6wm
z6;9^}@!FBKCpo9difV_H{3!crRE{e5ixcsqd?a2{>t}@hjEjQ_K#W3Mi2PiMv+(hF
z0f`rs;stKJ;AWV%__?KA0A6kr??6_~tr8<Y2jT!D!EtJWQEFCNL9bBjI+RZ3fN~JV
z?J!90D2&~`$_HS)?uT*vASnNX%7;Pue@=N=kwNvlKzTm`TJos!7$|a*s{e7NN9hIS
z?^F7f0Z{%yWk?wY<sVhXloKGFlgcTO%xUF}au!DPoN^xYWJ;L^4rhobsg7f5S=V9l
z=emEL@>t<cBJ5m(^EQ57DY-W&5tw0~8O*~WEUbM9`oU_tBG^gVCsvaT;XDR?WTnR=
zrM<i~6)ElKr2~=DL0&pk)q}AR52Ea3j4vjQGa+1*A}gJWl+N(di;>bxyi|#l&hgUu
zkd*O!Avk|fc>>^`q~%X3SE24{sB2Sxo+=p$GxrNIBEJ|Tk_{2bA(067atyl=!!Cxf
zSF5m%7<M^^ZH2JyD(tlw_Kg_!CqmfIZr1%@ii|6kd@>^I=agSno@a!Zfwn8Z7I1<e
z@K_GH;V_SdY0ri0e&HfISxGa>ZnPo^n%D<AHL9E>2$Tfo_=mV&Cq<w+2FU}8z&J^4
z@1R+BG}B<oQ&4Mr_wr~w#X~oXn^8W>HgWT&hzOO6!>nR94|v+L2B})4x<j*R)}v;1
z5^&Q%{Ra9B^cUzW&`(SPhlg{ng^_y^GRYiG79!NzO-ozRjCi^0?blUE9C=ED&OZqK
zH0EQ}ew4@S`W)67=B$Xf-*z_iuzT9=2S{a2*T{2H$XLUbw;HdUZ6uX5<?A(1l;4dU
zr}=Rjj(?pnuRf&wj*{WWCuh!HdXeUCN(U|4552orIY5se!s>B3R6TQ6;m{`)fzhPQ
z&76JwCs87oLPVf?>Jt$nAG^s;=R!nOD1I|a<a&q*R8PMZA+j*ePSp?*2Z*#qiIhV`
zpnB@XC?~hrsS_gd!pzz8hoeNE3lV|p$IeEGOmXNJLPTB)9e;M_>{H)oAP;jQp98Ef
zv7$r4aKVSbTm<cVvt(q`Kg!2>D#7!E$XO|J=0?tLR&o(Y28eeUrenEPNUYDJrQdMu
zkS#WX-9f^v8P2?wCzj2vrg8Tl;&qtI%QkTqz|RDxbBA>tj5e;^VUcqs#t0eH-fAcn
zB4=DZtD&3_t@5T&FF?GHD_9&Vit$+`Lp2CpM2DS&`L|wEZku_EO$65v2o8^h;Nom-
z&)~Zm>YDPP5FAX!A#Usmt{#nRstC?yna4R?j$l(~7D+4d6BBWqBH;7@rv@y=euxx)
z$m3Xj0j0i(QC~=?&*$&lsmDAx@mj!bT25G+@ef3Gofs=lflq=PA<e6No2WnB4^lqB
z411os#jCgk5-E8aOFDTP_LP<fdZ4P_0bX*EXe$NB2Ss*OuhoV*ld(e8{En_$ZMHz=
zb*{tw-(0tQ2hzi7-;;;Nrbh<`rY;P_-@$YJ{i7G)_x$ka`Sj@U^tsU~-)lc6mVtb)
znGtWkxOC;RA6~f3tZBBE)pX6-$XF_5roPuW;CuVBS{{!2eeeDdEC?xME@w)rwQBp`
zy{e_Dy|`4FCm=_S=C!`p;CqK@O{BE>;`HQ{U;K1qLxb0Hj0R$_QS!Z(rP-y+({P;T
zm2JmzW{EL<KxTPgfe^=*fyU+8IR$vL2y{iN)IkkhRrA`AZePx}vBMTw22N6`K3u+|
ze^RujageukOw<*haf!9j>uQyw6pRuFAL8K6@})&!V*ld#h1tiKfDPNq`QD*O14Tp_
z<_PzSs{CMTZIDu=^&`Cg1GK(WGL1f0%eaoF+YDfo101hr5c)?fL)Cc^|6yoj9Ib1+
zeo^QX9J)!uQCM7h8b_gNpr02%z$QKNv+_Dx!nmC7FW)#R-;@w&fB{|oN^?Vlw5D2e
z&LXNqLOMGkFS|w#hd!Ct*1DgSWSPQoh$+=k0p?lRF{v4h%g^5IVT;GOm}QU4*RIPq
zPfCCmMoGb>sf5I-ur1ZdYsqwvJlcH{0GCZGiEU^T{U^0k)o3TR<Hx&Y<V5gdKvO_0
z5dI1=2-pBwq_z!|$0%V2uB%z112l9erIyRY$R`uDEYVG2=y6a(g_r0~;o3?vf!dkq
zkrRTcphC;D;>2-{H$}-@>%R{FQ<?$%pvB6^CpfzdD9l6FN_0~}#YDv=driBJgQ4m+
z$JLgTf%KM>{XGNS@+rALH7rE46&c5JIh7bxW#m$2*DfR(v%E_;CH%r4i#V=jL@-ZE
zP=n$H**|WXFhMPab%!y9_0VM${$R?c6X#^&7*(_cZ3E?j4lbGCvZ$eJaTTR{82~>6
z{v3fe=~U6wj3f>fZ4N|;@F^B_<cWz1xgR(cL=uMS6hMIzCu!HiJX14-(Pzx^4U~j@
zoS`0v!>Uab-lH%T+s$Sj1IU1c5)>F%tH`XP;>b-u8-$%$2Se8EOpdK!#01aOmFp-6
z5n5rPs)jKqCN|ZlGx9*HUmllx;isZ0GBa+_?dh^+n|YG7<h~kSpa<GI4RS}!o8458
zScC+r=%$*lRwGK6CKG}(MT1yzZhq=%^lKsp?VFl?5_f1I1U9c*Y8mT9)Yu1<z+uvn
z#PT3KsGwk2Al!co?F@iYb;BCr_**Vdo}0dmCBoCU%+5Xj1jLj!(N733xk7)i)bBN6
z3~d@1^SxFsTG1?Szd_Zx1*{L8dRq8#AOqpNDa~EB*Lr#C#D&XCc`Qo@(1R7|G}eet
zwLZCYer#m{1&|&NphqgugIFVa(dtY;v9VA_cG6=3^oa`e5Y!9`&YIj6JQa*J4GMPl
zpL=3yX?_kuEmL%mLQYpP79X~8+byp`C{uGRuV@++yzgfQ1Co;<q+z<J-f%2JDm!Ij
zcZ%<_?l;(qeDOufytZP1g5>?%eEXGd#In4i6XxYK>@LuKSurRbxIgn0$xh<PyPlLQ
zSdaiNaOC2DBiSfw@qeI9dC~Qn(LYw84niIP0fXdO=wWc1rh=_DU6~Ud;Upj{0tc6$
z2IHXFQ+u$k4_OLL$Y^YVY%RM%_jMUw2&~xN1O-wBwq7tu#@sToY)Az2{w;S?o@lpr
z*J;=5G*sN6Wvi0D*wfsCLk>|q0w=dzg6pT232ZuAMRbh_*uD&TvQ1Y<qZ}U1nlNgr
zF>gHqQ?X^uK(j-LTdo^mw^D3TQx`5lL)Y~DEM15!{x+1V-eJB^G&O&PZxty`mA%zE
z5`N0ke*i5?u=t<YmUob=^#W0C)8HG#jPV1mL6wu(P9Xk({Z`I?z_VKXuW&uH5jJGa
zhH1b^6#q92P)qR}T(6Dq3Ujk7J_ekC2wp}J6q9hGG*|lZvgpU{%^Po}S(}!tr~Rug
z?fKRkNWDq1h4Qtzc`+4x2kc7dHF1j{X0jP=5<|*`^-4up`@938`56W*jA9iDzOpAl
zA;Fg`5=<6f#go_I4`FhAtat@cG0azl_v)VTgz#PosUH<sQV47)B(T@^1Xe>}$>QsH
z@_GF8Mn&a5DkL}@lHlum5)=}AEhIs@FhLg{sDGn&N_CHlkEYf%n`S98=D}RZJwnJM
zAt8T$6Cu|Spx1sGBKJ8+@K);)cd0IMf2Jbt0J4p*GTQYdNt6PF<V^*}I)9FLelQ#n
z-l#*v_QU8nkX_f&<Mw)b9OULdH6^Q}dq35*IM|M)9`wT9LDI=bsFG~9Y6eUnw0OGM
z;+T;9@rGLHdW|1SGIy1tk}AER6GNUlv<U48g*>(ZShs1QG?#EGw+=-qymvj(>k&_s
z9$<54?qPa`hZ&8Calro>^f0f-Jxsso{E0~C(|bC!jSda6RTi#816+(?oMABzzAtoW
zNOV3O>HN^14sD}DnE0eegboc0F-G0_@cTlC#zg0bBb^`F)1hs2Xq2thaveG$#5fWW
z<LLWBhfaykk3~8^wx>he=+Ftas?T+3LWnV%5}bHn=+Ie_QxAkYpRPZrmKGh=(Q@@F
z(Pwuv`+D26pO_Thh)ZEA-1pjvaCo*^+4QQT#|P``G20cpd$Y|t#K~B6*~GMyQpBys
zGR)t+BjK%%;JR}@Xt-;2S+gDA>*USO@m(#1VRI6GsjFy2b!$#}*&Ow~F_*2h8SH;X
zaNB@&8*C@olfY{g!mUc70gj;t5-omjpD++UxBJ1q%226dM$Kl)VU16Tc~B&Fq|<lT
z?ju&PORML7kJ)^+1p31L9=sbiAM9+utIa;(vZm!1zq_w4#T;OYSt0g5wD9U?cVptU
zBpFzFI;!Jok=jI~`_SFB^0?mb($FgssJ8W(90tR^AHFNLF0M;?ciXz)@@?7rFxSw4
z*q*KL($;H9GO+cbsE&^aTOW$p`p8|j^_br8*46_FRNHz?4x{1TkKPqq7gv?NyKP-?
z`L=BR1lQ1j*q*KL($;H9GO+cLsE&^dTaRqvq{r^Et;h6!x3(TgpxV}BayS|8{fWC_
z>v&D*yV}f=!MA1M6HG5Dou4V=-uYFKcV$W+lYih6#-gThS}f#6cH{={@<Lw7Ja*{~
zXuqo+L9F*@Tw;3<0PB{TA<Q~5p=)Qcjo)N#c~a&6F@NUFzwswyHD}Q+MDF+W+H=7(
z2yT0wkz2`byYymEq&wfz&@dyOdQ^NI&gO~>EhzFu{Oi&SA?pHsxn96m>IHnYUclGt
z1$@0;z|YqU_(r{ee^4*r7wQH4qcEUQ1qv8ZgecONw?m>g2uE@Lqz;^P4Cha`!C|+!
zM{)jaOB``4d=%#|w!w)!7J{PQ5~yF>5>?z`AHuoi7QeX_PW%?&D5u}rhSSKiC!(DG
z%{H7~s=?`Zw&pZ?8*-G>zukt@$OA5-oc`T5oL;ZN>G!tgG<qv^l+*8T!)fFx8c|Mv
zunni>8l3*aHZqOejveLnN84~3c~nP~(|_EC)8}e%`cGSP8oebv%IS}{;q;{%IRCm0
zPUN{E5l&t29-herk&oUAjj<o^%eS43xb>)m2DD^ius_l5dq+Zb8Dd$c#qUe%ge<d+
z^zaFee&Oc|kKm&L3&+9_226TQ%b~{s7J4>&7~q#bxc|^u`Y^y&d<dWe*~Qh*bv87d
z!aE9O;B=y^ze}buStk=+U8g1aRDq~@RyBa<@+q1>vXV3{k!#*_gP?sBmSX9evC5DJ
zyJWn(dZH_Sb9I*tDfL8G<<9Ib8E>GP=o0s3clE_c4aG@qaW8f?bw-WdWzOggcgePu
zi*Z)8S-<-jWEIj#3@LoS_STrAVa)iQ+*_lL#Tu>S9`CIQp4gON{O<1>bWo;TNBWsc
zY;yZIMMUVT_6L}D$$R&M@7)i+cR%>v{owD_`@tiw@l+p1Pdvz)`3>&q(y<6DpF(Y9
zbr2iz7ng<NP+_nk21duf8m<oj;~2vX1Mf1gLI}f+FgU8y)qoItso!F-5Wa#*Ie=6{
z%cocvVBiq4_&6eP1QY9Xg?<t5#qa=0yn1D#%QV8&`viHsxPbH3M;Q}bW-Z?4iCTE!
zPNe&H)QR*!P0j~)$~oUH-IDoqdRyiPcgFlsP3F@(Wu9;UY|Z@ew#<+0jQP=;%n$CA
zdA^>#HS=TJGJj%6%n$Z&r~5-Y?T5RvWfukqYG747LkO^zmYrgEaB&E#WvXpM@*I1B
zLXEDlXcBN=LozH9afVn!QNJY)pccI5Y3USw^3`d{_ay#I20C&1Qd2`i=n)EXqUJON
zV;P%~N$uALMz71kWwvr6(Np2SM=&b21Kim4AV$aubTc(UdxAktU@nsY)XKUo$AebW
z{q?n)9tgL}UBq?`(_AdxvcSd02kUHnDBL(V@m)18aIx``IvXDiH_lCTSB(o?Z2Uxh
zjSu#R8+TVWX_sr!He(Lx(DWb(otsdCse4VLzSmK<TmvTe^!aL6^=6!kN9Wts)9p=W
z+rea8Bpr3774;<*fkoDpP}G}F>?oOdM^g!rNQ9l~j<SS1pCg1agiW~`+zD5AB>0Cy
z|E64x?1Za364gU7eN(PZ?0~B~9<Eui4)Iorb?;oH4dQGl%JO#kkMDZ>EwZj!c|X{i
z@gz%K&+yVYPjZM=(hS___HTRb;PQ+dKiOZkk~xSgnUcDK%bps(y2ktPChxeRFuvD5
zxk8^+9)8PB3(j^|eeds=3Pi3v-;%y=rV9<Vi_U7h+!Z*mh^*@1f?#sR2dqb4FKD@f
zTq3Gr%T9r8P$-)}Fg+zBSp$7FzD)-vm$GBZHWd{@V>2ZN)WrnYB|JHRatT+a)y$pS
zraUXJYq~C%Ei*@8q6#$Gv>U0yu~{nX#38f=e0xsHFSL30Q7M2<q;$Q6U$$@rBAn7}
z2sSoKm&?U#3%}*3_<Jq*1fE6uCY3AvEj|<yU$?WLy<r6(_Qf`L{i_EW8ycQAT_!L?
zrVY!EwV@d+wEw2Q1{J}}ePn#kOJ0UfBg34EY%bGNe8mk)K*G2B$Qpl-4e)BGWNZW9
z1Vs0!K|knwfFf8jsDTbs=N@`u!^B(`Jwr|q6zQhqX%$8h7rpU)LJ}V#<ej#Kf~6AA
z;@BFFz~%ZziL>_*0gOd*M57%v$U5aD<-1IVofqAbYQ#lYI8uDGMO+mj9^=JsiYWdu
zR9xK$@;oNbhWaiokTngcQ-QH3;LG&#gSgm0VRhMr5|kZkn@alw{4Z-zOFKbqM=F~(
zQ#Pusyw?Dw=BPGlc7eUD315sv1&^J!*MY2%3mt^A1$vj4nB++gxAf&K*aWKTilvr%
z<dV7yGIT*RkOx!O&Go<%R0kjK6dhzqI(AH+fmX*Qc6Gy7Kq6Eu^iSbvjl!$nKGe`4
z9h2v?FgR<h#~_!Aph$U~(#7@3acM6J+Dn4=lAygL=%<kcQMWUBb;%-RT$(c-GA>^O
zMy}J;0Hcwifj0oJT`s7-hFijCCqdKVT`98aaG$-R83sB_@N2qe@Qj6E6ysaL;6^DZ
zYb+&C>g(!;jo}V^j^5WI%RmthIEIZV)XS@Z^JOfGKx-QMMl9NjBt_6j$+Ij1b@3V)
zHgyHNXP6LXEQ0Z~he8C5(z<{W&jV=Sl%%~im%TNY_wJetHTMgQuF&^!2|W*9iGkig
zzNp!bX>Fh(3(LZMB}ip4LuT0uokH|W(kTXVTH-UIfw4q4_!m(cfDTlH;^iL^P<fIy
zXP4Dn;E|RJV3M2<n>d(5LY_-AT<{iA`=|bb7;Bnk8rZv(e8sdy_Fj^33Q*o~3P4Hh
zXz|PtomW{`H`XnDRRN^^dBSEwWtg^s7nZ?YAsOnWEH-tJ!k#KfbE)gJOn0R~JsX>m
z(P>zj<j`&A1IK1oGD7rj(!hY6*Os9Hn$$yhv9U?+{@DCwP1ST9Qvf$ivz%O4H78TD
z$K_?+1aCYc5A^r<cS|J`#yC?l*GTdPuj`RF_>rh)qflMn!*yv~zO12e^=k>8ETc*#
zN?Jaz(+zkkrvz?gvr0iUpVi+Py5_oZr)j$7J4cbd4kMW-x}#<)RM~k&@=P|M7~^0S
z)~?!uS&Nyc@fU_2=0eUjP!-U~R6t`=rzG%YNx)R-aNfE?92E=1?_xm%lGl;?wd>u|
zD%r@Is+G@RXta}XO8E>`pjswzHpnZMVZs&a56wk*)jY;$RkndQTsL6mhJibaW@WBH
zB9X@w!ZdPVY|}SXTqqb9&S7XQA;dFLkw(z<iED!QZk`uNAC(u)65(&?ErGp4J1cl$
zimt7)=_eLi<QZ3E6Ta%1I@Jpqrk@%zD}kZj0L=h*U|M;)5(IWCnnC{ytmr_N`iz2#
z(`>i0<N*}GhnV2Bz$>7JfybgiI9~@bf^3$_x|A!Z$OQC>@`x4<iI=14m8BYN6cU**
z0q|KAysQEM=$UCG9L%~_2we|gYd*|nGe#}jK{DrquHeiTh5*+U7N4A<?uFjgp|J)a
zhTy71Sw^M99e;yZNZ?9%${fnycv`qo$=}us!~h6%mUxL5a7(2(E8+^2Q^7-J)`VaP
zy|BzO8ak5%%Ca|DKBA(wh5k5@iLP$+NHz=weZMmmj=|TTkS!*G40~rFT^Yk{j;=k_
zMnFxLAZ=xfSHVzVCW~T(lrJad?v;$N)+)f&?>5p2|6~@-0M(1Dwj^By*Ny=KmvThq
z8Vr1hY_N_2f(g9_MpReUqa>PMLpGZV4GH!sSjK>c*wTfIOHN=)&<BWdITE21#7&7c
z4blA+U4}d@E#VJ~-q{VNh(Zm8;OGKMPSta+&f*yGf$`1`+YCtG(%1@MNrnLcYoDTB
z%Q997#Jrp%IruJZ8W_R*tZ871!f6!L%7PqUHmR{Nmx5e^b6!a`<_!|_K)^M$D;9)O
z(dh&#f@uV>z>rGSB^An3YL5BKY|O#Y=5W=i#+W0^td6ic<h_6+W%BhVKHZ_ikdm%!
zvNq)qKt}teBgbwoK|QcTBX_|vOI1>wMXO=MhItgyqV5{`*dQvMe08P!)9J1riJ5K8
zhERpTBJw7&(c9p4a}0|vxP_LGXC2{xRVlw=s3n*-3ft}x>&1?>A-n^%xWEacL+Mfn
zalrzuP&dt0KGT#SDwRRWpdIYsmIKej2-1mUne>@Vbp5mAoNAM*<<CyK{CV{tjAz9T
zeuh`$w~!rfYE&c#g0r*qonj7u*P4+5b+$G#x?ZAhNayFF4VBKU3!TgJa;1U4-!((;
zr?dSlf7s$3FkL59q+IZtbC9$9GiL@q^3TtF<5U0cdI#`cC)6GKig|u<csuSJhYKWj
z*A?A-x#@d{Ag)4c42is+QLPomvc%5mD<r&*m9o=2WcK1~#eDB@NxcC~II1pKxzzAh
zV*@LXaLvjOV!1)qgT6GJ^*Yct@xoWITIj$RK7;>$%R7jpWf~dVEtgHp@o#yD^5BWh
z4SJD0W9GgPGreGhL%sc@z4%ZZ(1_w+wy;~qIt|uZ1{c14r5%TW72x~rl?S{B#d)nO
z@nMMW5%uBA@s(!pgEh+erTF-|dr>PcoWJtbecn+~7`iS+`F^E^@b3PVrN-i4K_8UH
z;@2bhLVW`Z8;akAAJB#3xA62r@vrH<2YmRw`+|n(=31~9bmd=KyiPQo48KhPB>v++
z|M4eL?+!57O2tned;ZH%;d<?07!c|4zP<ts;v(QLVLIS|OSUUJaqTdP%&^zz%}jph
zN7HY9`!`<ojtId(?1)u<)jJ%mG|Rm1hadla??a(F5brW1K=hsleCKZks*SZNFu&OO
z=r4O6tRtbuzS!xW6B~^o;7=WT+Rb|HXwt$2lHP%Z>8B<arZO{^=g&=E#-Tj2F!@xb
zs#L%&OiwP(&-rs+$JG4Xv;es6Nz;$dF3wL)XJ)4mTw0o3R1kRTY5$h%W>fzUzM?7_
H+k*fAPzrYd

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..d7f75bd67b3cd8e24a871270d0321296eeb5a08d
GIT binary patch
literal 5672
zcmV+@7T4(?iwFqr7LjKH|8sO@a9=YnaBFM;?LAv;97l3|KbDj%$#U%2=ku64dzU9_
zxeq=?QW7ar5;YVllawXL$82VIo7`FL&g{)Y5^X5Bi{qFAI)HJW`#2;p^5ZiC7=~dO
z2yz!V34#DYz79bixFCoxmrIcQN%CPNL6BGVYj$RKmpi28lO0o|-I?jGs;;W8>Z<P9
z9)6?a+sQ%;{fD2(8v1P$8C%=sIJ#3ohFV6NWyVXoUU6{GnuRKh+NSQj;S7G$x$DGm
zhY3ZF+>YDztVv4_<qC?GDI?QVwvg#;I57%m=q2RfUd1$3(^51Gt}P%A*X4?HarTEl
zS@_<u?-X!1M|AM>m(-`FMM)`3+q!K?*Wgm%4ncWN(Int+Q#Z=eZIqQP#k^@sa7)1B
z(sf{h!YoP!RH~3d>9%SW&N?=*Q|QATHi+pI4zZ>mKJjtkFn)Rsl~B%-)}FgLHze6x
zlA`6MTuIH{l-BHQS+(Mhjc~`7u9Td@G2BNkGJ3WMIL<PDNUtC*Qz24kY@<X($Co$N
z5;{FRJY3MrXxJJqtD1US&z`E3Y?FLM!EnJUm)slL<f&W%<!)y6JIIWaT1UZ0JuO(O
zRif&}kDR9*o_E2eb<hO?S8|qm3sHoOs^!rg0M_hsWjmhJ4HPfiB}-KdL)mt4f5t#L
zT{A7i24UWCa3=+HaJQCcAa6Kw%d5`$)~xfIv+Bg;R=M?6r|=Z+%v;+PL{Qq)3FqDp
zt8K@v)6Vkp^5y@9fAmXkv7GF^^F1y6p98nJ9Teph9>tNoUKW}p=F*$jb2c?SkdPGg
ziGpjZOTY;3r1Cp*EAGxJCgP<Jfe01=Eh*ZTO{|EvjQf$6)AOpfwXBo@rw#X1>?|m4
zG0y>d%x$pavU}gXX#gbLf*hTio=42I`zne7)EB8L!LYepaJ0$oaCFk|FvsKc`$_n;
z!SOU4C-@PCV^IDo9D{Jg;UMP}9OOI(2RWaHW01m<V-`Lq<fjqtA-t}W?f~xOUO7T~
z2^wjo&GqoB*1&b}Bfjg^wuU@=slkEpVY8s*_1mkuZczq_K08&eN!)o$DcOW;o5?^s
zipbCbO6`&#l8?#{!{?~{bMSeLf5`PSbw1-1@)`PlR6YZrS>NY5c^%FhP0#G}E1b?z
zgnJ2V&vQ;=Rnztu`7w5Cm5<4fiW_l~zY;8|cN$}-)8b+V5d9Dre5Wfx7QPlNK*54?
zu)q!$-1X8HrycnM@Ny5~Ls=zv6KVNnPzN{(>8Cd6r4G3p?21@jpWH7Wk`F__9RbZ9
zgT6a1KLY*rDD>OUgYiEhe+i8L%kopQ1g1X##`_hpl4s;+!H{Fr{7=e5@+mO>VL2fu
z!T8hih&&3$KQ2$mlc1a_c^WixMm{T_gC3oe&x4)J%L~BaMT8@YWf^MLwpjexa5gB9
zRqi5)-ACZQhu>FAj=LoS)5|l1dCG%DYM%r-Sj~VBJ4R(<H8Bs)vmhfYJ?Sew#Y^M9
z(gZI}`byKhbfiXt2@emVY%0JP)5cj3F3}<@o%fYq<fWH<rI&fB>?>X7r7IpSllxWo
z{t5XSz&%gPugf=}?ggmpk$-_28S*msivc3P6d;oI5XliDKJ3i^b|HXW^kCnt!PWxU
zl>oNk!8U8KZw0XL1h8N7V87f*{9pF<D=GPgPuH)=zbe1V2r&okm4D6Ugd8wpS?G?%
zMl6!{T-b05m&hO+QMHQguZV#q4uegN%TtH|#eg|}ARG5n1e#-@JfH}SDP-cqdc{(8
zjU`WRt%;BGXgtqDH;bEoJ}M@%^SX!#)rupmVlfYR>ZS^*ny<P~HEGtPWJ}29riG3h
zWXvGr1sN;II57(x9?97TiQH?DN#<y>;G@>78|s#-1<P&RRZ<{v<S7Xm{N2z`V?K%6
z&+vF1ZooRr1}lWSOl!O39iH}k0I95QYk5wJFxGPA<JK$ZT2b{*{<FF_@^AUBQ~Wvw
z*T0U~s6Hb9rkvr|Qy0%&ex2rSavv?~0_l#+hv@Z_r24evslIql=Fn5Jz-W_~FP^*l
z-+m&OJw%{-{v{ug=kBuGWe*VriofS4vf&{D)e9f{h^(Gww~B{|1w?xML@FL4P(A;;
zpOYPSYk7#gcJbW#BYq;Uc!)stv*&z7<~j6h9wKjeuHU?P?)paz<S9<%yMXl}D|*Q7
zE;ta(MTmblN=6!f$zS8C1kVqAcV*w5?Yp~M%|##?K==qu$8t9zu|7{c{f=dNe6bb$
z4iaWnvsR2eGED9@t@}R^?$b9nO=PXYIODp`eYSCM+MscJRnCo=M96^k)<P)|IfLd|
z3uSp|RT@IQ2Jt><U_mH9#+!16S`Zl``pg{6ze{!X)@Y<?ptu%SapYPkF33i620zPC
zFUn5{#lch@;>LmE!dP5eRdF`UJm$!91e-duNLr1bBoQZ51TsA!Qv;S_KZy!I=5Z`s
zKnWKy!i9uzWB%R;^}q;@a3>6#hUL{}@PMdZLfV#9;FI81Nb{<nHaZ@5ftC+3$6les
z;tjF{;wyQ9l=Sm7?7Er<dSI&f5HGoe)UAT$fFauo?lxi0q-{|%zpv|7jTWeI|81E6
z>$lBQ$<%1d!P3aY!gw+{e_<59({qW$_ysthA00oR8XsMl8=rS@?+LLC<lqiQyyMdP
zl_kf!aGANKnrc=psn&MJP#`mPaBI@R?O8PsR|y9{>VX9%W%SKVSut*!4nD3Js&a}f
zRpt@sQKfmUgIgSYgx2^<J1#BE&O60#x3;w4&J#2c<JO3SJJ%Q2mlohU#VdPG<n%IQ
z`jEu(z5+s)DFKb8#bp_IGZ2Upsn$UaT~)(9gzj+8G>O0l+5}GG@nN!jN5A5<rZvr5
zIw9(cU$>F5eacp=S8?bi4nD%cJEY5Nz(m*D`PId%>%fL-<Q)8vuYn>#8084ZMO8i>
zzm=vGY5f?le}vYT%eppfs~Ou;OC|#t=Kv>b83g%|p(!O^#7_)uf};&IlZ!%6a_F`Q
z>4mlR7f3I(B@?{(F*fOuUY2eXPdF{55|uks(%lFFN;04uf7sE|61k-qQqDk%g*-Mp
zBW>DRj&yx2uihGbIU-3Ejzi2VmI5#@OO{UE;I#De-66JkOct}u)6$C@(%q>Dpm|Z^
zFlj0i;#Ap&qUF_CYDgL%oC3g2-H4Gk)S1MTI$eu4rJg)FC=pHsFB&uj#9ZZXAq{~K
zpf&2-KzV`^X5b|yi%I|u;>49o1!?(Ml$J#YDGa@KYpC*~gK@I95{nY+j1Ear!IWE}
z=2>y{q{^G3WL`{c!2h_a0YAiJrIRz9T?XXkA!|ejsiFd^;+lO?-5{NzluVARZpK{g
zZN?Hq$w6saO2kKnYIY-I88)X9fU5RfykeV$7-N=;bT>ji@MR#&HZ&jRR0L{>dO`PB
z4IL(^Wv}fprbs=AOq8FFn{?tFi=Lo{7Nu<veUJez7UjCAV{1VZrFQ88e;vki0@OgL
zqONK&(p9uMP$7Yjv!El*%*;p$;8YNaYPwYb1By;j(Y-uVH-z42%<=~)QRyT@JqedJ
z4^g;8UMi-Y%~~3e0S!edFtAqDS=Gb|Hwo4Wd+`p2teTk|Tfqn@o|!ARQ4S)k!XT#V
z#hjUGXisOQWIQ3AmQKN`YAF(PZm{izifZb46f>mZI-|f4v~>pbPB8BdQafT160}m%
zm3*xkQMxpl5R@qz#EQ9<`4`BzCSp+g{NnTE2@Qn6j@yP(A$1~Zv;!p=FlkScayL9E
zU|^&`4F8=}7yzZ_Mry>s-?=n9x3EM?#7N({xP0{*#FQQ(Cxn+=A>l6d<2DjQ+maIw
z?&hi$&63A&P&Kzo>fJ#-CB|_e1L3?a#fNO%y)-|0VQD>2%2F<Lx(b~lHA1NF=hx3q
zY^@Rnq()unu_|<$)CeiMt%Ymbs};gdYQlw{tU`}KO<Hi)W^a*O!B|^bu+uenZGC-Z
z8A2^HbkIVsq>xyA#3Ya1@(P49CCBoLwzS~Ai#ZHvE&?Hqq<hL8%Rs35q)a@W;!jz-
zG`1pNe4R3{ZfRg3dFLK~{7N6hvb>@n=H)c(uF~hSqEq_F^O@@?J4<@r#*tjXfCO-r
zBNzXZkoBV${|n0G*KOQE#>XnuVaNj@V4yq;Jq)g4D%fq(l{p~@CjnUz3~>1ca1NS1
z^}6f&kfqRsjK&to*0S65xh})=fE9b&pdeC(4GRXzm|a1J35j6d*|8h)M8(>l&^GSZ
z6!L(Utx7uLndUAWa){zlxY@B0SwA&&VAE2os%s^Hy_=9Hn{;*5&*8DG4!x#mE5<dL
zifyVIaXSLBW7{rvH^mk;p>Pp#bhVUUqzjS7-+)pDAK}l5=2x!pM@4d5_1S8OgrBnX
zZ$OI@EdD!b3m@iYy^0i5*Z2cs#`qyyqsEC$%T>S2em7^|<yk5Ihqs>j2<x&AP1m3&
zivQyUsAu>sHtyk1g}K`mp9M}p1#b}*6k~8BcU0wgOUQArapT=IYg2P!+P`5_$#=Iv
zI+_$aDPN7<i<#J`;8((~kzM=})6ICB=u)n%x2wu($A>)gGZL^!6sv0R?E?)84Zc;?
zV66BKxp^195Yywc#VZ7rg!!uS-Z@a7P~O`f^P>uj35AV#6!z|c!s;k2R(y}#yidM9
zsG8hYg$74G8hr0SgF=JvdNfEECg{Qg9p9**Qrn~Iqp3B`rdf(i@?fsyA)(|kkCNYS
zpyWCV#Jx)px#u9kyBS8^<xuMWPF3A5WDj3uG{YoGlmdk0h6-cC&vD_?-iYu)2n*|i
z-mxIN4k2-G7>V87{AXrlHFO`PmKFrtlh$1+>^&r%yoajEX1A)r^g#`#i=CDZ$scd1
zldjh|o+R^UpQ6^-6rEJ^w4+m4mM2`Y#lz5UMS5bLECYuyvcl(g6noD%ilvfl-pxm+
zQ(|-)_o*@I{68F>-V2US2_gKXPx#b<4K=Z$G+UwJHk9OQ1n2asG5v+Gp%Ec`$|wBD
zfekgWA(Awu#)J)x3N`wN^U*Ja4NVB)M}5MN9oSG48yaWpyxfK+g&N0vY8?MU*wC~P
ze!?gG#DNVpv7t%!M1b4Sj8LOLC7Ap|*w8tVQzyN`r^4sdku^)P)Ld92dUHQBw@*BC
z3TWYjpcdx6ve!?9y))J71G5kk9}Xokd!qK>W~w37$yl`6q_&?@B#$X8Fn_~Gy^l%U
z->maaBTu}RRMT>BKW}!KKM^A^Y)-;|CMz0I?7GusHoyJ6&4b-%r#{>sVBG}Yai3?9
z-#K_&jd(gZfa+?r_~UlrAbf835AD^iQpAj!&62$q9~bi=U+hSw9<1N{ykMVR&t(tz
ze60qCy>h1?gwMNA#Xr+-?{Zn!^NSy~hce70Gt4TnFQA9lHoG4euct}Z%Ts<EPl?pV
zAKgbDtd|GvexHtB)j+MU2lSBkN<aEQd|mve<+JVUg3C|j>!aL4U1A5mzE5ATr%Bh>
zNBlNECVYLw=j&q+*w+Jgzh7T>HBjs80X>X+r5}GFzAk=|_SyDz!R06N^+|4_F0lh&
z->0wF)1>R`V}2VyEqvYg2q!i1fPFn+_xtsAR|B=a9?-*-SNh2Z;p^o0q@QUwCk%cf
z51(OnN$DI+%lGHo0Uf$-J>fU!Gh#8$_f#$YfEVLDCcaOPrgGOhb3ppDHZt*Jz`CJi
z5VP=%=eM(@jc>8GaHP6_%(-~>U-_G{I<myIefvFeZ_a%O!9CpX+e&uNrauNHbXPiC
zS}uyW9u=>`-EwiYlc;=+eB1Pgkf8wI3JdslSipC}0=^p-@V&5r?}r8aAS~eTga!QF
zuz<hk1r(-00eyzxNBV<iNOXg+ALoxk;G_aLf7}FzZE^SG{K>93Vk^8K=TDp9_+AS^
zQFjFD4|YWrJM29;J9hEUcf$#80rqqH7fm?zy?esX>0dSB^l}|ef4DoR{%y#9PXD?I
zr@j|l_&NQXCY)~6;q-5J=hVLy+Ry1nO*r+vMZ?eO-!<X1Qis#OZ=zG*c5FYVKWW0L
z?^PXsPXDn9r?1rE^q+R;)W0R%&*@K_aQa3aoPTbD<9lz2k5d~z%zrHBMn1X~n*7v_
z?8`T;jM#eAM*~_cnodUt9emVNmqCW18+>0<KV*s(LXW(`(J6en@HBZfVBv)K#egx~
zw&{5tU}31y%K(4$;o-;6(U$>sla~NmkX_vPZhuS5G}%!o0jDzqi2;em<Pw@07&sG=
zrVB{Pv#KPym!@eB$VyVQM5?>zhJX%JSc+9rwVMnnJs^?Y)iVRZ&D8@Eq{TA>)t%V`
z64^jCGa&Y54-5xLjRZ;UvKPCSI-|yRnKOE$1CnXv0-P01CigIdtU($JAbIy|?~Xa{
z#SHG`-W_!!&}fJ~-n$c=Y)CM;`@0SuqEl`o31$)vwtrJZ0$tO7l3AB@upj(jKls6Z
z@Pqx}pV$51zM*k?m_$!<k=66te4tAOBCIq`tdZ40Y$Rv0v=fB#f{idR+W)n1!vGk>
z7-bk_mw63>7jBHfQJby>gxE{REd~qW%bk=1NG-H9&B6c!hmghlkpTNJNqw%65b<7g
z50E6kK$#iPH81sHL7x0Tfb%s>853M)4c_KtJv^@vsl=XyNG0oXp4=<v{E6qT%%@UK
znNRPH`H{NJr}oM`e}uC;^P^3fAKM%A<8_%&@0EGJCcQiJ6HS?)+!OQZL^InT*{dA(
z)~+I?lXbAFBZCL9o|R3r9b6oOTA5;+kUR%opioB?7EN64>u82WBF+$N$Ukn03#f%r
z^Gsx#zWM4*#K95%P6j%0`LL~}#q$aUDOz_LLSh-4kwto6Opb3z?oVZ<XmqH`|Bzrb
z(hG1C8*Yq{qGXt<3p(U>YLw(MQ9!M(+j2Z;HI)e0YAWe%l@Afk8m75euw{Wu8c&C8
ze8k&0ckz8SE^tZXV<8(K_cqR5bYG1NT+;YtxW>~7Z{zmXe!3d4pv5%kntM)^S$P9m
zVnZG7s~ENhlY9DpwHu`lG8OmFw`-@{4QAW!WLqR1q0)+QNrmf?p%RL)>BOFriBC0^
z5Q&6$Fx^v@aQ|}zPlnKttMp#Dx-Y@s6Z#u+HMSS7?n_kn#Po(-P40oK`yQ@Yu=em)
zjdh=1q;=!0C(816`N^?ymw|38M&9%AEn+CQGrV*eM=WHNRgFBXc6M+tjD1>;-rU1o
zd?gcrlr6H-sgl1|L*A1_9=n=Q-di#BO|?Xx9~N5hVe%Jk*2UkUF{^*qrWa|qGVC>-
z^e@DG|C_&Wvp=Xp)<W-~T%ht>_=vw!=YQ4a`#AbOe%w=MWt2@2fUcdAcZ*k{dRUt-
z<_ACeZQRGiV2#NwLM#d;x7b+_A4XasW(*%%UAR8GI-j|?v@$ol1S<TOA6$Lu=3o94
zA6=cjp7B@8tuFZL!tC10GN`9-er0(<0PNt%!qvsKmHCCt;yeM5tk15=1bF@hXUDd)
O@&5zIHdTW>XaE3aIS54n

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json
new file mode 100644
index 00000000..34e6f226
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json
@@ -0,0 +1,44 @@
+{
+    "n_steps": 3,
+    "cum_reward": 1.0,
+    "cum_raw_reward": 0,
+    "err_msg": null,
+    "stack_trace": null,
+    "stats.cum_steps": 4,
+    "stats.cum_n_token_goal": 27,
+    "stats.max_n_token_goal": 9,
+    "stats.cum_n_token_url": 72,
+    "stats.max_n_token_url": 24,
+    "stats.cum_n_token_focused_element_bid": 3,
+    "stats.max_n_token_focused_element_bid": 1,
+    "stats.cum_n_token_last_action": 8,
+    "stats.max_n_token_last_action": 4,
+    "stats.cum_n_token_last_action_error": 0,
+    "stats.max_n_token_last_action_error": 0,
+    "stats.cum_n_token_dom_txt": 2892,
+    "stats.max_n_token_dom_txt": 966,
+    "stats.cum_n_token_axtree_txt": 667,
+    "stats.max_n_token_axtree_txt": 223,
+    "stats.cum_n_token_pruned_html": 1014,
+    "stats.max_n_token_pruned_html": 340,
+    "stats.cum_n_retry_llm": 3,
+    "stats.max_n_retry_llm": 1,
+    "stats.cum_n_retry": 0.0,
+    "stats.max_n_retry": 0.0,
+    "stats.cum_busted_retry": 0,
+    "stats.max_busted_retry": 0,
+    "stats.cum_input_tokens": 4339,
+    "stats.max_input_tokens": 1464,
+    "stats.cum_output_tokens": 225,
+    "stats.max_output_tokens": 84,
+    "stats.cum_cost": 0.00078585,
+    "stats.max_cost": 0.0002646,
+    "stats.cum_n_token_agent_messages": 4512,
+    "stats.max_n_token_agent_messages": 1517,
+    "stats.cum_step_elapsed": 3.0203144550323486,
+    "stats.max_step_elapsed": 1.3659462928771973,
+    "stats.cum_agent_elapsed": 3.8209800720214844,
+    "stats.max_agent_elapsed": 1.8219048976898193,
+    "terminated": true,
+    "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..6bdd8639d29d9df84bd8ef01e47587d6eaf127e5
GIT binary patch
literal 2286
zcmbVO%WfP+6t!`jv1j5(f)gU~kPsjxLOpnl6Wc3dArcf5V}%bWRd?4+m!`X`RaHF?
zBqUfM#Zv83EI|ASd-i+;EB1T<=T?u$4-yu5VRzlSb)V;+>+i4q^ykV#_;GK?T8)j+
zgK5zflS*h=h|-#Fu2kh+>x)Tss0YTs^!I=8-}!s4%?G$=oP_S<aL9Sw4Iie9J6K3q
zI9@(^xA~;GJtPaSv|92)cz5@rUNop|w5y^*meRY8B;%G9Di!%Ggxm8F^1SGVYhFQf
z{qWmbi)W;H#Z~jVan-qGeY4;bW|ieReh)=y{L_WM7Z+TR%d)ENW%)YkmtuS-eYWt(
zb*v~*MQ-8UKl$vppXuipw?2|as*;(ys+6|=h1*C)m8&ULn>E4Qxe!dWv%Rw$ZSO_B
zeP73Jxu0_?_|81r@bRAoXO&h(WxcyKtLI5LKMt1W{ajKp?7n~1ZaPIwXk=qUmILor
zzK+e6$1AlFEVD(9d^+?^_0vM&7_Kab@l*FEUG-IhKPeNrpq!zyPGX_8(kxfVZe}Bs
z#wtC<F}$m|u@|*Y-L2p$u~g4<<(nk6nJkBM;b{{^D*Jszu}Xn5JFNh*PNnM-!GzeG
zKum(bOOq)J-4+=$s-WP-#A=>cMjecE_}~M?cKb}6<T4rZ^NaQQg;F|2ZI>VO+`#8M
zeVJRK=XPd=Fie&C6ppr3B}y*CJu*v$<udo~8n3PLZfo{>EN#YOnV!Acn8^|BOa!=G
zN4%(x&pvRS!|-{faqUKd83BBJ^co6?MVVv;*TZhiDS<m*%0c#(7-6D<&2Pk&DQ=lM
zhDGGup>Z9nnDIRKKhn9*t(--5>aH6LmY_m!YGL)1g{=BYCj_0<R1_)<xpzwny+cNu
zxhe;!3A!Y!kQ+K9%T$hJT4P?tgcnsV2yJV@7i*juf9zJJVKaxvFcpwl<-8<a>N8AQ
znMkB~>Tb+YM3{f!+H@`u4*-c41$9*dDi+Xfd_pV8f%OI`=kit32P9DvfE(Q<m=H(7
z)=eUhX1wP7@#t6AiJ^g(OI6dOTLDc>matuis#8X9@ib+HG^XO3W~vazV1ntvTHr9C
zJ7Lmlf3)S=#2sv)Rb6??f0Z;s+5OQqs)r~?4Vh*OApCN4nJ-r@H^Ui2-JP|y8$cf=
znMj7Qn!vGZ)Qk|RKWa5Tj1`sfN6T&{6Ffz8D83t7B$x}JJQWIMxIO05f*dqMt@K$Y
zxL6rUF%DdL>{=#QV;};luMs&QS9FwH0YN9kHynqJMESL_##kyfwpYb%mdhB?ohTzG
zCpco0-E)hdV6HMn%VrS+uzALQD2oA5k<KCb_d&GXi*|NcuNQ598toh~>@C`ELHpuf
zic}f)696AsSxzG>49q@#m5tZ!HViL18iwXy)n$9$`sZL^%B<NH5C5kfcVmXaRKzs_
zyafVgg&9yKQPN}x0_PaRW4DYp2_WbVw+4D1U>;ab!|#vUZtIc+y+~DU{n2~wI)x}x
z?><b^198w3+tJ<w(T{fHcsDvo<6hK1==HZB?(goW2^!PJ78GSlB<SzE4Z}z1>^T!-
rk2In{#UcNc+U{29kH!Bg7e8P6&)coP7VmVH-d+6Tcks@0P)GM4!O@Rc

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..45522b9e38d9eb2edf0a451937e726adf406c308
GIT binary patch
literal 97
zcmV-n0G|IJiwFqh7LjKH|7UMuY+r9;YGq?|E^upX0Bc~GYR&)wu~TZNX!Nj@R2HO8
z0aB?IB~vn(dt`!Bb5fH_6#P<?6cY1N6q0i?ld~0qOOtXlOZ29cCh7qI6vLXJLI3~&
DPpT!{

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..1d3d08b182ea10cedb6fc2106b0571fbb76d0608
GIT binary patch
literal 7728
zcmV-09?#()iwFqh7LjKH|8sO@a9=PkaBFM;?LBF19M^d~MRG`8z7xmsn2NK@h+Hn0
zH$_UKE{YnFN+rscs}S4S-M8FX&CaanAjLRBt_GHYDG<aH6lsf~K;aq%48sWA)Io!y
zMbRd0(==#{qG^#fNE5UOP@q537Ddsf-}jE$nb{?|q?O8vsnv34-uu4qeeZkU_1>G~
z6CIzw{>c{l&wD&)nHx5-);23Ca=S__twIdPPU)su^}X(E4yi60Mbm%TANZty$4`14
zHWc|vi`Q=EY+7<KUs9cHh1j;bMr?o8Pf|F`)QRu)sJ5-yj%qk?>;Q2%u2lVn`EP&!
z{98|y9`?F8qVGL(QF}^Wl+}v7X}Xqt9S#NVAe84-Lk9kerd5$QNKSTC`<5-kDT1fu
zXMqU{vnZE{Ud2LrLvu=}d>7a$^?DsHbki>#<W2wJ^QA-H`>qk4<Q;h#=&43Q&g)wK
zmVC|4RWv8%yTt2QGgaL$J>d1?LDtNb0o-5q4x3eCWUJWWtZV7mYwuD~)5*;6@Nmhj
zkYQ)Iq8ZwTnH#F=u8m(&FkEsfdT>IU9LkqS{#MSsP3#ob`l@#{)PkcqI+d*V@L9^?
z*#KN#0Wl!Bns>ByLJ_i>Q6RSg*l;V=%~amBNUGxMj;30cy6Jm;S&QUN!*(neI`gvc
zby7gz>oN)q<YgG(&-iCsbN-9|RX?e;Dy^UKOHX>81!uEL5T#9<@?W~kYTLcmS$}zX
z`O=T!AN`}WIDYP>v)wKHpAV;AI|#}zJ?cpXvmzvkn$lA+^Dfmo=ph#Lih}D`FQcY)
z(*FBOtJjrNZNf_rLl+zXx~v*&E-Dah+3O=l-YjUw+Ok>!oHnn!>gGUbiv<qQZEu1K
zSAz4dq6Ls}3jOFb%mQIr-CI>HpuR{&2|CUD1)nyh9X_4(Z!@3w(!Wo@w+%i|!RHwN
zB;Yfkd<Z@R@JYc3e>3pG-wF8O?+knfC@g;F;d@MZig?|~>k1ABaHsSr5{?oy(n_1_
z=0~lu<M>znZB*OZ<ZqZ7e290*E~y1`<Em*oltGkdr^Y4eb*`(ri(I>`4|Jfg4;>)X
z1Il6LsPZ6uk1CJA_c8v(<EI;ZCn@C9^zTvSBz)&1-=9-f;P)dt{<7~UIh~`#>p|9@
z;hZLGsvT1D2iQ-m@__QFI1xX}2jV4-ekRz@j5wGB#3;mt$j?%og`4q$LcBna7dY{P
zkA-QApBI%2z{_3Y9n7iuTf`_VgE+uQaGcs;lsc3y&@0rsUZqbts2qZEI|7n>0LJbi
z<zX1FM`7IF1Iqun@;*@h_bX2-GN^t(DDMYAOP*Ff2#TDf>VI4rRE9wLhn2K40?I$C
zj49)w{FBO*G7Z8xq0EA0PAaFA(=eiEl(V2G=alon;R5j_)p0B>=Q=F(too~zN5S18
zMI!86g7a>EUMqPhC=r-po*B%OAuOzY0{X#f`Xks$+9y_%4B>nb`p8O;M@olyX)02h
z=A|Q%(otSIR@Z~65D%j4M2s&cjZ-09lp-rV7b#ufr57Wmmw2fXDP88JS3*+8^Q*!6
z<H~h_dxn-jtK5LP=b)~egt=OcVP|95Y6v@5hfQMGYcXssgk7$~uEem-820laY^x62
zjbU%Zus1{4AFac_6B$V?`FKRSpHe=p{34^kOtMG$Oh5!b;G-OJ+hINmbDRsSe(55*
zRY^0dZnPo^x;G43G^v~*2$Te__=mV&A4Q;91t|auV%Xjxv+8K3!E&Tf`9nPNp5tMf
z#mFcJRhzg4Q+WSc#SvDqSO7e&s6j>+sqWQmn&GH9o$&UO5OUE+pnpK$fPMje!aT9P
zBYD@tD18kQ#ym|7BGh_JOIy>7c)9Bx&{fC}dHR8lJ_z$PzGH-bn#bnmoYN`hn22}4
zb~g2}E86P^Naak|C~#89Sj*CnT9;0@lG>T_mkm#pUymGT_;Ch~e~B;;KBoMdlI6!I
z7EYgYXr89@(xL;<yN8s6^!N#^KK`vx^}^}%9QuSJFxr&mh12t|MTuMr5rJy?jR=u5
z9C|rKM6D;X8X}@@Nkk11aTZQrcs)v_8X^MKnU6$>EO2NiMC3)F{q-o3Plt#=_0w{M
z$XO2kVu;Agq2n(uoPPEn8OW2I$Zvw&e~%R%4u%Up1m+!R*;^$ZTmDgQ^0a~H0+BO4
za^^(NK32;+An+6K2+X?jw;+i=i<W-du|u}l3U&v{v1T||tOBuYZZ)lY{}8X&EEa9z
zTm?T9n9e=caWLk%a)(9El^6^^uD$h8Y6xm|ODG52MqK~up<aa89#^n9R21Wj5Fwu=
z%tiFtd6;JF4du3#r`SSpErH<hSO_l8#?B1Blc8=Zj|;)Uv>W2azTleCxVD<$T$Xd3
z!KDW_O=gj_7C$lR#_0e~1#lX`(&;Bi>3ckmH5X8tix|y?gywwy?wxwfgA=b4+@|G(
zr5XP~RM&~I=9KtEsx|nw^WRqG07&^DGwdtWE#AQ8j!4OKSklMSuxGUb&;wQV4)T(V
zL|ZF4J}9!QdR;cmkBl{{=J#}sYOBR4uWtk9`sRi`G?E$5_@10OKR$nM^8DHPk+G=@
zlcSUK<C7z2(`PS?O=l+Nr{>eX*K<rP`S@N3Bi?awW$CgXUZ2daYqpltbj{h!S}J6w
zzSlb9d+j-`07pROA=T2<AzXJX5RjHevsd41@x3FoCQ|xnh%iVrYZkK=)w*S4%jf6M
z`DMSgrN!$!Mgy_eD*0aL%Hqo9^KhKumEFhkW`!|*P-b~wi4ez@fyU*<Wd(S%2y{iN
z)j<tiLG!wi?r`45_DU8h0*|TGFfP^6KPg(%I?7u*ChE%1yTsZYa<$q~3Py>8k8$u0
z`O-CD;=r}DR~J9L0&Lh;-uDhi8Ym;eI7fI$R24>3>!Xw+t)Jla57YWe#WaRpE$cd(
zZZm*M4sg7lLFgZ`3{~ev{D+}UakT!O`bD9qIdq$Zqi}8IIUI$yku)!Uj7@js7vv4J
zgc&)LuHHT&-;oe#gaO_7gN~LKX<fDCyhT)pgmiXJF1kh@l_FWt)(2jYWSPQoh;yo=
z0?Z4tV^TAikzcqo$QFum;mV$oZ?4LBPDp?jMoGb>se;6*ur1XnXvxf=JUMUz0E?!T
z#5T0K^a*XY9_@s7{P=*3oCsbFXbOl0!e1i>0UIFKsBHt~DN2}u>uQeZ01e$qsnsem
z3dsa5OAJsLdK}bH;UxxAxQ>!cpmrt(<%D1=sL%?mIB{I#O;Iv8)2r}5r5V5vTC9A0
zj<d^v!aU@x!~hjkOjKO5H?>t93{|%|u2xJ2(kmv@gChg-tej4b3(;&x#<5&ZB?eU+
zxm4A)OG(Bo@6sI!zwpN*j%yha%o7sSpm;&{AGS=GpjN`V!<fQ)=rRg_G-cC?b24#^
zDq4cJf$~5HmrQV3G|;uUic-A{fS(6{jzC*<s%&aT5{HU52O>oH6bm}?+}xa;22KT$
zgkd@*P@u#K+VwEc)C^(t8MAz!Bq1MXsK?>3Zo`E4C``q6b2-NVG9aM@1qRk?GOMXL
za+79*up8@O$eNwav$czu;F-E|9pxZGD=bviFy`Fcmilx`9!aI;8F>hPYMLT5;|AS6
zU)5~0K$4a`+`tPALR%+6?udD3fa(#8kRTP^R15WLMCsCGLQtk?5G&4HIrkj;H4%f_
z&n-TKyD<;~J2ot}ighAtv;!q@m~<quJO~deC>Rz9_uol71E5shutqrk&dc*>&R@n7
z;psaUmp^<RVoJB@Cxn-LDIF~9du<q9+eU<oZ@Z5Cow<r-fg{fd-wmh`!rL+<qQ=1<
zWHJHZXbmufHKGq~xgws92Y?ebz)`Fb{I)NX9G(aKWTtD7V^A|HcxlUH+oJ(1qk@YA
zXRfcTTv>*I%G4S9n%7l~xJPW<NGqriu+%)uE80c{%LkZQfP5qfT$t3Uw;hX++TNGg
zZ{k~~2Mo5xUj8hicWG+|s6oMhiSMP-T~(Gn^ug?#2G^@}lU0bm7k6BqCAoPVU)PiJ
zB@5ELs~oxfUr08JTK;b+Q$Fi@9q0vXP=_GeLs%dM7GN0MmI+^%P1oN<M>q+{eZY+s
z&Vvci9H}Q*sfS#ICS){%K(?0Kq?@!1F9cTZZi51;2HPwcq+D*5ST-br1^=$QB~P?l
zd+W68^%*K|xw5rKU+hwD!y$(#AAys*F2Pk&%LFzZttPq_1nen7Hf+;1&?tuw<V+Yf
z)wp6^hY43vGoX7E;;!ok*j*G`)HH=l(3~~But=91%fAYxs&|C%{G7Y8#5aGGw%TrK
z6KOqV>ED4CC0PCsY|A^u)%q$?ZPVaezKrpMu0fTP*iIn+fc-Ace!#O@{?Bkdvk^9A
z9foPZNR<C44A4mNTU@W3@AGoAD?bgKfCyee5tI{fqIA^y@rvljOItVIMRPMP-%R^A
zT-x*PHIRCfatGyWYx80%_7kuxq1VJMKh9(`(I$qJ3+vUIu-d(Yp@|iSC5%-y3BI~7
zK_S6cY7$JAU&E8v;SXVQ{9ySqqGB+w3GcOi;R)fr8d5(hu%r;!SV&;6?+dJfz>?)R
z@Z?MQ=gpeReMm@fJS4$4_9ZAJ_<Bf!boqfUFHrwR?Ud>s6(3EYY0gX&VVe1JArA^6
zPlSa0r7eWqK!9G)Wr*BoAi=xUjJPXJiTevRaR-pyd@a##CP|_cAS7=oFxL4qyz`^s
zi120;8g>9i$ARp+i5~Yf)8imF|9?}mI=YWgO^bu=O6ox`++8G{e1vMrW|wBb^g)ZK
zi=B=M$scd1ldj76p(JxpDJrSc3py3#sY9pGj!?*B%Y03n21;`U7i^nQl+sVHCwe2|
zi83Q>%FI2?knk{*5iySVKZ73Tjkt$Ni_T9+I-l9sp&fK+lr64s9U9?c%%me?9DP^l
z(3t3aCerz_eI43ChcNNUOb8ts7h;UM^YM3u4o!*9k4HK`v9Cir=+GovtK~X0EyOqx
z5#!{$LWgEW=cgi_pW4@<9du}#t?F|fniFD-rUcXP3LQEva_W(A=QGXc)Y3Iab+ml5
zO7z9O%)Z|C>?bCLH{((`7w&uGL^wR#tZi*I(c?o+^%!@-?%!Oq31K3GE}NA0QF^$U
zScSQpcQm}s5nNNw2Ml+tE^D^qdwsmoWxk_@Fl;`;FJ@&;sBXi_E}NbHbj)IFZSt+-
z7;ZPPZiCg3Et{1>qZ`8w1X})1yRZ#Dt^01f7$;RsrP=g2tm-K-0g42TOy>R?d`SKG
zXzaA#F+;EK{cyO?qxZ$oo$5Q>(1F2*hF<==_NEkZgexM_*|+9BT60W@fi-8M`kWED
zOEi3s-DhjA>G)o)8T(yt&9UB(hWk8zU#vM-e1}^zF!;8td7LXE(%HA>Jz8^2h=Dba
zMfG_?So2uKnkVkFHP>`}uhxwHuD9k`?<d23p1dE{OfLm{N1HR{@NL=iG*d-JXy2gs
zXwY#f1{OUL)#({w(UHBD%+&q0=&+ve(WF_=>uoyL{}bU}Pv4gaoqwn2+xfSdZ_n}I
zlHbp__ioTJAqMt571igHV!14`?=^a#m&-yrzSjtyZ<ue#dVk6#w)YsYZmC(qR6HBH
zLKWNi9Ba#yYWHIK3#a~#KTN74hqfKLchc+02lw;8<n=}Fw0g;<ms}#<OPwt(3*u=(
zWd+Wb%S)Xo@@4$%(#s8-0(_-ez*n0Ee63l)*P8`=qglW&H4FG=vw*+TEZ|p~1^j9l
zP^bb0j3`1B>GyX+qBq+{asIFgoJ<Vok9NRew~I${zP>GvxUD>j^QSxDL>{p~QSS=W
zZ*7Yz?p6=s+;z);wH;3UcG@VXf4u{zk>@x>IsKa*IK9+>)4$!G)95X|QBJ?T1E=cN
zoc{d|oUS(D^gGRQsy5*CA9mn0@<fU#r{CRy(`o}wzqdW7(Oa9Noc_}eoJJmd5#{s;
zJ8=5x2Aux$_MAp<ua0v1FFSDhas!-y-2o@^42}q=uJ<6nYa)nzIdSni-s5gNS$=~L
zySL|Ps3=P;%d~h!A7qGCB!ka7^h@tAJ%vxwD;*0zM=$BM6+=(YD-CY>?7aWI@r_gT
z*?C>~%sdB@iW{HkYiXIqoAP8}aIQbyFVj%0lezx>laf4JB5HwEjo`UFOLIR~lA$GX
z!y8%<beO_YEL}5hF{IIc8E;UY>yO``+%H2qJl9{lxw&7)yOZYn#jVZ#!!c50aZ=md
z&RkEOQDe7oGkW9wvTfyKoRw|X?_ma6hcppG3g7a)J?3N>Gk!Dl_NY^_Mw_@zdV7M?
zTM~@lK;3{2%9QI!nyJJVcTiJAgsy9UglU(&e@pfLE!F$CRPWzX{nL6&b;LE!4rBDh
zgPd8|<c=;Ai?H%6Y9p(I*oeQlq!Wh<gN-pTI{x)=!vGk^7-txGqjnua7;b{WQJt;_
zgxE{{7K4TG70kr}q#jzHWnqAUPsPBoFkecGATP!Q=-@RWbN!|fCOIr<UVnzOGE8|9
zjO8re%yc8HFyEQ<uK3Q3G~|0^w|qO*ZTZe*cI10>cYKdE<U6xlzH`Ox`5xbq?}^><
zJ=u`&(cSXBOmf>ZKD8s`)4O7PG~JN#vE4GB-(FEhM;c(&Y(50oNTX)ijZYkcYLsf*
zkQT?DJJ6sjEKUU6HxLJlF`OaRP}C!d1E>Xmcv70BkA*rZ`JTidWk9DSzueZ+5_<B0
zoM<?~z$nG0SW?f;k;zp#xH?r%BnE5z4+=)59)O!#4ZOdcxc*E-z(JV@oVf%|brOJC
zTZg4+&}1gvT$7oRaFb4TPfY?EHaXg4lVjl~bHzP1324~lM3YTUhMNTA+f$o}hmB4*
z*XU?E+-QDpqRms$j)G{pZ6uf;HzWp=_p0R@FdwH+6}xev182|CDRTV`d5Z~hFh3S~
zLQ|PQb9q2uYE9(;&1L|*ivRaM`m@*{c8t3U?Dsybv!K2uL!-N4=$^!D7Ol5rXks@E
z-IGAf!|;|IP49xEdlO;v@iD7~r=MJK3_@clF!FZ!kMDX1EV7|m1$-8xWT_k3(5XYL
zie}(mvHz0S12bnM&rc4B2L<&~rjm)wWMTjV>KZOOYWS`d?|oaml!e0hUeEj*ed>1j
zwJe=D>AdB8-&~Xp0#h%?l-ZO`c1on?bg2-bx*x{5U(OMj7!oEOn6gsPe%AGV?p;&z
zN*Etsx598WTjmen);DG79`o6QvTe$X@`k4Cd?Kqk$P2B8<CHvK1ld4KDt)O71<Vl{
z$A<Du8>fumi(XQGsoQ&mb{jZP>3RjfY{3R(c22V)p|DY$To7J&_&qe`-|EE2%3Px_
z{8-}enW2#QLYM>WMIgS{#rIPE8wXokTAnjq8GJ6x#bw&C>{y#rIKZ8$uR}%f`WYEt
zM^cbM1*wo371=D(Q+yK(PUt0k?~JVR*R24rc1p%J@Le@@OBfV_zNsdHCBwMuFgSzs
z#D)p6EP95VASlvJ$>-HPV1k0+8*d~oKjfXZh2j@7wrOV=E>|#0oV_UrU@Vd+8ttG#
zHYg`4-(@oFyXaO_BQC<y(ej&};z|PXY$|rOfBCCWaibmNc}!jm^<BD3)-|9`1;+G8
zrrZ5-vB@fMWI_qb4s}7L{Q>@qFwtk7Ahsh_O`BRS@C?}j?=?WFdE5!tHM_)KCxq|s
zp@PRw+Z#Yu$b}9<)goY@5|cd1;|{u<+XPWmEpyFMD}!=Hy#+FKEvlzQQ`gOnKu=W%
zpI8(fWJx-9OkRLiXC!tp!S8}Zs95Nq(gO_&Z;T&qX_1b}S8^~o>#WBhmztnRb%xT#
zb-@{FKa<(dWcD+e{Y>WVWir&E%->jn`TUHuY&v8{z6p%1Vu+`4#?Zjd!5>#k>X6}9
z@KHa|w0MDstUBBquW5#XZV)_{t{H^AoQ+_7;(My#ASozo3~~W|L*2A7kYUf!uUe#t
z*?}oAY($~{TMgVQV^M@q(}4UV8b`SV0vaiKk;RsNUIW9Xu3`5K69R}uFftBOh=5UA
z7f|96o`yt8+F$F~U+Z`Wu60m5zQEK@DDsSiE+sLNNuZg_7d6{4txdF0VKSIv1lcDh
zwU}VY^Q^+Is`<bvtdzhiIEGCeOwb^Sr0F9VY}5p)$r58-vrGf|NXbj4EmGr(g!y%K
z(<uQZu}#C%F7zK&UESQU@DcWqQx^!Ef>dD=1ddP!kAY;Uqp(<BE`^;%kV8`MWtr|;
zi8_~!k_L_l8@8(?kDjX#_!hI4ZlQybMn>d<R)hv<#ta_b#wNK3Vv~b)Rnt-W0B)FO
zHMyZ`PPSst$VJ@*hd3vXq|@mEsba!tWGm)6N#5plgYqUn64h)XSYJQLM}0=Vtf6rA
zn+cs1aqtrrtx(YEt}>NV0{3TGr65|!=|35|=7w>vX}Z;WN0GfIAz2{0qh@PV*?CR!
zOg5kxGmuqVduj`&1ZIJTKp1wIXgD`PRX`(C0gZW`lE9bf08^pEE7lTmR4fp`*Gd|Y
zTt(_PR|lk9WHV=~Rw0Yg$xgy4<ug=)YMI2T7OyD6Y$ntnnhx-)D;OeG*#_PqmT?Ql
z*qKAKGS?xN7ch#OMjnjq{B0HHO<?tWePV1XA;dFRlSa_>xtoIb0UnyA56Rce3gItk
zt$@8kJ8O8gg|6LVb2}`w$n&noW>fXkV5%1~%<VK}Rsuu44VnS&z_bc<T?On^G=rWJ
zSkZwj^%*4<=ddoWI^YbDAO-LtxF7;v0W}Of76rn|Fo+RkQzRQwzN8`(&?m|xS~TQ7
zj;6DzB~HGuQOG621i)ue@Tv*`pl7C$a4_*%BTj4)hBIo}4w5+=bOoo4Fa$VhzxK=m
zbuaWj1C2ER-X5p?RT-5Ix3LYF_5*>X@O(6sLh;OOvz9__l!yTk=q&NNCE%7yZ&5@5
zD5rvl%A5(|3VLCgRWx)a36$j!mnC7MwuSyUkPP119F%Mr3i^6qDjcx{g={eiWZApq
z2GI{=LPaO`jS*0jCCD7v`cW_xm`tJ=A?wPEiE%9{Yc$2aftP_|Fw21%pn7rDmZXc|
z+A%=jI*F)Uhk*}~4HgMNFrgQRi0ay6k3=(ENGMaGA;CTcYXr~`TNiL~JqJt)`T#L5
zPa>3pxGAxwAr8;dMZJ^K3jR<9#5D=3h(Zm8;OOc^Ue)ui&f*yGf$`1`+YCtG0^dYq
zunYqL)_#t5Eyq|P5cBemgo|@5FoO44*T58o(<rI6)h@m!(O}gp1v3lIxirf#Pl6L7
zF%JY>L%U+NB^Ax+Q4vfdfCYwBsxC-dkZd^Sudp!(N1Mkbod#o$EVDYo>X7#mj+Dt4
zbNGyd4ns;>+G63yA%KkbOGl2~D1mxlheqy#Wmf8>xQJFG)@Ye;pza#^*d!{Q|MX|l
znN0tn#LPBkL#RSv5e1Xj=xy*~Fos1}vudlS@}eXBuPPNb4YdL@H(}es@~dNQ3hzKI
zE^xx=Q2NzTTpvIy)J^jipX^Bxm8zg*&<=KR+kxj{1nEF{4v!w;i;KNyRGZYTE%wp1
zMeoN#ls{Vw+uKMAHw0>O0x{V+`erGIzj4dRg7R9MSzWKtcVhE%Ke$W7>&v;|BnrG-
zY2okp%+lMrY=24X^bVS?6Dm^Xy^cI&=>Ec~^zpBy5<mZz>m9_qbWl_1OU3#1%{{n}
z8ZMC7{X%r(;*RefW-o(>v|Z1t)*543V(0V~177c1)fpNyhw$Z6zIUXe-UcQdRTr#W
zYWY!X3oDOs&B_m9xj{C9zO<b7deIW`+D))g=)hOLfd78QJB0Hh)5zLxwQ5?9f7d%)
z0KaQ)(#yLUGmnIr83L0V8%j?O;j>d<yvl#l$!>P)Ggxa``Z9Rs01g2wz&DyJk9m)a
z^G4T@!w@|v>cbb3!?3)kK{>yg9A9uQYQ?qROJ8pH9uS403ow+g)mjMeb}yZ4E&mzx
zL1`_2BXSqdpJQQ5`J3<qx={WLJe@Cpi(WO)hu?c7Xozl11=}i1ztQRSVF=3dE8YEt
z(~o}iqxjx5?;wM%Rdju>_zO_sdOhGCA+F;cX(brMYk<Fk!QTM~Y}a<NdSJ?!WiL@%
znE&?o&wu&1-~S!<YI_t6M2fn~BhgB;%Ie-4{Ns;%?+w*~c+o!TW%>B#*BVqeYcqn`
q%Q_Nj?5)(Z4~k8?5bzIwc)`tiJ(%h;6Igb0ss97%@+<#ffdBwX<Lkfx

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..18c107bdd9ed15b33808171852f6ac4c50ca18cd
GIT binary patch
literal 7861
zcmV;m9!lXKiwFqj7LjKH|8sO@a9=SlaBFM;?LAv;9M^SxO5~8#!|^Mz9gnHlTt?)Q
z+~r%OBub<xdPLG8DOrX>EM|8vxwD#`S<izM>!?VRrk0Q?5Y&^8w1wM#xJCiPHBtjj
zilRtb6iEA%G)aR7LEN-Wi?%=?A3<9LMcZ@kV|He^B$u>OIWe_b?##XCo_o%@=XLL$
z$0wUU|A|l4(SP1!S<BqCiM75}N|IY;VreB}ICfGu&9d*cEIXt;XRMk2D}MJc__zIp
z*JMMHuhe^uX4a-99l3()WJ<)g)pcU~tA2vQS*A{WuT`~e&3077fnx`d!*QwX&rW^!
zhcjP#qVSM+lsELfYnQdB<T+U_$y=st$yeb}wC#cNoNCBG-<oNa<V}*59o4>R%W#U|
zN%?sog2K$n1)`U+P~Oy>!a3gsatdu;lM8J6g$~~J-+#Vv%zJ#9=p^UJ3xH2G@^Vhs
zayR8=H(S!2r0){1Y28$Hzwm(9h6folTLf@_!8>l2iIFKI!x`7ok!$bSHBBeueSLie
zvqbuwzLI8Wn`XARth+XTML}P|De1up?XovlAi0}a^A@p_SnI3a$xsiD=IB(i-a{8D
zg%<;Gc?HCP;A+m%HVAEy(TqH~1;B<|DsLrorbUt^S9dhkveYf#YtL9DXBxI+xxmaT
zzSm3veebA|XCSZmO5JDti}hLmWq-*}DD_JHXZ*sGUUS~rDidtdppE)>@37iNuYST`
zSXg-Wf8ihfqtrQm_U^@&I{wdxQ?C&O<rg0Dq`X-Yl0;2u&6_!wY8`Nh1#P0>>e3Y;
z#A~Mfqs|@8sy5-J$AJY0fUc;<x{C@#d-mFiku&p}vA&>|piP6<Qg*W-w7EP7XtB3I
zg-gNt(KQPo;S~62Hq1O>THRJwEx<lUMF~31`GQY_(g>es`nQ(Phv?rY;M)M7r{Hsn
zf4bn)t$Y+d-SA1m2Y*xW!QWx{;O{tmx+yGvrr>)@d5U-~Nb3p?2T-T9DiV$obW%^d
zYvD)rvE%qx{H<17U*m5W8+?d&%r2;Tb92cw9ZDd|vsvSk^qMzR-9@Th#seKF<e>?K
zdPF&{oK!vl-;>J2@O_kj@%ZT~-w_J=H2r&0ISb#Z$oFH)3jBV2&tLZa6vuOtc&$j<
zHBM=wqS_%Pe}MheD-S4-h!gRnd?a2{?Pr+%jEjRwXc&dK6#1EtlW-$mkdGJW@d7tq
za661!{M=D40WEil*O66oH;Iv70C9kk;5aqFC^acZL9bBj+LU&sLpcWHb^;{#0F2#(
z%0n<-kHENn5S0Hh<#AB{4=Yb9GN^thDDOu=OP*Gq0Yy$w^*^ojD7~QkeM-MF0Lq_M
z29+UD{t;zV83W;*Q6@k#XO(lxc^J_P%0<wVX=Mf|oF$&5I*z4fU5AC9RezPzSfNfL
z>|BEL7JgnSc`zsuEyFZ3m?uN6u=WYygVl6KS|=z^tR@j^^9=CFN>4{hdwFRxQrgc;
z2O_0uUOKpqgV7KTqU=nJE+&n0p|&VRRyrLio#myMBc;#sQYBKlz)KfHQpWS8;QTS=
zD!^T%<<BeEq3#8!YawB(R%5L*vDRv+b#_~85^KF4YpsP^7q_)8#af%O)-Q%yTi9B$
zsd+?@Y{Zb<iXr*A5R#X+k?`}80mYI}Muhxn<rkIDFbvExTa{l5kl+Vgl|ybh%vE8k
zb7|EtTt@FIX-3(NRwO_L`#_mSlrsc@5<nII5Z7y`4QO&f^3ViZC?l%v9W%?0W*RJ8
z3aV`HK^}vrdBA4TGD<|*CT`voaiCIhf>q4rp`Es-L2eeQZqsa<>!?|s@ct4Ie9>E=
zr$8@(9s<3?6tTS%IoHB?y$q>ij;06^Y^|oHt!qZS-1Uy=DrAa05kZ$91biClF;+j#
zqjYU<>Kt=X#5-a;TYA_j9rOdJvZia~IVvQqZvIF0^XKbH<xKfz)f446BgZLzoPy(D
zCd`))D!;B|`0<(9^V1H^*pxO}bOgA2Q0bt@Phj=wZ-lC6&(Cn^Gm2=@pe)RupL!#T
z<k=7ssFvS~Ai2Py7eYwX?MPNbNYovXs39cI?D<P?Mv;_5NT53P@d%Px4()`HybQR%
z7De*u5E7_<T8<#O$e~{jA$cWq{Oaua=l`04Jjs##Hpu;tSkdudxZp!z27<=DQwp-<
zALRy5Ab3^~InyI&ZshECB?EyJfOsch{*}84>GVak^jnS{vc-C^J4ll?!&$WQ#Im{7
z)F1pqyf$-f%_hzg_?f_T?y-)8F~^lVEOM^IVEA$E-43OOpjLN;a=>lG^=~`W%Mj(`
z3KoZows;j{<&%WDh&DS16K=h#+;;L5I|!~W5F8#0!Ntkgo51%n)D7h^Avl<TL)<tN
zTrC>cP!XKV(vAzb2*D=IERt5@C#K&xF~CUyP6$|H{RAoeEstZh1(ez%Mr|RXHlM$L
zryld*#A^n(X*pqO#y=3%bz-bL1wN&!55A53w_Z5{Qtn`ey-3~SbzJy}l)Qi??K};8
zUdsbMP*tykms}>=dcpBQkzLh0YQs#)Sf^@!PgkpUT9)$KH(}OqZrZ&AsiBnb$*Gy4
zsp*lKi&F!GqnAd~BU3{o0~h-*UK$)r4Nr|u_4{7yDY5wDdrb^^)8&=<D}H#TGP9xC
zT2|9FXDef=keT{k{ebT^X0<#V0hUKXupp$2xt1xZ)=k^@9#kz&?ZvgpJOMdsG_Uo&
zI^R1%Ya*phmuIG?{o?2A>*~DbQ#25J^^)&3ugtAnnStXJuWUJ$GfRx<4w>bB1wtHK
z1{_!B78Ib(BEX7Nse>B2mgcn}-oBh|Bf}P114@#~K3ue;f0DGOKFxbNCF+VVy2RS*
zb+yV-5=M!G4|4D(`PpS4;>hyFrMc%;fDGHp`QGtJ2Ssc!#2Y*)s`BaNMw()z^~1dW
zAzEK5nMR+hWn4$oZ3ZyH0Zwlx5cngOq3XPd|FCAGyjka7d{O8z4&5N(C@inMfTPeb
z(9eq>WiuZ6C3zDqVO&o2mv5bsZ%YU?z<{oQwW+R7+E6VyXA#vQA)TF+*IXlqL!ZcN
z8{IESvP|JP#I)+D0P~XUnA8l$<(F>vuq9($!m`KZ8>{l|GZHilH%Y>zsf5U>ur1Zd
zYl&2kJkosz0M|?_fqiI`{b#g^?aj_;r%!jwNQt1ufUW>pApCV=5U>HVOl=!5k5a@8
zTvxM12WVg?sg}#c$S1mJSywlOp~pcD6<$|&5?53bU8tR1J#v>IDyY!%thnp6#=D|u
zZuGCh|D<LBJ!rA==}As50}9iSwYs{gpkkuplD(m=;$Wz{&D&~gi9mX5iT<8}Zh1oP
zPYwyu?1sj%T#h9MRT;Tt*|iG^Ml5IPwuE2!V-d%-j7ZBf64ao0LH5sCCQMLEVclUw
zVLh;n!cQk{I&n^PouZ1?Mf*T`po2?vaamN+wYZ8>y$pa~1b>b|J1|u=H6wvTMY{tL
zB7Bks9eHwcQtk&z1(GhqbPAwAU1uokVVbEK!ss(%`Ho4Ke3~^q4TsydPB=$lEVi4?
zItHKt33X9mAgv;^ii#sO{cI5SV;!uqW@mD2`64EGrmkE^If&2-3sp7Ta&mG<eL5!(
zB>UxYxfgyanj$mf2Hl=1Yqpsu2}|y);stu3ud^U`Y<atz>Jf{OAQjzI^V`*k;-$%i
zz)YJ#thlf^{Q~+m5rZ11=dR(t41~a@O-n6foroHZfC(HX9Z4(?!h;G5h6Td?H&bQ+
zl&Txn2*=-iW$MDr6)X{+zIkrpxvLOUS_GdEUUG&0V1eIjz!=&vAY6RIRiy925|#yy
zJSBWLG=&h}kQxv*4swu61%T-aU<zvl4-MHhJRJ%Ehbw?-tQqmWqkL#Z%>v0{O+Z;{
ztkPx>YSMzLh8$9n4v0z%dX8MUy0Wsk0HKvBHgKKORgAqSY}{PSs}Q=>9Lpmb(t`LS
z%s@a=5`-{J@6=n4MMz~oOzc7NZPX(MTX!#h4x1m<)(udIynmPPuhN}XmPfS1Je-Ev
zCA#q{2B8giVV)=1DI96nlX3+M622wgxcFm48*N(rZzxkf=Xy=(4=Yf|AnrqeAbA#Q
z7~GDj-%*>c#0f??3dn-Mq2*`57-+WC8m!(!Rzedn8cQHM%Wlz)Th=ZFR%~s60;vL9
zD;OkSZkbp%Bz}4Sj=LjGl&ym??RxEoircVk719?wm%C8NA&MvA<c>>lz0@*+Oh>DT
zt{wqf*B~#p>1t?{!UwV@jGAgJT32D}wWb-U<_K}eb%WMNX=_naQ(J<Dt?BtWy6{;1
zH7HfR6MQ#ldU2j_2q_Jf9n=~Ud`i-P04|EK_@CI9cZ{p`5>ai_;M>5A@DA6Y%1LY|
z5Pv}aQBHn9vs(PGa6PjTHe^kPX~0Mn|0fJkP4VkouZ8dZa<eNw1C)RWUPBQS6L6w5
zRrq*K@NswN&X3Y8P0Q8N|8<vgzPkoeZ&GZgbnR?jOvQczb|v(hxW&hrY(^TykaA(Y
zUJ+KK*AbeJVSvJ@RgvK9hY}PLe61qEMDYzgc@zE+CdX%r^Vk%_dPR6|912ee@AZ)S
zQGq3dzy?DCd-G6WRRoqOzJ(`W#6NFWRPLiff<qw*zI7-;A;C985~K?ebfJOzH)^L;
z_o(=2DowLzniA6-m<zc_2zfXp<S*_Z<SGL6TCYIlz5of{&054=s!7~msE9j&Y~ibl
zb}dN~#Q-6BM}aZsFL375;fU~d4H|X?M#q8dx(1G0YvDM^&Hvw&Y#ZK3sHVli_9gXz
z3wIw$Cm*3ovUyZ9VEUlN)5T`TgyfHR)J)fB{7{m4pcIvC(+fH&<f%im(2h{ZV+(#Y
zn+8gA1($AXP?W+?uP1sd;)zlNZ1T)KOt0`TBM~tU_&<Xl=B>Dg=@-n8MVL<=>d+oK
zlxE8;T!#j@7*qWbF{a-aIy5MlPeqs?Jk+5*bO;ll)UeQ@AtA=7J0E&q=+LNOekj8H
z@SzUvp+h5Vt(NQ1m=NP|M2sWv3muve%#TKxA3fBeJ#=V{t?F|fniOJ;rUYZ}3mrNy
za_WIF^Qqc%YH8U~9W7U@61{qm+1I<C{luj3c3cY6Vcx4J!r|FwWs|E0j*r#EG46-m
zzqw`&!bAdHHYshV_;6dX40AW{WO$n+xWJqb817hI(QL=}+Igo7d`Amm*nEUv*2=n2
z-KvvaHaq?4n8jB540akLZCikJ8?1)x*sT;A-B{Z|pv89@g>CR@-QP5daZ<%pnoW<x
zs-6@Rph(b2rS7l6ht&Uo#!mT;8Txka`@%e@?~9>3<@dOu1A$cyz4*PxniO$>D<a}K
zwB`d^b4-YVHK(HboD#W9G<*-<XKSwL_(821`QC2LG49i0o`>#>HD}k}<JJrWzAI}U
z;);lP4z2ls)*KUJV9kS3eI6FpJQ%U&;rndO6&*jQH6!2KtvSa1NSNo5`(e%WQn2^5
zIa3PXl|7F!Rb+^MrVM)jS2gX|88qSL!|&D?#0Ma72E$Qv7#GeUvQ3j3y}!;NY$6Br
z28{FV?jXkhnK0L5_a(UJ-s`D)?p>zplRPfv4pa4m8+1&Ffjy5#_4%w=X^U)`rSJ1f
zTS&(b8r*YLQ}r14=UifYj{@nInjuWZGog!Hv5#M2eR)#lhAw~h+`sc@Q#EDLwj(!S
zdab$Op#XQi_Q<_mcU^k5CgNReuB)3B4<9N%2WJb#rDhcQGX8bxm5Mb1zE&&X>$L*D
zQ7hn^wF16XE8rJv1$?_!z~8SG@XNIV{$UtUr~(CyC_=Q+SN3W|@7RsD`I8#jq+)IU
zbdNUdp7Us%ukG4K+?yV4^XGfCi9Dx)HoYU7eq-0B;+FSNn>%jtTf4Q1-;*1q^xJz-
z8hIE*l+y3)LFuzqDE+J5DUIG29HsQTdr+$GOzHRbpmengrQffOQnd=D@9aTo<gpe}
zN`J5irR6G={@w1BM(>Y~Qu@O^D2+TjBTDH%>_O?Lt5EunyHgsyM>|UCKkY&3D^=S3
z>mF?)5Aujm>UtmGQ<EU_(R-8WUUr+V+D=B?8`MSvTB0j8($(#ICqs1^Vp*oeZyaie
zRI!Zs@EMAJ;lqWe@Cki|Q{m_HCA@~U&{O#eJv%*<@5IM{=fU&znS4j_S$qy;7uP@A
zURO7PxA@6G>11brr%YqAP9{4$&r0${fv9;_HGt>x1Wf{2Ns5-pRd0$x&^`)FTj`o{
zlQl|r%6Pl;WM}+t=S~^&;>pg+t<s$`-r+RaDQ=YR?2DlqjHBA+=IHI%88&v?H^Vp7
zDce>qMp@BjeD^WPZH<Owjlwrj@7{7G+%kS^_3lkaW1ZG;)AjBM$96;*za6^@9+WB9
zk$$EUJKT>=8zS_!{s)+L$%i*+AKsvSc!T!g4cb4gH)uy(<3t}uPdvz)`7Q3~Qn3gt
zPoOrkI*5(<i)%Y^s4&<d1Eb@=9j*@m<1L0*3%u=nTZ1s%FoUByy&Vu@FZEjt7Q$CB
zDF=}4(DDQe0}OmL29AZfLca*|VoU%JURN^NX&PaaeFEpzYn+rmN{b*YYw>Qzs<jH!
zo$B8g-Kl}9bPw#8Zl}B}-Ko@`bf@=6_h41JQ~RYmyS6*sLwnLayg#~8t(d(da1|EP
z`(<H)WOwIbbWa|}_QgZGzbfH_`z1WLyUwHss<f(@e+aOea!s)Ns5k^wF4eXnLykSh
zpb9H2WCYY#5eEx5oFLXw)H#U*s0G(}R+^yCn>s7`p2VMwKxZev(NI?xdXRzKRdud`
z0gFw$q}Ce)Bdc<7v8&wG)l;FrM-VEt0^I0o5D4V1tJkUm_5`}vMYEnRXjxgCrOlwr
zRDW$<rUt@YI^_d(3C*y}bd6mOhP%wJ9jHrahFuQV*yTvLOEA72`s70v>Krq|c;DC_
zJ0Gj9^K^f>^W4EioTDO6gNV6}bTNIdiVUXmWy>{SicTL}cKu^bIH8Wtnzv7ycbGE=
zQ)ZDm)RY|5mKp>mS5sP0Yf`YUl;GYc1S}m0JIH;-`g<SgS)AXIp!9wSx+fu=1@IjS
z8r}~<_c}K7$h;$QWBVZP-UQuTeE2F6>n9g4gJ>CwmAqg6<GbDwi)^Y^-Vb(BJjqfw
zGrV-clN@4|Gy^x5{kvW(_);UsPmaVF9n)!Cbd=O}T%pwPJulwlJG>Bv!uVe6)H;36
zclebs%{Vc=>3d&aDUgcO#W%Bb!oULSwyf&lT43tKwWdejENHm`);Q26Pi)PCc^%!T
zP0EPIKnIMknxQbJY*W!tX!-`2>=qL*%!p}WCzD+#_tXptA1T1V1znj4GuLgK@|?V>
z>AGCD%p3s{per^}#uva8TD*rT-=GCaT`%F6t?Gc#rZpSlmW}e{lJHu>FTpAPZZkf=
zW|_WiWS+m|hC<@oXpXSAkl>@K*v_key`#Ra?gi78!3zVOGVNG)tS!nSNWj!Lpdxtp
zjg0R?$;-eW^3Soz<{CZ4*T5jN62A0C*7&<#fL1#xV;}gU9J+xF8bM!^6KN&G!0Rxt
z?V%?&%$jAv8B&6vh&L(Es4$4QB8{)xk@y%P?X)kDAro$vAQDHk<r+qjv)A+hj74%p
zqYN5klTwoOT_(f!%Wg?E;vy`ZEdD{Wxb8qa1dH8PQ2b-4xZVizJSESCc$b#Qh6dQF
zz}WNRna;(<2KuVYCX}G;P~TL_5757+K`mv1*p5^-ZKhXLT{+hPrRH$oT-WRZdzTTu
zJctS&nYK3pt&j^HgtA4zKqV%5lEeLWRB#owa@|r(J#tCC2{Lp+Gmr*T*Uinq7E}ix
zjT9YZNjh~(o`qh=C3fk-S3x3FEc8#|fhvX9?c;TI(kXc{3xl)4I0m^?1Vzf@6fZ6w
zj!TER&0%hHnA;rYHuo&Ip}uA6`U*_?$E5|+A>;B5AY@f!39g}m$%AVy7t~(EE#c#Z
zplk8^5LtD&V_w$`13e-*FI_VTdr2I@P{lW6!B<jH)>%@X(l^yD8)F)BjxN?BYk(0B
zcncewP*<)79+r_PqN!=<Td|NTk`93*DbKMm)5&XK*wl4o&oCjDSOkM)4}}N_rF8)&
z9_(p!l%&H2k;4U%_wa%UwdYHW7opJO61o<=*a6L5zO31fX>FmU3e&-?BS=Uw&t*9W
zoeK0z(gXuJD}^qb00$yU0p}FMVVBih;73*pI0xYs#KH6qQc#+Ff{jP5mRdP6HZ;pL
zkmRI1Z`vZUE=ic3m$#e(U=lk)JZVFhQ`Xh3O$(p74;gu$&t=LmhXUUzgOfot)VEkH
ztCzz5CCEUj!?R3xy+HjDn=4_*cA4bR+2sRwWLA<j^hwgdfSlLXpaYt3Lr}1>OYXGT
z)M7)`bQ~xEH%zmf*i<zqQ?ke9HQfXsIw=qI_xE>8B@+fNQ!+P5;uf##k+=AfsAi+E
zy}pMl$hdq(L*eQ-x^%LJdeT+W@_C(ZTvIu9;TABf6iD+~{U?Lh+%)brPPcsTFtYbL
zB=SUe)Jz2{JFiHd$p#c-9IV1RP+u^iF!MBO!mz_M#km2h0veeNa7^hG1-|tMhzcDp
zTJywFu|WJT7c@Y*ir8<gc1t(OR@PLld<FxWoq$tHXQ%?zGJz8{Ua<yqp%6bbgWy$*
z7*$o-2HJ4;&dd$NZ5GYS+<@?($G~qIIWV@FTPn=D!0Nek7#!^q;+d>SBVc{<hM>Kh
zM|0_;^0HYX{N1u8uvh439WUt6wVP~ah=mq;(bd>oZ2Kgd>V*t5Lk)?Qz))|2W`H{|
ztvp?p0Xr4lpi>1>bU;geMnT0Hu#1ZjIDaIF0elEd{t0*m)G+W^6bPr%AV!eQ8rhU`
z1r><^o+yoI(U2}VnqFB)!A>Fl2qOTWMZwD|0DzvEMwf%R(mJ72b+$Fn{4gWbvK>To
zF<=EJoiGGA<zK!wOWg~-r$J*KKw!WHgR+cDhdbp4u@J$0cs3i#x_C0TRmr+G3&a2j
zbe4E&6i`d0Hz$GxlvBY&W!8kq23%NX84aCD0%h6DWl5N*ZJ|F7<c~MEdL$c$g1(uU
z3di89Ku8voK!&~ij;>i@4n~&+sy9GQmLS<=OGv>`VE&0>ghVYTrqz`^uG$3sI$m;$
z!7K}Afa=9nTaqq=Yo{w}hOCOp4H)<k+F*$R1QU9niKwnD6-hKHh8#2*8WQ9wSYd#U
z*s6kyt3hB&zyrj%9Eo5G;-<v9hLAl$*96Z>EBM2rcXo>@qEJI2IJ&lxQ}vvyvp5EH
zV7#-#HUrYPz&FttEW-eRwNF#lvWyf0F)!yx4!#AN21f8c8yc9Ra2f@*vR1}dG^(tb
zB_WsKlvh%XMT5jV5Ks;MinW?#bP|AyU>X4|5TsIdNrv*2s$>2d8*^~9Ib0B`GUiA!
zt0SxqX)oYNnS3dTPg3YGB&GQsmW><&jnRJT$g!I;P!H_T$X&9`(l#m1q1CWq!#oOc
zQFo1WY!Q`CnL1PbsZ?i=#LPBkL#RSv5qXo?=xy+NGuDc(g#~L`pd^m)zp9kqGSm`G
z_JnQsh*ew1+7jM@T3q0S(V=vzX<S`EE7VQ%CZ8Wl5S7ZHWY7;Xxa+|4FoJj@Stfnh
z5M7#VyP(=++sb4+U75UI2jf|>gP*CT@m-{adkPg<fvD^(eI1s=UlV3zK!L5TjINjH
zE4}%7Xg{Mp>jHauUar*f_mXDl?PIn-|A}U=!*rcck#f#!%0ZUy&z|d_b$@W}U#_@b
z2i_Hg+Ctxn&M%{G#l6{ZfyC}lqI(~=eeXDXmp!EJdPcR@8Oahmr*A{>+SbcX@1WU>
zZ^ZJw6D9Q)5aFo0Am!P*AJx~f@(9(e{1}!SWHaEU?!4EA)`*vgf}H{bU;YCA`&I84
z4vcAJY`0uCEyus(9nXXBHMi*H<cyeyL&Wrg*$wvgkM!bWUO*>`UvFkNSG60gw+wxk
zy>bMHfEC~i*_B7V$HaNHYt~_iJ|ODD7p^N!-Uq9c^NZH;{p_MvTw*@|rAF@oQ5d?o
zL;15x58)m3`AhZ1zW^ST`r<bucMW|D3+syCh9A&{;$PzFx#D-|Rr-ARy@!L2=-yVa
zF*E;r&0aeiPljJm4-)^kFMaltsCOL<wo>uWSHF)Px?U^T0K|5@1FisrxD4%AFadDD
z3EP#8uvVBhX4sqGW~aXU!<jF4f91#CNg)`B7SYPDj(aDfm1ddOoqy*Y??a(F5bqkK
zJoM^+d|hn?s)h9#wA4~3y*9>3xHIjy=EOc<2>8|cMmOuVV!A6#AmMc^%{)J~G@Y5f
zvUp+Y3J&GTrK#sL+e!u8(#+KI;)1{6wM{QB%m{$Ho;35^-16e|OlEEx!KIa{Wd(t!
TU-0j^ZZ`RUQ_G{6hk^hAAEhAj

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..d55bd69ac657ac8bb201fcf0d072b4e9265cd0bc
GIT binary patch
literal 5613
zcmV<J6%y(niwFqk7LjKH|8sO@a9=VmaBFM;?LAv;97l3|KbE8{`Q+F++2=8J_AXD<
zavyw(q$G-@C~7FuM^X<%$1G-do7`FL&aCGlN--2%;y9L&32=zVA3*}=ejNW8hF};-
z3?#t?L4ZS$k3$j!IRv>2E_XrllLSGK1cx9=Rd>(q%<L|ANb62^OpSJDrn{=Ts=BJH
zx@UX%^^UiHZl{I*vrc3+byp|a&fRjH+^G;vE)&Jj<0VzC*jCSmK`P72wram+5B;Kj
z&yHCgIuzMro7Jvnby_lzD@aDBOmtn^A-cU~$0(epmWXZjO1duVhNKv9Z2)n&E?4ZO
zx$pjP@%Nr9JY{urMBBP{Nq#|C7NoLpS2Z=^DqK3a!%&`+6ao0#R<*LQOR|C?>9=$N
zZV@~#Tn8p7%(74*r3w}byRuO@XPdxIq0j0tK}@?az?=Tj+l3?6vm2yDa)z)9^du!O
z<Vtexmat)F%d!!-O=5NIs8Y!;JZbgeMMljQ0o-1-o=_`9$y89_jH#7SYU}d0Tq0*i
zMn(#1nT!}CWm%DT)$Hj?$<*=V6pR#%a>>1+O`gsbNbXivy+iak*4mPF%+rD)8zrh)
z>#2Fl;k*kjY=RmPT*?{pZ9)+;vXUov09Y~0mAmnrs*!lvEE%$-Y0_QW>d$B-rz*Ok
znIOztw$(`iZL3?!Gmy7zvE>bWzBOyVX|LNcu~lq+!!A5;b>@w`6@n;j@}&LXKC5lF
zT4(Il)z!=Y3;*bs*kah(2lG8G{GScCRy!!lE<9~T@@m=9Bw9*uUd@@*>Oev)=yM9L
zuCD+iRwtF;7F(_EtfUiO`UHqz0MLr0?3idkv}LQGC^<DRD?6)FnV_BaRLm?WZ8^^Y
zdi1+s!e#fqds_oYxCJ>n6*W(oRrggS4X7_uRf0`(x!`CM+u`V>-(ij~((mWs(+0;2
zaGc~v6pkVBD{u_K5r+fMDLC*v4hNpkz%fK&@tA|pN$~|@^&qdC*d4%~*egb`m!OeW
z+FTF6Y7JZmKhAf(+SZU~FEu!bbwn>nd3ASPRSn7@>a$bknzTA^OC=MzHkk~xqfmwp
zP->U>gm_H+0(_2%KLej<_y@0Ftn-<mkT25jW8zu(%=tcF6*uAhwWeqG`6W*07_oYh
zwQHQySk<&WM*bu_wTe%QPdhixN&IrKq~2+qoz6HHvw-M_xad1w39@iASdb4El!66j
zu;8ATwsX2KUIbnq5NjYS<!%urzY6LAC&7MdgI?+oyTPu|>iWcfaX>r*{dN>I_ayY)
zaq%hWucx8kein@Xg!n8N|Chw)MFC8I5RCWBU?ne#FM%P)sQI4~hsD!i{3BvQOoH*J
z#Zhq#jDJF$6sJHr)8Y(h=B#*5JP$p3L7WFWSr8Y2!zE%xB*W0;tZA^&vt@5l9;@6%
z2)mEKeGk8{mK=9W9853I4CZ+c7S=uoa<G~~A9jq&#A;$5oR>gGR(i@;dYYHUeWeLr
zn)H>XdFg141d|>foU-WvUrZb4Jh-SuR=VITUE-yee5IFpspu<R<)v#LE#v*Vdw)W_
z3UJqG`E~II)V%_AJ;ck^RscH_z?MAN*&1vTz}^X9%O31v4R$$ztp>1P^<Zm=E$Y-g
zd_-;sh};Pf`FRhK*J@b!1z(3^$=7{KeqH=!@eM|Sd1kNpD=rZ{V5l<49fJ*3nCe{I
zvJ01RbdAVL#q?LizywFYm?p$&f<Q6giXX(Z`Y8fUE=V3w5JQ}hbnA#(F=SO?*^*nO
zTgQ0}Uf=<nMN2;u6`h!Q)rkYuileMzIS+X9whXzMuewjxX|5w>ON6%<gW!v!1&$Or
zO5g~Aqr)80t)n?p!+5;`sbh|&2tI1PswVHqO0e9tx=IpciaZg)A>R%7G}2?NevwD%
zaKqF&HbfDtOE>P8yo1s~4<MCQO(oAsA!999{=N0e`BqZB6aT#KjrdL9b&6l7;QH4H
z8_P$<-x4$YdV1;nf<ZGjv5yvYfpo{k0ebx$R-gKgr+Vr9B8Q$99gH?{b?N-v`+g#q
zJw%{dc<3W?fkUr)h)A_Wwmd|nhD0O}5o786#Si>MDjp(Go%))O$P$M(JVf3E+TZpQ
zdEG+<s$Ud*MCLj4n;s%>d9L4CI)DB94CHxE<aa>tf5?iSaJvf*0y7YF?nWs{!!PkB
zPat?!;JYjN?o8j^y=n#mDFCsK!u%_D3)1O%^z=K1?(xM|@H<G8WyM(2@<h|Q*R&q|
zAXcBcy{!{t9mW~gbsn*egEI$>+pBVJ#NhZr>#c>7JWyK=p$r%{g7#Mn^(I95pn(OU
z{21>*tbCrZA)-&u!Gyb1S8t6*iUx{naTSNxj^cuBG-vSB40TgH;V2F!;1D+s6&J?h
z+Nz2(S=w;{7a`b$nMKlS{KWJdCk8kvzzG3Mte+!=AMiL9E}(>q7~w)fxG{hKgL+^D
zCsrp6o0{R(X7GThULwklQQ%XmR`=7+&#ht?XnBA+_8J`)Z{Wg*ujCah>E~(KbvY08
zz*MaPUUG@ZI|aiALpCL=TZfsFvO~@Mk*-!XT9&f<cVX7A?&_zLsj-x838}@gxrK?v
z`MKoi<i&~f#N60Kay~JCadav*J~ug+u&v&c&f<@4bui)`mo~4g*xr@O%xzhhvvNr`
z?q)OzGE>`XP1;s_R?fo}P<h$|3rfnU+nKVY-O_FAxTML_X<VDk6VRhf^IF?#v8|)D
z##h>LX>o4BF23E`(qeU<q=DFKjo4P_=JMvsB3!3<WzWf+T4qcS2rTa_5Mt;8&{$bs
z6@fR6fGCk_9n{dZG^+>cj^uP51=h$ma1xJ?;G!M<iqo3bG;isoQ&)V|B--86rd+*>
zLoadgQ4ZcAT;2dCx;Ez5mtWolHgqj#TTl2JC?dibM>y_O<<s%oX-bjSkMsJcXnnb?
zDkG+xF%7w-Gk^&WaH^I;kdJ7JRN_Va#Ly-=+F&!eDD)JEZi`?qY;3-Qz0j6S@Zx9K
zj7NA)*hNn`Bcu|QJJZ6w2m&P;(2YOpXlaSumNX%!5y>DPo1GQ5O(ln2AIr<PhhB>a
z0)^uc3z8uL%xi+7Qa3mwymoJxEg9nymVQRKxh342jsTh$B@UCOG7_iC)+8k_$5O+>
z#LzSVZmU`h+mL4y)ACF$+O&M?)R2IjIJ_v(6cBTjze5xPK0r38Zv*8?N|=F{q%0``
zG>8+IDixySV^LZb9ilMw+O46=iw?zcMI{zR>x>QyQHLqFLe8_|=qZ^uMakSuY{CDy
ztN=ggvBIfY&MpJ;@{rY{LsU@#RdLPUl((=mq>|2Y<?WcOz3o_HI5{ND2#NTZqnf?Q
z7@Enc1fZ%t7q6IlA;y^HBHfGN2fj37n403loQ^;Zsuy(svZlfWwd}PW#uV0r$f*2u
zT&EM~So9<{v?y%@^??IiEXs9J$JT--O6}4Geig=Z1Zp5uQI(Y#b`@<7REY3#7IcK!
z*;ye0oH|6JifR<VfTGh>bT7};4WaiLv;2ukR5-;@Pr+r)qZ2NXmx^v?vxWj>KtoXq
z46IdkRyA?tCc!#kKi<KRWj&K)%NGH~GjruO%0V2f(9l%9n6tAD?dhD5j3<OM!f7~F
zEk$6?4Ys{lk##jsVwx~gXA~HQw$6gy5%b;<wIdcGK`SLy%Ga6^rAw0uhcZQjSaD%(
z;T0U$oEX%;uzU@l%RmV1*wv&8);Uq59Vo$oNqZ8@-S8lRfnk9&{C84e0F;^=);I%y
z=gQoL#T6`ZM*7a>)t9eAOzCmtgz%CpB-{mls|{mlThbZg+pZ#i7uK=N9mrG8*bS%<
z!rM|wr^Y}DQYjZOT?I^GjUz)_b{lWUT)^=vU>a*CY^$4hty8l~vRLDCmYS;Kj6zM?
z;i@f%T%=v5(hfge7p`t@uB}37Wrht>=SmXB-lIA`T+2%kx}+S-Bihmq^IgnAKvNM2
zVVK@YcMOe?>hmz?85Dnv+NH2{_u@AYy<6T<z##JW1O9xKK51onL_f^KX_#H74_=*4
z=)<Ql*GYB`d)l-jxq=1>-#SMw{us&nQH%c#W#TtXs{_ZwD%25(`w$>Vo`o6)*D&?#
z*6B)|BM2t}Sr80p`9*LBnl1IZtM`zV&;*Re63EW7cj<#$hUWn*_O?MmqzW4r43aOi
zLNpx`zr20lY{(N8>tI5gR=*<QV_3EdX**9Y_u!C26pz8peUspNsip#(hFn!$D+2az
zLtd=Y)lff&Pi9r<HAz|1uENx7TUOA_5#qjSy4c+m+o=hKOVF|9Qhu2(JQjZwN+s(k
ze;Tx~c7;C>65FazP(vj6l%@XwT9jb%Kd~+A2si6>BI&Ba9|JSS2TX+;C(#X8{Vx06
zoPC#PsrX;sdgddn%Q_TQfu1P-j~Aex;kTGp4}bQ{-LCi&Z~`iL7gbP<!Hw8amE&DU
zjt7k!@1|LroD0+b4U<Z~w*}JCq}WOMYV2Oj#C`&P<=8bbi_bFMOtd*&%9ZtARax!U
zfM-620Scp5RfF#xYS7W(yHyRwitppi2k=Fh9$zY6K~xOuRpq^Zs60n`?|ICRDlFzG
zY}BK$4-OSpM`5wzL%jJAzdov(+*ce8j(Ifr@KA$}20!p<kS;{fg$6plQ9q@&N7YAD
zX_`ILl$hqgT*<?ZlE*zt{;+|P>nPCbU4h7b0TR4hVbom?rS9LTs@sL^;j4;zm?Vi(
zfRNl!VNCc7T==v%B779W!n&Y$49KoSNZcDnVmCMcsTo-f-KVLg1;O^Ebyo^=A4w<g
zp=z?(Eh{j6kb~)Br=ddf#~bRT>oc||$^7Z3s5Lf4CzL$x=yWX06E4|eVQ9C)p4h}?
z-w;Mt`0S2i4}GIpD#<3^d~`bPj7}3iH74!<hojTO;OLZagrD*WpE|UmCN`92OElbu
zl3b0cginp>&xH+*I>M)X!jB%>P!k)%q$xG-*wC1xM*na=_PMa3Nk{lGpYY>{Hq^w1
zCfGVJx1lLVjpIHwPJAwGXvPtK(kJ}np$#>$p(*x6fZNclqeg#9F!i~xq4Q2oo%9Ny
z3ZGL)HVny-b776>orBEWKJm;cpoNcuT3GPPUOy4`&Qz-p$U;bbB$UMXEbZ~lQ$r{d
z88q31wx80&N0JqoyIIG)k4W4fr}GZOr(G+uZrD~oZ*-ME4MP|<AK^cVWlczC-H9@r
z*?!h$!0uB@A8rq@u7lT*hRsGiy&J%F6<Yj3yW<;tTKA)Nr=KKeD$S<HUQ>@d6ChvE
zNTnXH!+XsCfX+_k4mf(P^dnxG(~rf`jmoFo(SgCbj$ZtJdniLpazjKqhu(ZZZw@HY
z_2!h{o>NZl;t$`WkJ+26HhxfVM!9RfIUs%7EA!Z6@#gIIr`(%?!B6DPW84st&Y?FS
z(3=BFbiH}hZ_ne7H;?+fdHgYZbJfNV>dh#3tv3gxpYX~&@i@Gh{*3HX?aq|LPvp;2
z%oG`+&!Izi;LnFYsXqsm=sNVc-=xnt4()r?lA3(H4(&Db19~)*yw;@y@=trEo_Z_+
zI`?T$w{xE`-Ja#KC3l!^AKak>N_72s(r?daoz*hmW3TjMUM=(3_(217u5P*=kp7%W
zbn6*lU6V3|nRv$Y<5q0r*I8Ruq`INYUOM-${M}R?S@dn+2286r=e`u+fz|KZ+x5Vt
zzt%*$Yn?4EOU}!OiZ8?6YH__2Rlb4WCjF&iD8P5a0=^d(@cpoWAA|*b7#8ruuz(+h
z1^nHxfFFki{CzK=V+s_|X9#|zZ#6@rJ9hmze;fiQ6~OtECOB-*xgY1-d*V2I)BQMq
z+62e<o(76~-$DJ(o~X{2cMr~ev-lT#;RN^O`Z@i}CY<_S2I1%QubOarxelj)y*H=+
zeZhWCzuSaUsWGR2(}dHlI-LG(I8LQHoPMtfr@q%(_&NQ%CY)C4aQgRqbL!t8?dSA|
zO*r+vJHyZEKQ!U=^*WsX<KCS5_h|b${ii0JzEub3pPS(LUgY89)U>|9f6eAbKDsv<
ze~X42<#i+D><#Lp0WB6yr=vr*b<9(jA)2OYeB)3*q>2@!hwo6d3tuX{fN$t4ob<k*
zFJ`rEd)~@d7;f}VzHfc~$0yFyck*@PyZ8*qE^hp4e@n{@Zt)X<)7inqpg?1CiOdcT
zo{b1I1tR5HRTA%o8JYyLk`yfw>TZfb&=Cqtu}ZRXiy@^41>EjDI~d&UJSae3JUdw3
zDm^IR4yV~cXQTAsNPyI6kklTVqid-%YHZs#qc=7v=vpqoSy5+lk1)s@r11cfcLVj_
zm=j*i;MVHBQ6~e9hS+qyH^Hff1cTeL>(D`+avMo7lW4FXn<64~P5Vh^UBcl8?ZXY)
zha0pHH)wxWH)#8Y#+ea}o_LW}^LP0`mkLB!VFs;{)j@2;Gp_9fp}b(D42<@FE!+qI
z1~JAM25$SVLGZ$jGdOC~wSW+N>A1yUA$+-$asa7?7G_u&VBnJha4gIf5>AkJ`UL3U
zZ$oAWRmDqk#G!fh8fRsM^5QU-)p#>g^{~8rrxN?(JC&@<cXGdc8<jozPNkaio!%ec
zqjmXC?U(QD_TGGtHRXGJe|)1^F@HzkIwGX^E5a(t?k&S)QyHfAB||z<m+{g4GM?Mp
zX41(zSXKA;0M;|E8Mcp#Lr~+AbR9C}z-tWZh{8gK%Y7YnuyDf}Vh#BRC+7lcVQ4%X
znW68SIvcUA2!AsIot^x4TT6@QMFv8&?py-{7MpfOdT%BtwgmSFt3ot7T;+e*VKmYU
zaFbhZAP}Nguhj({c5N|Avz{nmR@Y`J8Z?<ogljUD^fqZ!4%8%|VUy{QO^$k-%x)j3
zNkGFU$3r$b;cXI}uUVUz2nL%(G;DJ!T$|~Hx6RzaG@GNEO@n5+YebnDHz4NhTeJEq
znyJ7PoxZm0##cLVLhYY5*G`%n%$eOOvy(c6N)E!M2CmD6N(;g!1^Y?~9(_W<(gE)P
zxvyCN=p#Lg^9>nF?}wpB62e&kZ^+R2ei(Yxv6)BahRjXvgSkf&baTP(t46G!T)cFn
zr6*SMcKOLRtuBr1N?P9YoGhYAyBS`(YDEm9m1PAVEZg_3UKmuB9KCs9b@4?<0Fria
zQBuY~?Sk(d!bhY!l=oINbz3gsqrO6mbp-$X%ck>3TlDH5f$1g6?F@UxCH-?MAAeK+
zH})q;aM|(>$@wdb3)WG8rON*V%*P*IIbnU#Q)gtPZ4iJibK-lst57|xO&9Zf3zpT#
z#9)nyEsj_eN^CK+AU+JIj+ilPV14oW-1<UhX=Uxg+zP1hU;gasd`$lX>)86-^^Ct#
zY<0oc7w0zCRzW>|3u~*34#0gYviS1y#@fPSW_ba@Bb##@A_6bGV&6B-Z2bQK6Y2`o
HX=eZcQ0eF%

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json
new file mode 100644
index 00000000..6f351629
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json
@@ -0,0 +1,44 @@
+{
+    "n_steps": 2,
+    "cum_reward": 1.0,
+    "cum_raw_reward": 0,
+    "err_msg": null,
+    "stack_trace": null,
+    "stats.cum_steps": 3,
+    "stats.cum_n_token_goal": 12,
+    "stats.max_n_token_goal": 6,
+    "stats.cum_n_token_url": 48,
+    "stats.max_n_token_url": 24,
+    "stats.cum_n_token_focused_element_bid": 2,
+    "stats.max_n_token_focused_element_bid": 1,
+    "stats.cum_n_token_last_action": 4,
+    "stats.max_n_token_last_action": 4,
+    "stats.cum_n_token_last_action_error": 0,
+    "stats.max_n_token_last_action_error": 0,
+    "stats.cum_n_token_dom_txt": 1902,
+    "stats.max_n_token_dom_txt": 952,
+    "stats.cum_n_token_axtree_txt": 400,
+    "stats.max_n_token_axtree_txt": 201,
+    "stats.cum_n_token_pruned_html": 650,
+    "stats.max_n_token_pruned_html": 326,
+    "stats.cum_n_retry_llm": 2,
+    "stats.max_n_retry_llm": 1,
+    "stats.cum_n_retry": 0.0,
+    "stats.max_n_retry": 0.0,
+    "stats.cum_busted_retry": 0,
+    "stats.max_busted_retry": 0,
+    "stats.cum_input_tokens": 2789,
+    "stats.max_input_tokens": 1404,
+    "stats.cum_output_tokens": 128,
+    "stats.max_output_tokens": 65,
+    "stats.cum_cost": 0.00049515,
+    "stats.max_cost": 0.00024839999999999997,
+    "stats.cum_n_token_agent_messages": 2902,
+    "stats.max_n_token_agent_messages": 1459,
+    "stats.cum_step_elapsed": 6.860883951187134,
+    "stats.max_step_elapsed": 5.8696064949035645,
+    "stats.cum_agent_elapsed": 3.769465684890747,
+    "stats.max_agent_elapsed": 2.946484327316284,
+    "terminated": true,
+    "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..71da24d790723bdd2cda95d05e018a254d7327ac
GIT binary patch
literal 2276
zcmbVO-EJF26i(EpPVD?OqydQ^2?0_ov?g(anurS;AzDOesLBJ3W_QQ-$g?}EnOQqV
zLV^pVMw+`07a(4Qm*Ad9;GPHIJF|}Sqm~PNVb7d7bLRYf=R5xK_Ah@eFT{_3J2iS@
zg&9nXt~jfNk%cIo?JBLS;9FmwRmWyv!^`mCr|?6#@7sKUbH+*NKaLy4({5bYF3w;f
zV{v<Z7yRav=Jc2>ywYmP3laQVSM`!X<&p!P6ta}TZ)8L6SfO*FW-i>DyP#CjjmNx#
z=KAsXwGmHA^Tt!n?dGY@HS3!NKVwciuJAn)r43&%{Ij^=qg<9%?XJt$Nxu}63+dB^
zN511kfh=-|*Ta*~|M-<YzxnmCv{IMM)>WmA3oraeE-Iy`lx@~Hvvb9nq_^8UNOm74
zdxxP;{Ze0XN_cM`ZCw0k!dYc>QMur^X8C*;_m88cc|C9G6n5XgX*Zw3Cls==A<IGV
z%ipH<#^vSO3O00wLOdN>Q~rz)*v2ErVf@^`K}UU^;U{N835ppi>m(M&Xv367bTb=6
zX`MDxY~#I(TX$9K+;7E~GDrEmRK816H<aaQE<A6%$Ys9|FILD==B5=O)~QTg!kOrM
z<A}}Rcxi{)LAOQ5tS-oTapnxq9HR=R3UBZMe7koc&XmkX{PJLZexS9<k=v!mTv@#N
zPG2e~%-qf_5tivvP4UpSu0+XYJV#`?a9pb3Z}Zye;J0VDC(;dBD)Wn58#6haorwUK
zJMb6T3BwP3=Qu8J6t3MUFe`wMk6uFosVK8y!Of_fatdJQD>)c`BgW{cX!AR9ql;U%
zPGJ!-cc@(_I%Qm`@H6de<>bt(bAQJ=ummM~(+Fp#EJoGWCL`#q<f72A$%DV8Q9DGm
zsdPC&PEaLT#n@08S>|#q^BVms&UjHNL1<fxZ?Wbv8&3U-v~2d^DNIFVR*ILTOLc}x
zCo_o<&;9BgMTGemzD@fA{s53DQB>C@pyB}C=1pitIk4UU<x;*v`hX--0&t_c1QX&Y
z*t$#P(X`i`KOX(=J1I0!a;a;&v=!0BWC`1K%sNH%CeL$LNNX!@sHX~HEjpMktVIq3
zx-%x72`AgWP29l-M%PuK_*Y0Hrahe8rhJHU+>q&T0fb+Uuk+=K<90M-sO?!<yAkwB
zE;$-E_(sJFk%yD@#=do;vf*UOFAoLJQ5f=V)rbr|0fgs5BL(*+TsjbgVaSuYhyn-8
zV<{$)Cr^FLDm?)j5cV1o1L8zAxf2j{qHm)~oJJH|3nPr>(qM8`+zpjX;oUQB<=HcA
zF~#ot#ZS;vLrsHb5#ZmvU_PY80;B-%F}U~BWOpy=_1NBCvb&%3ju>VZO}40gbtWac
zjI#+~Pn;~Li4zuPpS;S(>*g977uAdnb1k!M?;yDF9Mnr;HA7<m{}}PBGwkIetqImG
z&@L<NfO3drW+TuxNAsQfB@{>iH;aA^wH=@fn4*>sCvCrdO@b=qx_05@J%5M%lX>tT
z9%P3{S^97{*-iV0$w4pcC8^jyO88N>$M=u=eQ^l2HfEnFbE3ZRz;9SSMolmI7t^B=
oMFNh&=TvaNOg|3)P_8athA-N!zZZ8px87a+^E0r`a}Y-NUzF5}2mk;8

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..6f8de6744a71a3189eaf5d158aa6334e2ab00df2
GIT binary patch
literal 106
zcmV-w0G0nAiwFqh7LjKH|7UMuY+r9;YGq?|E^upX0Bc~G>cRj4u~TZNX!Nj@R2HO8
z0aB?IB~vn(do-PM@{3ayN-|OvQZf^B^3xTP@+%aQDixA*GLy42^U}dmN)<|aQ%V!{
M0Kt^jsZRg^0Nti4g8%>k

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..94b8701c7872a1b71d3f9158069edad7c604f47a
GIT binary patch
literal 8014
zcmV-UAF<#ciwFqh7LjKH|8sO@a9=PkaBFM;?LAv;9M^SAiq9eSv@AK68;?obOUC4K
zcS(vOCCZ|x7d;Z?NR%yCRkyP{ce%5io!QKT<T_Q51aTv^gEmbjNYeD-eiZ0We_W@C
z5xA~fpa{^90zp3-^rHompFTc<z$j3lfP2n;%+4%#$yLO}PC+iXd*|GH&OP^>bI(2Z
z-nlcM@A`W1r3C%+PvmWD!y)!+t&$>}RbuNEV!BSsu&ip}_uX_!b;&GR!PkP(F9i33
z5x>g;P@p9IeOBI~ko|>{>gFoManx1f1b2fGTF$l%68Qb9<LHj7nl7YWAP(tDHMqR+
z!yjM#?$du=^m{pC;J<Q3e@0%C)rwrRJX^j6Nr5{C@PcZ}z+cg_EAj@(%dYCIIWk;g
z@s#{3FhR>K$t7Y`5h!oyZt2Cq19nOSewPQ$3`+Z1)$+BcN(cPWMZ<E4?3ReE>8fF^
z%6V&3&e!CEp%>P4bCqTdZ4RXZkN91ymTCm0BmMv;a#p@f3U09MAGE5(%vG`VInOpw
z1pcL>ZjiZ&iHVX`ArtOIMK|>gD?eT}JO}>;V4~z!jPQb1JI+{1VNRg>ha=Qo-8HC)
z{lkk`RKCEtUIAHPQMKUe>x350>83_D1OIW;t5j>Lf@PCb#WP%8wQaQ)_=7o{6fDzm
zZ4VmpwZQMD1p~j=)L4<PftKG07CZ96mxCL@h|-~Syb+Y1_PaH=Rwanish<iS+-JG_
z{Ejoh^78UaAHpB~QW9>Ef3Vn>;C}&J`ujjYLFu?JX;wu@5Y44uvkD$HJ!l;Q1_a>N
zjjL#G-L&n2(&6{!RfllsL1=;tg|4dRs)trYYxW0;S+F$F-?CZ(oKC;5>g7RTOByfG
z=hVQCE8%r-(T0+63GL`MEsZeK9;m7|P+y{=1jFX-f={Qi4?f-WyWP)I^!sV}?}X1V
ze4YzGcr~Vs(@z@yv+$XM|1o7oIR)Rb5Apku#}!lqaHaGs5^4tschD;Oc&eiz-Rxg{
zH_s8Dec|uE*md|J{sE_?YSzXL%W^4WD8Fu<OV00JR}Bxj^%(C|G01xt2(d>ws2o-v
zga5<IG5CLi|6}^O7XNd!^mFw4u(AOE7b5@oeL+!`f<ho&hHt{5_jq39L;QYZ=@ri5
zNZo`Z#(aeRbSOuZ<Kja6D9<)Rw)vT5Kl37S4)Eh8mSR6gl$V<^#4`0pz@}nGu0G!q
zv*C;Q`}1mHjhNapNCNr+s-{!v1@-h%^BPbFmHo;AP}Ct1;1N*PQRQ(^&~Z@ICzL0Z
z6JXuHqI?ppTUMS?hQOMi1?4=iB*9chlu_kV$_v!KQ_2L`cLpqb66||gISJ}V3!epr
zo>tC)Sk5Xhf=qfqrRSAJ;!CRQ+Irq|+0b$~xJ!BK<$N7bB*LyGxbEZEb;!{WA~2)e
zvNBIcu<E(b@Z6yog%R2&mNOE;dmdWIpfALr;~bibLDL+Xi9shhbSeft$)Ph_+IBj^
znSh;b;Fihhd<6HRa*0DP$Dl89s1k!-<Iv?;U7uENC^wZA<(Be_@+$n^R$k-feih3p
zzoy)!TDhhCI)~?C_!pu4Ev3lwRwFf@P}U&FP%4T^b1i1b{Yn)I^>Ly$M|48<-%>m-
zfc1uokjWY^|2k*xH%VPIpNmv}OZftaevy~_5~uzKhyPXs{62?2Xn_9?hx-li-{tV%
zYk>bghkvyJ9&q^A8{psI@Nb52aAGdmbeR*wi09JXpmYWOt)!b(uUW<*%~&iE==d6B
zVUFV;u&S<ZnKy|6V{-(4ln=ER__)jl#du9shj^ML+&^ZFV2>x5J)-0ljd=bcR%l5B
z8hTNOm@GzPKzC?pqvj35>mGse7JWySIR2r6XXB836GFrSjRIoW{g$n->SiO{^Lq>x
zB0?T(ptldlbUK*hP@Due#axv5OoX4f%WCGZ7Kq>DxHTi{rXKwRMDvzsYMdxCo4EEt
z$F*}Eq<*FRpy`G3W-OiM=`5s6H2oO(g$+fe>0{7UKBGKN(@!c_5U1^Qf_*sr)<dP^
zxJ#GMz4|5%7L|cIsOMZ1WL-Y@>EB?0qsmbV`V4^nje!m+hY)o6Z~pScRSI}X5r~nh
zBH5SE%?45OC%}cRDxczM4bsO%`trGz5muCejw*}bb~<VL%aHC=avc6<2tOKue^L21
zblYAIucyBVW$JL2hS>&o=T_m!E`IG|3~>l1YjoPQh^}JObs`<O1BUd55COU~1lYRi
zUbi)3JAo)DKAkP%=q?^KcGFKU{WL@=#2>JVMTfXIz=?%DayL5+hVY`pU_dX7_~FwB
zA0fRD2G#*Sdfis)Jp#qYyTVJ}e}NXecsCGH!ST?@6CgKc<f7@qb{*;(bG+2u5T1y@
zO{iAbZX%L4;i?W#aGM@*3bth!O_kEhr?t>ZBGd|FITxlaE`z9R`9-JeTEQy_@WULg
zl?X<?TwNuJ7Ai@^RMHyWDEd}uP)WhiVLaVkm7w_vm0;RZ9{U)nq=U}lKDH|HSR{Ae
z+7zo0EOx0!ERVsY>x6QGlzzmcf~~{kwj*3*BoiPQRacS%MEq{><hC2srE&x!5rdek
zZi$Pq8|Tx&>weayuTyV(8`orFkk=40$V10hbq(;rl>Gf1a)s!tB{u*=^HjgrftjAU
zO3iTB7SNill=*`jx~W+k&Uhv}oeg|BGkIxh@xsEz^9$*jsSDHT#hLRnnc3-u*^{&B
z^u>kQ!0#Ux>rsK<#Rzv@S-EyKh_0aK)^$hE>xS;ua<&Rlbl`Vn0)Jm#*C3S+{G+O^
ztK+ybsu7TqP9yEWPXzuUniGQ_j}Qh)=B#3_qS|W?)_if{LQwwGjzq%m9;R`E-ysEl
z_sY`B)r*kM^31;Bf>mKm@0VG0S|Y@CWT0_%X;}f@YywS@>Uoeum-qZWq&rb?u)dN_
ziojzkHG%7h^eaVkIwpBd!y>Qz8IRbtaZj(OQlK4Pe2N$El3%(BO!VAbys`B13b5hW
zg}^@;tDuYs(;VTb$kHZL>ywls%|FTWU)Wkduup7LH8_a>FtiztHnbDlwDc)n`pGS&
z$4e0KIIPqxFZDPrRjF9!gs0~`S2tLJ&P<w@=#)@dH&<RmWp!pU419YKzNBH*CaU~d
zxkP)$oSaFgtDC1~O$V!2YjbkmunKFZ<$KaFU$L$iRXx{|pV2E-%f>}>ER+Hk!*WIF
zbK{QwIzg&PG?rgaj?<;qylMkZRn>4EbxzJ!H|0r03K^TOlgN9nYnf0S+?1i4P(ivi
zb>8)(#3?sm+%R>63z<_!*>Fp~?e7EGtOH9nfxd8SNuHOBo>{=2GNS40qo0*zSubLe
zcKeah6!TW{Svfr_KRc4-;!2LDSYjjzoz1naT5|L>;36WaRNdOh=xKRkLM|8tDy!OL
zUAH_35*ExyD`QZHteQ1Ics9j2F?FzoB@NJ|dlKca019*o_cu-0#4^Yu<R#Bk&nD#;
z<j_u5iAlh7kXJ2Rb4I4dPK`ot^2`Jj6I`m*DlxSYqyTjyCruhxH;S3`m|V3S9a)%@
zKU)D4N)|x;#7>UMsvEtsigU8?a=?ju5n2q%^EGH*63sF>dQZL=R$5nSa`aC6Zi@Pl
z6w@NKjrL(%H8p)?Jd+;FrfE~aim<_v%;~Z0G|!@KY)N?oM~^t++4NW@B)mN~lSdpK
zWCPu+j?NVbN_d&+JF31>Stt**02xuSW6j(!shaURY6cw0l|`Kg=!zO@vh6J>Rt*qf
zy{+EwfE4hTqzItKI>(=7jRkYLSk)a%BO{p<n1gG31g!ktsDv$MLm^NwELGduU`l~T
z1A+#%bQl!QU%&7gc2O||?7Oh^3hoKPSl6{-t5wVs{(c`&0+&Zyj&R}DyU?r<a55~2
zzOm0$^L+oRa~wuczM0s=Bm{&ChKl3KD|H70DPR?P&$wzBcm?JHHe64Ieg_~$+hPgL
zR##VXsR=JYgZk@q-6=L4=ZTgFrVmn(q2`HUFi0=p=o<3ZnGpj@H?_9FFV*{;ns+S2
zb4lHG&SucTQF{hInxSshwW`orY0@A?m;1y<w=aC;_zZNU%q$cM{M$_&cMEBZmxu~e
zZ3N&ZDP&VWSJXFS!hliD)UXwu88P(r&CTUW5o`gpAQHWEb7~P>2k3+UcHMk=VHrf#
zz!4G>qhsgaz6m7G-&$F@zKpb}1qc3eeH7V7ldQzJ4yqgngRRM1n+`34{uGG)Lce>F
zU$YG;zcVXF;n6rjq!s;TgxnHzD#Xf&K%{{Ur>YhxSrKz}fT0g>9qqQ*A>xRxGtX@S
zM{rQGZjK96;UnW_2rw-(Vo8}SHGz(ok(%gi!^_Zx$7GX0cazt3S0y+CK-VL(j;>zw
zDyhI9idsC^c&rzUn@Cjj)WO%sX^#Z!O`mQDn+ipA>z#{ZfRvqSyZ8nc7sk1?Gj*~>
zy{iytI~6c7)YPg4f#SGBc@~yByfangVlv%!c@ybRX4)@L>2t%vxt6-=+Jw~i&Bb0d
zzvrUIWSe*8KSKJw`l<>2R0|$(|4DDlV8Q($H3Vq$4eD4$=NQ1<+gC|`0ll*4ONEjR
zv#%Q*x%@;Y8bS=U{3O7XuXui!i95PssRIxJSHbCNH|d;?m1{Z|>viZ-zGwtThiM6n
zo*EB9@IDf;4cNpJdm&z=SUHIRsb03-VlW%>z%@EBN7I7)UQ3>6lODdoo<C@+c$)^>
z!U#RpHr(+N<q5dB?-AT+u`OWH)$3w=nqn3qd~@g!7U%9r-U4l@=5_lPbmAg}=o%|=
z-}6H3UWzSp+QKD%U!kNM+7jKNDE~Rwo$4RrTgex$U*o&UN@x8Ji#BH4l%=Ck10`5~
z9P9EAaC5#vRL3&;Z6b{E{hmpU5(4^A^dbAboc)kzwR|j^&-w;aR~LHMz$-r$Ezr#1
z6Q1A4?+xMIt$YFCAcD701m#OusH@(Nw?sQ0w646D2LF1Yo%(NkwB_4dA9Wk$VaiwQ
zuEk93=g_BwT@$Z7$7FLWR4EhINcnBNcn7^36WKn0e`M7ILrIJ;>vDX1PmV&4Z$;(!
zJ-m3gF2~XG8lqwVUKidwZ3s_<*loyexGmZJXIrv+vOPjx3nBXlLZOBG++3QvW|QJo
z-EoC=ghBftXJVNTp@>6;H|2LztZ0PKcH<Zms@JAcxx;=4$375U0o-A1X)DYVHnUq}
zM0dr~RAXBg`yf($AUcJLBW=-)Fuig>IS$n^SN=xKGSvuuDPjb1^7Jk<M!kXF6r&gF
zw*0Noynr-m(!1Wni}&g5C6i_=c64cqIw#u6=#-XDW9eLw_rkR3g)@;}_`N+BCS2It
zk#Um>>;)mP>4?DI-4j?#fsG4+os0<Vy*+^mfxQ#y#&k7<u0-(enGwP}6%pS1d%|lX
zJi2V63V+PRmW>2vY;$8zYz<;#0-F~Cn~n)AvnQ|?0y`@Nb}}Zg?4H0{2<$~6u$h>^
zCieu^LSW~Fz)r;kHnk_P76R)L0-KEqY<f>%Ed;i}1vZ(E3GC#az*-3GybxF>Ca{@3
zfwd6Wq7Ya%Ca_a`0%HPuFBZOLMEE)-qW-uio83`tVz06dsr9!bHL*4M3lrQHy3>@4
z#$tVZgVJehhl4kw#gf=V5QwjceI<(6<1|GOt7co4xO#PST{ZN^jf`&BS|g^QQdEtQ
z?~iEnWQ(1S(0yb&wW9mEe4=_%48F0kB)jvBtG}w+>Z+|)OYK<g8^64L{ry;9pB8=n
zv=H1>OmNL-UQI;eOE+EB)e9@Q_}hjA|M+2-#DBI<#}+F+H6{b*`lAEHbhOQTlc!Kj
zvvDzX3nPgI9@)LgQ!9bZ2!X`{kL=V(+N&By(?=c6)E#6n8*TaYN89qYHi8~aF753#
zZIK!dQBkR#@rdI;0gboS9AW01i`>e9O#Dd)WyY84_tFQKU;GJwTuE0xZ*9g;a{2v*
z@Vw{)e=v5Z`~#2f%_H4!b|n&*#iJ<7--4^<^0&KCX@7{{9^IF2tKeJh7JR$ig736j
z@O$kRe7D_#@3mX-{dNofUAqPUzTJY~Z@1t-L<<U=p#@_W6G!^Tosj5hr#Q|J+Q7*+
z;QZ4LIP92V9OqxR#SzC5Db9U?^P_EX#5vzM&VOu+({L^~&f-tEMQu2&8>#iaSN>pI
z)P_U7@ml}8rAah7{2NF8pKWUu$A9BE|GNVYJ0={*`EXktadtS4)7iNljyPi+$LZ?a
zzMeOm)YH?s6K`AV>Dv}ZoNbQdNITTi3}<kMdKR1D9N4ivTW~!8F+O7pM<RMm3&$)x
zap$->mEH%>@76pV0p*BoTei3{7NWB%lEJ521*K1xp25c^m4>4aO@et{G4iOS(pbv}
zCH?k?-+z%lD5)18kmN#`aQn=mMB)rRFaiwD52c4>Iy4$&erV{dB%i^1x>;5RujMl|
zjAM{2g~(g(Ld0Sdv>e4SbaRa%Ob*F-*lT{M@vPU73}NW}Q2n6Tkc_vI&JT%WVnY)R
zq^254ZF4lNUFz)S7)F13NOtT(1E*z&$zXz&+=6tn0V#T%Y<tX^2Fx}Nnr%<;R7--5
zN6@yXGuw(zcsOl){7gDZpz+*V6I@ghM`ac#?;TnT53NyQp@g=|nRmv$Q*3*u*!E7b
zeJoC~#XQ@Y2^>i=k+-xO@3WH)qpf@f<;L=0fWu!r>(*E*T5OsXqw?BXZUPE6Vw_|c
zXsoRf>Ip<_Jg+uC#Pl#EH>?xQ5ApjRhj?W$uF=zUG~?%Rm~Kc8=jYG{8Q0J1%u%bJ
zANoZdmV>B*A1s_7LJ721+NrHl+2zx94RUE<MSk9JPm42m&#$_t*)=_9hnKX>8K<h6
zoZ;D43^Q8_mEKqnbO0d2(|kx|p<_BqaTm|#H8h-N&JQiJEEe@KTEHyc%-u{n(WYlJ
zTN&NyL|-$C5kbVrXIqeGNB^3l)5dV|GK`j;Jv-S%aV-xGGG#XOo<}-7*u)~LoU^GW
zQfgxzkhi<qy2;bgChu?rE^KA={9#*eG<&lBYRDs0Gt+)G&LdQFs{Lv<9-*4qhiFZk
zhbn(|ZYI+Y!ObJIdNT767WpXc$v#9q58ED*0G9*t%US6ReKgKlDextJZj>%3?CVG*
zBF8-C<d($&jK0{?h}3^4GjmsjbV<kvUH6#aV@#$;Lgp{we)^akjX{z;vXuJ!>-!!+
z2Cna9(|2V)sIcI%1@Vv*K$*HLha1|I8(s=BV->XFq>_O@P_;diXgT^cpWEN)!&R2}
zQp(nqlqL%);W~<lAKQu$+lvoFm9`Tb^1|Y0FDl%XsPN+x6Yd-lK6*S@k79mlV!_>x
z1S5gm!$yG-R1pD2;=iq(`QsM*Z4>#m7x%HKuSr99JK}3n#coG?O{&=KNUup1yB*~<
zsp2Dw@V0WY`_WyKI(9v>Yf{IqM|JU`Jn;M;n{24Ih7VJeY;_|SxrE*a{T^=+3Lg0V
zF2pnie$m4>u;?>B3u`0EY&toL(cmiXcIo(97yqdi?|-3X0>6J@l|Je_`UaS8@SEmZ
z;Q#YVe6>$^Wd3k&aQOJr5*gah*8R9c=L3iCh^Q_luUK-SVd)TG_OhYFkV3g`LSRhZ
z0MA{LOR7V2K?yRp4XBlXHfy!)lq}ECaQLt2juTy)M7F%D22!F>d|L`2!B~dt`bI4k
zl=}Q*Q~(elr3|Bj|7>X+q<2AgfOiK4%SGb%MPGhX{-6sVNpX|DVB{KqxeYCeZ=va7
zFCGc}Uhx>9+xsCPf6ekBBm*`;XjrlA+BLdrYRM4!tV2fl&KntDf1=6ILS$Zy7FIFR
z7E_b(WjElO+3Q_^S0^Q79r%JA>aamb^z}F~EEz1ofaqe3UN{84M9Yv9EQ)kf@<kQI
zkNchYnjOjU@)g~syfe}3E>mEhAQ4yKa#N$k+3R_rj7<terwug82IT|~ty38eUhyib
z*(k!&;qr%F;to0SJR5#IzqdP)pc9;7c`4F%=>}QXfjSi!Gg0V-sQ(7B$r#C4072Q|
zn}$@{9^k*Ib6d_6$CY60qrDY)hWQ3>HIzats#Fc#DY17L;d_Fp;IYxp29OnUp-QOQ
z1bV8(Bu@%>*A=drtGH1Elbi}nt<*J;p$C?MJXnTdu~snN@M$M;B}>xquzVS6os-zT
zc|9NzDi-=xI?@Dq`=^KTxs&qsJSb<KwHV}57Zj<^QM&lN(>ZBxlC(ET+M6WpO_KH|
zNqdu|y-CvEB<W)^Num+T!tE8CkU44Da><;02P(dcvke-!(py|0z^j(janq~d!42>g
z;w40~>hkao#yu0`DhPq_t|0b`NrC|{zfTZ4GcC$0OUVnyhFWtlGNv;T2-Iv+1d5R0
z7!IP)fKUzNG{&Nc$*KR32j;x3A&r#0#Ddi!o`WOuDmKrwAR@&ZH}FN7SR!PU=7p5_
zl!3<BlC-x6kfb(qi4hk<o0HI7aXN`6C124U*RpG94MIH_ABR&XoRML4EHCg`Ck}MN
z0Atfp81z+)-yJ&l0pp0Po0<nP*L4U9>8(rhHOmq6x{8F8!)nbf0VQ#33!gKftzzu9
zVbfcK3YJE2A`Hc0F0TUXV4X;Y8l}x9l~U9&!wC@WDYoUUmZ%}KNj6sOR7nAQi5A+i
zRi7)MF-w^Y7)TK+pwm4FNgS+_TRWTCuB*C%dWGVqWmQKuRNc*0oH@B@SSsGqlu4)4
zqf*5JZR9G}IvLsId1G>or$jbe&)Awj#?5I?zN({ejXOyUa=}WH6<q^AgDEPf<XyaR
zjb;j>THg5i(6u(q-KObPcaI`_UB`$<3|Gz7sj}<3<e6;1Fy_EF+dorX)J;%#3u=d1
zvU>+i1uQZZ(pWGk34DJPFcnE&x33XbMS%FdS<->zU8H{J?x?gzYI#ewwH$gvX9O-O
zpOFkS%Mn~B;2A}jibvW*Lua0K9o?}iJGdT1L=Q$koyTrvt;4vX;fP_G1?X%SH&vMQ
zL$Bw{Y*VL_LOk<zX@pImzaw}b<->sVDfy;VA^erb73i-}&#DR#LtkSH69}}yndxjP
zVCxzVwTti`NeMdYCRhfz1G=oGx&->Es0JM*uwnpN>N83zF2LZ8xODk~AO*TK=MeA;
zXkp;7Xdzt50WpGXiey77lvHE_+C+K89u4EHs~c=x1Xm8QQW&?Q1i)v};#Cz2fSp-p
z(#1g*r$xBlMPEF~oGWA0c3dR07&ZmhJU|7w;&k(s%hbKlGm<*1KzEb4UR9+NT$xS3
zkifO*VoGGV=21YcK3s2<2n2Jeew0WBxTVrt5+eY>so)_qZ{;nJv}RjX?C4AqD9dp!
zOTt9$2>Wp%@K~#jNe-xjz7U%V$0S4(Y%vMs*n1f1VkZvXbXBT30$Q>JQ3_iy4OM}G
z3yKj&_<~r~sRsbf)`xEY3w;HNypV@(fab+bTavEOk<+1zd?Irl6dxfQu5f{1B9E~Y
z+4Yr5iAGZpyrd#3!8U~pTu>2P)bemK61oz!0mirjiBSs2O^H<v<M0`}HhorF!5?aX
zxS&Q2QJA3+99>!~s7Apv*f<7!;CSam`wU3m*4a971*U~I^!5upxM7hNjCloDa`ENk
zbYKkcwdNu(sr98=zM$4*$u<R1EazN0!!XZ+6Cw>B2)KrN#ZqA^9zCKV!0c6&1%^~=
zE=XIDY^w7=V>$;%Tfp_mCOSu!Ssr0|$a@JjW$~3r9^Fzkq@-&tRxMou$k>0W=Gehc
zv;!xixl6WH*&@Xy>@`en7)K#3>aLNG8s6d#8AI80Haj#Xv2NSYhtPzeM`#vt(A(hM
z=M0N3#fD3&U?i^azpA9wOtk{D1kty{jS1JT3GYCAT<C<+p$w^$xHN#h(6FpEKCzHs
zRH}lJK|R>OZ57Xl5u^jdb5whTuZ$0zR~@otVSJD-jNk4Deb)QI|IaO&ZN!F~Ty>#=
z%$z)ZotevDre@~AlI>c~Fe>zI^87joj}iKVc@I2>#^FkWzbH3HA0Ofb*XDZs{g&rO
zK*~A4s{rA8aQVgT#9tWSnNE2AemsJVW|Kqi`C+ksyxAxUkl0~px*2yb@DIX}3Q;k{
z@J3FxR~gF^yQUA=_Xk$1?)a27j&D2*{6iIW6PR#SL$LBv;)9L^gU7gL@B;`p$wt_g
z#5sQeyCt62fDQ`{c;{RA_Xqv~oLE_A&he^vHYd36AJo7JTQz!CjxlpA!pu1Izp3%`
z%s4(y)A91<quuNz_n^sY%i(TFph&0!2JmdSKJK3o*Ub*(fr=gz`O#B(pq5WGf%Btz
zkb6+%ii335{=LgTB7l)Q;uNJ`L-dxgYhUdwAA>e19p&TQ(PO4hATUw>1pI(4l%MRT
z50pEBWrX_uV_`*fvo+ixyyo@zgE$`L__4v@^10(*dYanZepa@g(RcdY&qIdi_k*v5
zK_1WJmq3X(0e=M}0T;ZnQ{S2GhxucUy}9r5!ViCZ@vpw~+?)PkAs84owqzcPXIfR3
z_ucW;ulP?!@<6<}gig;m;%k)arTSQ%G0XwhkVs|UO`ZFk*m#Z<{>dlHUf%D=xR-T-
QWiOxlKNAC5HlKt50Jn^MA^-pY

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..636120ba7912abab96d636c7994216b2746b8980
GIT binary patch
literal 4893
zcmV+&6XNV2iwFqi7LjKH|8sO@a9=SlaBFM;?L13x8_9M2lLD!)wOVD@wnpHt30lOL
ze~HvCMRFzXNTfnizt{?D05ph!1~c${_}R7MILX#ZN!zh&CzqUDl~fKn<djsrmmHF7
zs#29(a!!t^<dFO3z3!gD3_jA(<~leCC=6y^zxVq6`t|FX27Z6w&;E9+hrQ^#1=BdN
zh`D!EOOeAmG1VH;Z7Zc2M%|?&J2t6r=w-wG!kzrF``k^?0SiE`+)Mk7g2f=GixtJr
z*N9~)d&F{|x(SA78X9rwDaEo>%T{z7&TU{0=e4?fd*$E$WBr#`{-I2V1))nH-cheg
z8<J9!jts|?9>A$bI|cBfqDvrO*)VI;0VznfVm-4YxJ2}n^azAtI2%%hXmtci2dZ89
zz;!^J$|xOhKux!Dns+VT`)1_~om|xni%52bNF`O#j6JDf97=^Fsi>*NXR5x(n&OA?
zl<N>WuxBWmTe(0-@g#2)s-$SUoAj(vCwjh)+UFfpLle-C%Bn`Lr_<?*Q6p(PT~l@S
zz$hf^nq%R|1Eec<P4h2Uw@EHa3R_%-o)2)hRa;{wrsr1?Ra)ax?}9B5RVmu)K4Hjt
zRWFf4m%gPtwfa%2XqY5bb2M93Oj9{>=~&(*MMJl2(*Y&EaOogJbm_2O;wWFZa?fY(
zYHz{)+}(B)a<AO`nOnI+2TS%*oghh{y5PQi!CU)j?~=Q@x%u%|@L~tK$94-ZS4Vop
z%Y{qY4+e58Z_`-GsCgE|a5+^niVh1s&<+8k9^k?DT@Zv0GTkxoh6{>C1oSMZU<1-!
zMc;EUidfHdjOaz91peDpYCzLRN9s-i47O1c2qV@J#Bt5P9xj`J374S9pl*~15AD&q
zVgmOKW=aTbp%=XS<bHS$vSZZuH`(zDeEQ%$4)6E<H(pK2NjU|d47_vjnUZJaIXH&i
zgpQzmyXXRtNIoUU&@;fam$4raXT445?LOYIeT&DnHvsXj{Wqa!tcp@H4z>-$W<sIe
z2327(I=HWB4od29xtT*y?g22~kbG7?FMk6*=jBWAxhy_-{(gtgb%y;uJD!(U;IkI^
z;Bie>;JXY*ML>=OJn$y;6pHguNSA1YZ6JUz@ULF^g8a63;eE;PHA7zUwaCAgy^{}t
zK8&#u`nn+BZ<d8L18lh(xcXr`1=<}5JzY?W&xl^y1pOi6(HVX6FgRp{1=FZJCZCqi
zfNRcy*)D*4F3N9#OWp?8{FeNV{4T`tx8!d_982<5ISz6A9=PK>@&p7&LY|brD_;|l
zoQ4?A$}=pA2jzKrL0$y+-H?~S5I5xyzzRd)z+3Vvp)tj_O|{_Id;)puK4o$Z`!dA{
zzmCE6h`4S*F8UCUGAJp>xe}miv|bgh@eqdu(}%Yt0(9R2?Kt#W2$~eoR0x_8&};}g
zEub?Y=$wGgx9D~wAejfd*(5Eu)2#sQy8MxV-VQ-O7En0^y(gfXp}u}w-j;XdUHO6h
zP<{m8kL4$#|KCA8`FryBnO7djzboK*-&5;=`#>%Tcrno1yYe2iRAg1IvesujfKSPq
zPy9gE1)&ju{XoFXkdW(4Ku5Ig3&9Qo;qpnK^9S;C0sTFJ`6I#o#{&KbP4GVw@Xwmy
zp9}blCiou<_@6Yve<I*NZG!(yz<=HZ|3biB9}X$YCWkgpSy<D2^wh1~!OR*{^}5rp
zVT?7bRtOCE5;S3pMb8*@TQ&3@qQUGNfL|1o>6(~%`Lq}AscsRcWO#|b(QuA8Y?Odb
zEvryzg_=iIixn(NK_jBS1k9qCCo;sM=ZcPr6YUNZenmF7hp0~(rn;x<&2WbfX$sVN
zqJqGD?$6Y0ipB|cLY^da$g+>LU~1uSXebztUJ?yR)N}8Dd+&YNOBz@5zqY-Qe-S#*
ziSrzsSHM*R=&DW=OE4Rsg94Y8nyYC1L)!|Nrx#2EC5WoqTV{!vmQXA(`HmXpvPJA|
zaHOx@uP6AZRfNi2Ys;~dVLGtv@mNAx3(h@3mJS~Zw~RxtP~(B%xxeyHSbBhEP4oeT
z3DNiL3adg+bN#lM-5z5N8G<}P$e0)ikJJ)Sfn2c0;gCB--K*FxNbM+e7>Z|b9MkE<
z3jTJLJsmrMf(q7Y$!u;m=Ta#<{n5<o+RFN^mCXFi+H7WZ{?>eUadu^KZZVTtUs;5;
z*0@(cyL5mv9=Nl6@2(qM@#XhbOD(9HY9HlI1#*H*d$TU>FQ_Fr%eeII04^9OZ<O;j
z#e8PD^rB*_N)p$2P%9Trm1Pu{_PF#MYY9OI?yRq@xz(XwSa1%Gvynl2W8k6PjorKJ
zaGn#5BjZJ*#)UpD@i7bDwk-*G+}+reK{k_sDzQczw6Ns}9YMb7qGh4NCMkm?sZ<)*
zJ?xNTExpsCr*W^X`U8iUM@dI*oTb1!0)0lH4@e*HfDl7FtJ@p*cR>uxEV}e;sDmmJ
z%nE{wUQ=m0wLi@`vi3RAeyz2C5TBU3q6v`r;$-uJEdDyN8TNv}ey0UHS%G9@ajZpw
z^%ldb)eJrDsCma$HIC4i%?OOX7&>cb_X#?yFPr7y$767ajl&8_kv@e*H;#<!QZ|#R
zAKrlFG(^2}bX_WFM)BDV>3M8itgsuXienqnRkc<(OkCt6QVK-$@yamfl9u`jL9WO&
z)ZREvvZY}`F@dJ8lyKd5UCPxDrD-Jc1)FWKfI`zYbU=rkp{Y7_km<}_4E-Q;CJl@P
zn3Ui`5}eUCB9b4|ez46xh-4BNI1e_YWvT4wMH~-_lDa?nX-tyTGM=(wpO{SX93|b9
zGLzDKi3wq@iOCc{Nld_yvQ6V?V)6#iA|a{O?W4rx4Jn<LiW-5=>L%G&4ab5L17_OV
z6!ak}`VkPkm*SG>D#XG@3CLp4V@$##IM5~{-?R}EaZpBRn^>yfoRF?bew^$Poj~Rw
zj|{V9C1$1;CZTm=DGg|zluEr0i;V<wfId-@Hp(@EVm>n^)eTEU5w1(0)*yr?ieP?X
zPE1LP9lSEi*Cj8@fh6`iXbh*fjzGN$49khh=hAb((}qhYCcl??nqoO5#k~l+u`z5a
zdPz+rvze(}hN%KkgbD{5XQy(rqKWC)k@Er09%06dnW?PL_|??h9$|8@4Ggaaxo{vj
z;Xe0w(0!AV=REKNGz86#J&V9(ZpQ0i7zm(n7E2!BD;B5|k#CvtZh!$FZH;jUtU$gA
zMgVSXNc<^R7Q$t{u3AveC9)|92itTAMEUc{7;4O?Lg1hoN~u+0#(~ZJ9uF4jFe%*H
zT6=;+)SCkO*ESyFJuH~(1`bT6j%{AP?*~qh@|flb_tN?Rh7|(ld_>HRBeqfy_uKlC
zFoTMlpF`Y2z^JGxIG;RhBp5gYQDO8X6-~n{2p5Rq#+jdY08%y$eo``(y*(_V@d7-k
ze!?p1(4<=+r2+_jmVq>-Ks1d*hJi*cp?rPWc~&1CA?;pcxG4q8&|neX$eN2;%xx@`
zA%$jHMzxJPj8k^1k+Ln)p`Y<}Lnn>wn4E_plwAZ2VX?<;@^%MfBr8OLx)%X>PZye5
znk%b^AzQ$trk7BYzN|O%4eZ=s*#sLj^@S+jwAc5KzW@*2da%2@1(-uTG+nycm>yo?
zB94(%!AD7m#3QLyDUcNgitF9Dq=T#Cn%@zleK~J>oeVQXUcqn9$gaR>L8`1*VKj+h
z)s-S+>lIVI5W(ZE(^yM@A<eW)mxYsYRtEQ}deVylG20#b0PbObV$C(2+j}3md8252
z6UOG0q!SoN(!Odd1XDYVDI%$u!%I#r<<fXCl!Y%t;hofx=`57NpCs8RgRsnaJ-(ir
z8D%aqI#xcp`N-%8i0*~m#_PG07WeK!oov-0#L!cF22_Aai^=Ro>iFxqq7>7a$oM+)
zpUy_dXZ%HA!RqU@a%h``G;ZK~H_gSvpCO&!TdejX|6z4chpb$3Uy8)Yo)7V=eT)SH
z=)BD`rZ+f7@pkefQdq%s>(E%SV#2a&TM$<dk!_f?`U$|~pE-0u$6L%k))^>m>yXk)
zJ8ZGW@!Br3hAnpY!Bawz!+Haz#geEk@PU?hr?O3ob$q%2TJ>`T#2UC!(O{Y5K+>~d
ziB)o6I2~nTDji>8hmPqAJ}=^TUHnY-3h4w!^-tmAg+uUukZFL3w%Rb;j~Hbc>N1PX
zO=0OS6b$f|qHmcGU?i8J_AYUZ7mnjo4>M}7B@!*6BgKlUl{VPToGLC&6nakF>0aBq
zCvJAjeT_#w5th+RBpY}!hE)gqqGv=nZxh8bbn(233x3+sSx`cy?wj5he^`j`%dAxQ
zgYA55aCZ%0iglgp4}%Ep0=~zgBjO>I7;e?Sgaer15}Kg;SBNyw(BqP)$IH&053{OX
zEk^18vBNZfb@Z{cQT;<ESLdO{L+oo9Q(jyXr~2pIHh<^4lp8Biz067im?j7}SwB4;
zxc7lIA{LGfJ6?WcN6(Izf_A)u7guqX;C7s>{xh>IRMx)n_;kz$Q}wSIoj-s^b;I6<
z5lhUL(f>6KpH>~%^v|X{ILzQ-77ufHU=0g+ScJn&1`k;;W->ty=etWoZu5qQ3%tM(
zl4paGFLa&c^Pc2j?VKx;+Dt%ZfOebA3!cne+S#Cm7rWQO306n4;t<R_?FGnOKzyt-
zyG;DWu(1b&(q_9|+WBB0Pj|bti$UAwx;M#zbkkm{xE_?b>nw~`^wZmnC-aS9RLpdp
z%+ukhSPEKrrhB7;OS>GDcDCE4y&05tuG^*kASms8w@dqBP}+rVmv$&9?P9k}yAsS`
znQoW%R#4h(w@bU)SczVFvl8`IqL%}=KbV=Wd8^Oaz|#Hd8-sN;Skg1t9N=2?GC8zO
zB~Q3``M}LS?5dw@O=FF1B<}4G{zv?>V4zSi4#V4#=&7Q=3G5{u3+*U*>9D(3$oE=*
zPtR>{*LZabt~RTwe)Lxwj}E(Y6AAHBRK&|s5x*4`@k&(0t5FfhqawZ^6>%ae;&-DW
zPDMpb2NAuPVThrK2@_4fj)?6V3)9R+pvg7SEWCz>@0APFtiDF`7arp6sE8j&wdHnH
zj=NDgK8%X^Br4+fqax;`A{L?|mZBn-qas$KB37d!YEcpOh=>mThIokLXYYb{7aMm=
zT6SJ3^4)o(EGZ`@a+!%qm!1!_<%ww;rg%y?22)iX`Qfj)+{(8rSMldTmGR(@gA%l_
z9Qa94WvbH;f}|IJfAK^1gP>vjA&`A&yN}JYJv~coqY+5D9M8lhHm_)8IUc_mla_EZ
z0B_3TwY0?MG!Drzh}80s3(?XHk5Oo<{)`h$$0gh(xEycZBN&%pPF{{THW<bwd{Vg_
z_qG?t(@m^qnpwSKOJP**eDfsdKO2`Uv)CkQ)#5ftbIcZ^xhA6EHpEv`&Nop;*r51o
zh6^1THg8>gHJ`;!eEdy~ucpstf()AXJGP-kCkav>anhU3j{asxW-PQ&tDVJQe6#!U
z&F;rHyC1t^_hTrtEv0cL#gl?jIuc`cx@oqRme6jz4JJ7J#yyhFSV6Q|j>hcOikAk&
zW{NpZfx+6DpmBnP&HEge<J=E%scF%?92dJC<DxT|*Vryh*6=lK!i-D);uds}@Mc0o
zI2P5*@!!x+ODHPDe$?f7GTY*)g;q<P@K(&QEm;3BN7NNvNU+#R0v-^a=LBIIDaE;%
z%y`k439>oCO`A=wWR1)5Ro=wwHO>p77Wb?9ItG=@Wm~0t-TqFWO+e)k&s+zdeD`Nt
zawfJgB&X;&Aam1g{L*pXDEC!Ug#TKbNIi$wg*Y>wLJ{_n3JvR>m75JJ_nQ5VzCOVn
zr=2rdZZ0}+a$Mf?(Ro|P<-HJ{_rY;_FCIhgh`VBsDZ+H-7$O{3zv=8T;_29u=Z?WV
zlH}eABmtQ9Z^o9`4-s$1TpAObd2ta=6M>Dl(nQO88VeS&H<OOmO-CoCJo|Ow<6jQr
z>OH(fZ(X6cS)lvtbFVImRCz?JJA9?1)EolY`+8Gt{L0nF|8uSJD^?nfS+l#Vj1#Fb
z0wuz6>x%%XS6c)si`F666;(x~3ZjkdCsH)D;d>$lLmR#)QY^ILdm@EG8@|8EBB8bW
z6DkneFg}^$pbg`bDGb7Mo$JsclN=~!De(L@W-14H0o|lA7$h+8@L{$4f}XNr>Myd3
zmvl%xNB|^d4<9tB_=j=N`5!RY7Qj!{O{1)8_%OWEL(kx!%I$i81!pz>TJ99l_w)Qu
z8`z(;eL4H{m;67?@KMAeDK<6!3q2QZG{hgveYvXq7kwwtX6KbMD8L?F;Qxg*uts>F
zKBcC7MMt?ByfeASQ;T8AJx&4ChswoMGeJ*puRmJZUd!LUyLD^jE|~DY|NTF|`in3B
zhMwPEd6W+~%Dq1N_WH`s)+U%|bZu*M-9vamW9#=fcDB~m^BZf39@|~nkr8?AiTlEF
P3aS4CQjX$dzfS-F8Y+9X

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json
new file mode 100644
index 00000000..351aa01c
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json
@@ -0,0 +1,44 @@
+{
+    "n_steps": 1,
+    "cum_reward": 1.0,
+    "cum_raw_reward": 0,
+    "err_msg": null,
+    "stack_trace": null,
+    "stats.cum_steps": 2,
+    "stats.cum_n_token_goal": 10,
+    "stats.max_n_token_goal": 10,
+    "stats.cum_n_token_url": 23,
+    "stats.max_n_token_url": 23,
+    "stats.cum_n_token_focused_element_bid": 1,
+    "stats.max_n_token_focused_element_bid": 1,
+    "stats.cum_n_token_last_action": 0,
+    "stats.max_n_token_last_action": 0,
+    "stats.cum_n_token_last_action_error": 0,
+    "stats.max_n_token_last_action_error": 0,
+    "stats.cum_n_token_dom_txt": 1257,
+    "stats.max_n_token_dom_txt": 1257,
+    "stats.cum_n_token_axtree_txt": 75,
+    "stats.max_n_token_axtree_txt": 75,
+    "stats.cum_n_token_pruned_html": 658,
+    "stats.max_n_token_pruned_html": 658,
+    "stats.cum_n_retry_llm": 1,
+    "stats.max_n_retry_llm": 1,
+    "stats.cum_n_retry": 0.0,
+    "stats.max_n_retry": 0.0,
+    "stats.cum_busted_retry": 0,
+    "stats.max_busted_retry": 0,
+    "stats.cum_input_tokens": 1594,
+    "stats.max_input_tokens": 1594,
+    "stats.cum_output_tokens": 64,
+    "stats.max_output_tokens": 64,
+    "stats.cum_cost": 0.00027749999999999997,
+    "stats.max_cost": 0.00027749999999999997,
+    "stats.cum_n_token_agent_messages": 1653,
+    "stats.max_n_token_agent_messages": 1653,
+    "stats.cum_step_elapsed": 5.879024505615234,
+    "stats.max_step_elapsed": 5.879024505615234,
+    "stats.cum_agent_elapsed": 3.029170036315918,
+    "stats.max_agent_elapsed": 3.029170036315918,
+    "terminated": true,
+    "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..3399e40c8abf6f4fe97d95219df80f775fd8b417
GIT binary patch
literal 2276
zcmbVO-EJF26i(EpPVDBVAuUK0Bm_vU(3-??+w_V?h!znVs`3D%+1;@{^6buPX4a08
zkl+HTk>;+$1&9~n8F&I7fqNc+@60;Rj|vy~!k#&E=FIu|&UgI7^`HM-U5X$7Zff+@
z3NxG)U2#$gBMVVF+f`au!MDCRsSeH1hS%Z#kKy}p&$syy=Zurke-bx}r`@=)U7W!}
z#^Uz!F8Hly&FLXoc%#*l7b5s~F6t%2$|ZfB6ta}TZ)PLzSfO*F<}Tb`xS&+gjmNx#
z=EmW7wGq!r^U71r?dGY@CF@%SKVeQguJAn;r43&#{k^>8qg<9%?JmnVNWT=*GwJiC
zC%)rEfh=-|*Tb{Ve*c9&zxs`dv{IMM)>WmA3$OfUE-IyGlx@~HbLWCHNpH8;Pj(+A
zdk3LT{mMXbN_cM(ZCw0k!dYc>QMur^=lOgQ_fMmxMLn<S6m~zjYB!(3Cls==A<JR#
ztKX#d%H`GC3N~_uLOdN>Q~rz)*v2ErVf@s;MMndj;U{N835ppi8zdISXv367baNXc
zX`MDRY~#I(TX#|G)NjX^GDrD5SH4A3H<IOeAv|ro$mL)FFV@IW=4KTj)~QTg!kOrM
z<A}}Rcxgx4LAOQ5tS-oTapDZm9HR=R3UBZse7k)n&Xml?{QO{JaiF!yk=vE0Tv@z%
zcOaD$W?^TZ2+MS-W_V~vSEA%Ho+GkcI4)K2*Lm%9@H_L{Q|U%5mHFAN&AA-T&P9OB
z4fu=fgwcn-a~PLb3fFEFm=(as$NxeBsVK8i!Oggvath$imvT7zT1?PU(dM_}N*A|m
zox&nw?ohi<b;`I>;V0VH%E`G`r~Za@U<pd}mJ!a(Sd40*O-9gJ%SEAMlLvoIqjrdB
zOX+fmoS;gwim{<GvdrZ~<~90NobaMjg3z`e-(t;UHXQpkY1#b2W0;D_tQ0Rvm+B0Y
zPG%Azp8A^$6cOfM`8Mqf_ya(qL{VLrfQkcjn>V2q<-mFalymtS=>w8T3BZl&5=@Ar
zVCxo<N7G&l{&@16@1)Q`$)&F8(so1>lO=4|G3yl3+dR)%A+4>rp`I#)wdi2FupT)K
z=+2mQCLHbfHgN|V8eLa`;$I_;nD%gVo$?{dNkgWiB@li&xy+Yqj@$8^p?2nF?MBci
zx#Vcv;2RYyL>`XrHukL(l?_KLesv^xj>3>{H;u^96F_(_G*WPT%B2G_7=}EVvnX(|
zI+0=;dGgq|tkP4U0b#EbF(6J<lRE)HC;B#?#%V;cwJ^e1E)6DE#ob8B6yCkiR-U}T
z7E|n=U;Y$5HPSR_mI3~)Gv-4|EI<nI9)f!xB)fY_ugCWGlHL8J_n2X3(PWF-7iUtU
z%Q%|=_Qc6@mN;Qy_Su_k{M%eZ<D#0eVJ>Bs^$vmyFG0N&Rx>2_|Bn%WbB?`Sq&301
z1=?kW9a0XF%xnzW=4ifSzk&h@;AY9MqqakI0aMiS;i&C*E=f>@T-Pofz3*?3e=-mL
zqqM)fpA9me^dI&gCH;dznmj%j^pXdU#N+<{?nAzRz=LmN_K7kl>I?V%rsWgV^qhY&
rJsMFY;23;P1^27;<M0pV;_`L)yxsb1`A+BBd&_@pgKb`dFuMN$a-)eg

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..ef19e47fdb4ef0c6bc2e36d001d8842c035bdd87
GIT binary patch
literal 106
zcmV-w0G0nAiwFqi7LjKH|7UMuY+r9;YGq?|E^upX0Bc~G>cRj4u~TZNX!Nj@R2HO8
z0aB?IB~vn(do-PM@{3ayN-|OvQZf^B^3xTP@+%aQDixA*GLy42^U}dmN)<|aQ%V!{
M0Kt^jsZRg^0NxBLga7~l

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..2aac84fd55bcf024fd5998d534e006eb38ef3ea7
GIT binary patch
literal 8000
zcmV-GAHU!qiwFqi7LjKH|8sO@a9=PkaBFM;?LBR59LIGe#m^=6X<4#mH(pbYNyg-n
zcO*rM5@k`8Ow%h-j!0Q_6y>^iJICAO-tJ{TNS<94NfEaavS^cLgM74U(*Obb+jWZ=
zLEyLrf)*{%BIuvAKUy^T)%HhG7%f^9NZ*@}{W#u{r-+W7mOSuo_RYLEZ{B<J=FQB`
z?q}M+-1@l|`sY89v#m{s*lX2tifmPgt(S@EIw`}lDuLg<?2^i&S+IgH1w)?=?goQ?
zy91y=Y4HzOIfp_H=8LMEEfdF4*N79`2?l97+cHSt_o$AeJFaTFkamGMq|23HVeUsi
zx$+%(=%(Mr5d;5~7xky*MOiJ&Rm-#Gn~)T^!vN2#rVRWQEW0dklAP?S&blMRB^FP~
zw}1&+W>GE@qk=$rQ+JEc2Oh9f?Dg9{Xl776$gBQFckz%vG;dfAk=-JZHC;8VH92Q(
z$+@bWH}w3vZm!X+{;mF0;1R!l%~Fk^c+Bs`MApicNZt*W{KHm-nAr-pKI_>Aiok!N
zpc`anbab?6mC2|(TGmZ{)5?uh49~%T0T?a1Wh1<x)s8S$QkWB{{?Qn9S9c96V*lto
z7L_kEu2(=7SX9ls`Uatev%0B~t-ycO^vacLDsR~&RrU;5S8ZFZ27X`GCV9(rT-$?2
zd@1ldXu-hmGBsA@OQ7X9g89~5@EgI6U{GmQTHgqYPx&31Tdfd8Y15~J`}bJx0l)Qp
zu(Y)F(of-!ekm<(kh?$M-NOF@xbzQzfP&&lU(&3ykRX~%k7ngPYI@K*1oR5P%^O$I
z+&XC61EtmP%Bc?F(8JIK7Ybcf%{33Lh}P`)5i@UTpuZ)x3^;9mcg4$rz!o)Lpxdc}
z9hbxFu7V9E;S$=>VOkntrrldnZJ@qLMG1z@+XbIC<p6v-=y$W9C+YW7@ZScX)A0Fd
z_`$1TWrTjl;6DSO3HTpYrj%*;PJD>pjXbWP8h|UMN0CrFK)97w(alq>HR*c);=6v1
z_#6m-4<xR`5AhE<MOCvlZ&;Q~8AJJX=v;Ds$A)Tn$gRhCr;0(|+d+t(%3<ZG@(BDN
zRZhVFWBeb}A8qhILrZ^@ejiol;Qw;$AHOdvs*+a-q)YHkIP@;hi+zaSgDkzmIUI~k
zIA+Yp*iWl+OgSko#E<e!Eo6tEv+QS9BrXDevczKI=a}+xJ%(7OUJKY#%-Gdu8e-Ob
z5&vLL&94(vTLMWyKS0&ADP5qRZfah=N}qC2IRuJ20s=e+$~vw*3JN+2ih5i*r91)F
z{Y%P6z`AASX{8^m`592ov&sON%Ahi&d`x+c+ILDB1^Z5eWsie>Pbz0X{b=E5L80fA
z^B|TB%JU$TPEhG3WuEww>bka`^ISHx+zIYb-nuwnhZKphYYDEq`E>+29zq0WoLg4r
zsTfw2`!vt(Pf!@7ZDKitF}!D?g$(*!0y@H>sRVS4L(>W9IEPLopl3LAYFpdR#W)kN
z3pLy_IbDk3UQu4)(1ir_B@R^*&}$sJl&I_D$_-^%Sy65(uPC?RcU5_nm-_^kQ$DHO
zp<21Ae2T-f3H&Qi{-#pkd26v6pHS8z$56_ONpme`$URC03Uza$Hb-<q_1{!HE`W`i
zijc`FFaH{6?pH`8nqQ4oepC4@hyEHb`8iJg4G#bH8u&d9zh49YJcs)=@ZaR{->QLs
zk;8wx1|D$umuui(;qb49aByNS*>af^!-(gFJ3;Y9^tY03R=j!{eKcdfNTB0ukcByp
zf5@u1x@9gC1IFeU{5T(KFY|Gk4T{N{Dh}~9OSpf`7{ne=FndtRDH`$oBdpM(1~l}7
z4l!ARMz8MB&_>M}gx5U?<1PA*3~~G;dC$fnc^N{)JdFYp*gcl5ujytj-19pP6(T|&
zYoNCe$8<WF<4`;RZi=}m@fi(2Ntad6Va*Z0({Zat+)X|F2Z-h@&(t_kWVYqnd#%?l
zwvy;d`C;7)<(r9ghNm--F4FWN=)1qHEYb8)=((Rz9;N9=l{<*jbh?FoIK1>g=_Kx_
z7B1d!Xt1dCMxf4%anSO@#aF(`0LPW%tUoMVyd^Wx5#<Pi7QX*SQ&%Y<s|dtcRk7@a
zi)Vv4`4ftzysUhTr>l@YA<_#MuO4GX8R#(hy-z7^G`$Y#HsvaZe=CF^j=>AcH=x^g
zad?z2K$!^6(lFb=?%XIG*~70rj3JJ|WQ|Un=FwGbxlXJDx5AKK6Cyx&h5%bP-RrhS
zY$p%}#iy-79No@?#t!=FqMw>5h4{Tzq2Lhr1~{>>NA6{Z!4O__7!2rz5kGkP@I#~z
zz_8uRN3S_0>JcbD-W6W*{uQ*?&bxt#3QmSb-U4!CMlPBzZ1<DYHD2YVUJBtYG59SQ
zVukIt#L~CmDuTCgo9=b;wq+P~mD0$kHPA{+s1?TY)iAxvWe|5QKksy;6}*A~Kg{7;
ziD5+LB9*i>P)SQdC5_>=qHmN2mE;W_#?!r337Vf!38q(-M?OR<X{EEc53Nc(7Rg<*
zw!|s~i(R6K<q?>4J)t~7ia+L2!S-Qt#}O_zk_ph~7B0S26d>YvfG4-zAd<>Kh(rux
zuDL}n!VYw`!0SQQrLR+OyNYWv3CODm>Eof}EnNeAFeU#WhrCGiwW1q<p?Rv`<-knO
zT%%^VXA5ZcR?7UoP2JS2O=l#Xnal*fJe{7Nx|A6|JD<LEd1`7pJ(Wq%&re*QJ~KHs
zF_xa52>hPYVm&JG+Zo~Z7gw%b4dN@P*$v&%bGo6s)vT>T6dm}j>A*jb(=|wq1^#i>
z*3}VQ8Py0#NvDx^;I{<+5t@^Lo{SL&NoK7=wyfIg4%U2S?s8E2gVvT7zvDEG6Z}>w
z@H<u(SFT=xbcSbkpUzul#`Hm%MW;nVTt@~PR~MHQ;LRq`6e-Gs9J;*ccO%`=yo2=>
zZBhUpQ>js0N2FgVn$tSYYdS6RN}u+KT^;fCD3t>3@ZuA^c)R@4GBDA(Jbz>H<rQGV
zvGaj{I8i|f5hgjpagn8sr#8kZMVfzx=Rdc-eqf*2rfP5y|6yoT9IbyhwrS~UUi#Fw
z(j!F(cpO&hEHCvaEmbaC=BTG<Jy$nafwuG*FVQBUvX)m~MP;?6(+s@Y2Vc@@)h4R^
zx?H3^V@6Joja9bJ$(jyUuU2Q|oMGkH&&hYC(|pA`Dyn*}B|oi~E0&Fm=2$2NEQaL@
z(C0=R{WXGAk!T`6N{-N_)|_esPDRyl9d$;|RJP=CL<$+3j7a1>*R@P24sOcOO{ie3
zF?HVc<HRX9VB9cugA18cM%i#nUiA-vY&L);n?PT<xhT)d1<%Z5PZ`wojiJ{iS=I}f
zq}_gSD8;;$d_f)?lAjqI;Nlt>O0mS?0CYCjwyFa|=KvQGNx9-y2ZzqdqoZ=(AW&Jw
zCL6ltIgqemK3X1zI%L(X0>U#X#)+weEi7t)Cf$`Nhj~z-OSr$O!zPwN9wBeweC5J`
z{G1%x$r>>Ucn)&QvNdOLVt9H8atF?jLNURmTB#6I8$=3FCvsA!agk9>jSb5c%h8dA
z8Ts`xn9x8T#82#jVOe$KS5{$07G4fGaj!s&A$h3^%^N_o92mMQ-wi8`R5~zpd+bh%
z`j8aUBD9V6VOupdeQ+c_Hk=uwO#v&y2FEfdhclBri?*>L<xLztl7!EW4W~oGJ7Y6>
zB+)@O(7hsbu0T-2%S_*K^$p2Fd7uTzh>IO-=7vetjMs59;6Sb{>O4SK)KCYS-hyJ)
z00Gw9qJ9UYfWHBX0BVdlew{TI%;ic&cPxzzrc+=JuI&-9^1DM4wwMitK*6w7ZF_?$
z1sV+q8r0HZP`Gsc@~hZI#Sn1d^5QGFCj?_%`=+f{Fi-gV13(E}9&I_og<Ee&vqHd`
zups)zZdc9m{j0VS7(w}FVkeUj5Xu`Wjwi1~4hB-dD)gQa)iCf1%mr*XN`-z0AO+iE
z3C&j5)^Mo_FF=F(YjoWyF&yWJmII~_Q;?zNh+!~D7vSg`^4FFY14;+Aw!qIteNN3e
zmf^W1a-CBdba2$3XXLZg`MFjF;MBnyq~LOwSnKtLi%d^IZs4!haoG*zFj6EcOtKMx
z+oO<8JzPQGN{9i*GgCwU+tOmt>z<j(kpfr&$X_J7W@gj^I1SJQ|7~A>d2R`WRKpPx
z660dqpMDcaT)Mfka(xMDQQHmtrD*)vL5Hlwhz?2|0aLBY+nWw8f}RwJokG83o?o-A
zC%-Kt#^0eNL8KM`WrW-!bSK10ix8xS4X2{!DOnM1wStMC-agK4vp~cVoo1F>0gm0E
zU)>xLM#9I$tq@>ZX3U1t8EOKpFC#V4%}&kb!DiM>-Es7??5JzH>*|i|5G@t>{c#iL
zicXmQh>5gB*X(<3gmyu+&~wdTQ(1_9ynAtsh%!@67vIF<!X_7Yr%twMbqykGrwo>b
z3R<%uG8}Oz&%!`Y?@krD7$0l8yovP3)6JKs^tnCZ<VxLgZ9<}ra<MthZ?fn#*=}9w
z8%VzkG!NZV3+{7=N$<m8(R`n7vS|1Q^`xRt^x}5yEs~o<H|+URzG%aA>jp<IJ=TVX
zkU%Y+0+{jz&u=$z3pXrv2twWpcsXsEPU2X(x|6Rihpyd=MsRePhrn>D@t6Z|91;6|
zbzH9(;>C)U1`r@cWt%Mq(;p8!BJ>iF-94`%Pqax7-eAw~GgZ7VgY8*_4r&MPc!|;|
zT-@^r?yT4ru;}WM*q)-81&G%iI%XxgJC?IRTdH~8z6qVU05Q47O5F3j5W9<Fi=3u#
ziQk<s>V~#Rw<Jn`2zICXNBG|H<?Gk@_Oa3y-BQuS^qR7C9BQBhOOIk*{vmG8H;C$3
zCciI)F@Df9sZl}%ABsL?zl*aU@~oCl#PeC-VCrf|ry6*rC*uX`8GMW9ck`P;cy}vZ
z1~`b|O%y@t1uWDawc}0Ej{A)(@1oJZo^Pi9RgboOXX~SGqjZ|`)wpXh6MG-}l(1{!
zm1dZ1ZiXsl!Wt~Sg%@w5cVi+u;2(@Fb71U<0c9k|xAx^I<oIS>j^D+L??rMPDy<_b
zM%<C`-flv8BBpLacBh+?-M=&?yHm{(@)`))KNJcr+}P&Q)HRzFuIi2}tRsxl`#2Lz
zd<aDxD!d85gJQ))dba(>m{7eYmC7CV1332H_>$i)W6Ms_oiLTX8X7vvmZlmzde-}~
z;=S>yQW9yKLWISY0?Kiyg_+V<5(cTp=u0u(C&|;h$r$w-dJ~LZsM6BcLaPGOq)w-L
z2QS{GbCvWMTbZLvP}C>UzD1|5bh=9Ce7y6GiOx3_>wMqc_gunry%ifSslc8S0-KBp
z?0fqHYbdZ0A+R$sfxWXYFd?wFW8IjpUC?z0-aXSoc+)ZAy}K{GI>MuiB&zVoOl+B0
zJjOOL_Qh5sHYTuHA+X7Wz|#8yYap--LSSbS0?X_RtbxFu7Xq6~2yA>`U=0LzQ3z~0
zA+U*kfi)0Vrx4iLguo{E1=c`db6jBKV+nzs*%w#?fn5>;OD6<2wJ)#+0-F~C%OnIg
zy)Q5(uy+#SYg&Y_6C&zQda|>-icRcHHX*g1W~3%IB7bgz+eUYia#35Xn{P)tO>J!O
zMl@IndjJB-g|IKg5j&l_2x75p+Y(ooZf>ZCUb}zM;acm&6jTbT5%T>3?U`(|(?Pmx
zOs7_K_m)po&xpY{F_vU@pK<k6R9juM)k?7$t9|1ax39mO=<AcBub&fwn@9+*{>-b6
zNPKD8Rb4&5f@{7_NbvU_bS3;}>vU|f(o+*MV6HztKupHlygzvg#dJ0)rVe2wiNGVX
zKY3~-uqh$1MBtH`_&|GA&1m|tqnWyk49><|KKa46e5d`NhZDz+J4@SSgacAsM&~`^
z_>VzTZ8b}n@n&Q9D`4Hf-%5G#rRWa%VBz_H;}0Qe&*iMG<oPVWCm)^yz3=xW?tj1U
z(G7T{`_=ZAmId*EiPG2LYN_-)?Wm&P#cz*pJ~vhH&1MU})oj7Hn=Sa=W($6=*@Evh
zTkzdx3;tQN1%J?N!9Q=d;9tfI3Y(z?6Bd(1`s>}0=ozLY&JUZw$<*Nd?JhX%@L&?>
zA9lnMhYTssJ%RJ%9dX3z-6YOG?}$@#+BV7Jzwd}za|Ab5>pidZ-j1j>M|G36{zpTT
zsB@$@iTYnV)+!G5CUO2}7aVr@H;MDp9dX2o;3Q64+fF#*<Zu$Fy=~`uzE!85&bHln
z+g?xijyU4PauP?{rJi~?eY@1NS_kLQuI<@|<N1&9`B^v;(Zg3bX5l$H$IYtr?sk4l
z=Fu1^OKjV+#oezEi&c;eKC>z)ex&#`J`AaNI{qjmn8g)h4?rpoH+=lj@sIz-^YrmY
zUHI4|7lMM-ks~cF=jkyJU~sm7tY4->qd{i-`!7iHdAwhnWu@_2K2L)-2FXx}yzN#)
zEH+BZQ4B*j*BQcizl=w?X8UVTaP`X&bk6oi$G`ezykB&-UmX7GAFUxZQA=uv17FQj
zXLrCb`jh>#W9Mr)EjdgEqpajMq%$>0@k3%eV@}mzHgSAxXM)oW3DzDg+nLVUMs&g>
zXFKDk$KnKPPoverMI~`mW^wZVQMB+V8Wk2wXuF(wXWT!VwtqHl|7_Za;%r*Nvz;Hs
zkrWd-ORMreJ6<!|%I8sTEDr`a{KXS&wWZ?4CRs5mukGbVp<pe>8HRzz+87~9AY$z)
zwAp^9hkm(cRcN-K-{siPD}!;3o{^&&@52$femR_<LmOmVEsL0=Rz2JQ{vUyZc!D1k
zobAW9H&)m5c6sdanYkK?)UYByYq;mcNxEm(+;i-jo}R-?TIRe{QBBV9*+vXAD+<+J
zTMw8%5aDS)7_y*oEKYF`Pu$fsoMz7U&$BER?=f1yEZ(bKPdf3YXVTjl-R+!SJ&G|w
zB*<qPkY@+_>Y~%eaPcyXhTS|fUPo~aj{-7f*7Tc)IucmNA}XA-i8@khyzs}HTWQ?V
z$#_e5IoK99E`F-8DZiOH(|k4LA*z{bz8dEts+n%Sn$3r(=IjHsrpcp{KRY+$V-LX1
zL$rE4{QwsEFzv}aKs^uI9+3b~0|Cng={$YH%>^m&C4L$d*AM*vj-A|;2evH*V64R!
zLZqJC>8U#+fE$2}&|MD;CWd8t+++3y+$$fJ;}ORIk0Yg?o@h@4$iVgO%-9{74<sy9
zY(PBZ1W+dK$l+!+<%XAn%tQr^9n(PI_f~ArBwCg}h-dX)H!h_l*Gsl9m(*D;2^UL5
zkl0ih*jx}8s<D{>kQWv|`=Q@MhJGKOz;E}k@8N^KC{+1{3HbIp+>1qP4;t#lP(_#*
z3-Y#i;}2Vaw?lZhV{pepyE^UL>#(j)6?+}h)v02y!?`+D>~$ztr-~0KjN8u1-iL5?
z>e%z}txg?#9=avR?!faqZL+D_8b0_?venIO>=L{_ba%YTC%EtTxDdqT`9&w+mZHz&
z%&!j)WX1-DFyvdqJuDqx!QwyJ;9V`WOyKv-t<lGF$KTe{0q)XV5BwjjB-il34e$qP
zgP+G2j10>l0JfaR9XTH`bVr15DS5?`^EE4i_-2+(9flIhb{zu4@+NrfqFhuRnhRQx
zv2j4H2(($VWv6I)h9=vjY;7bK93d7d)9VeLY^H)@w|{~*7vi6kVU+QoEl-22F6$03
z>Yy080Q~OwYiUaFwd1oRmg%cMuJPB-(31Fmm`?UekihQ}kLg)G*xJ(as^vk%ML;|<
ztyp&LDqZ=sWC(14Sv7ncjf^iQ(PU^2GAxD!t3WUDoh&G03134a>-@zlz^jvzu?~E7
z4ZSl6Btl<YlfaTe-v)#bOid1fFVQmO1dAfwlzc@6S>v81z5qvZyj)p#Dep|qy34ec
zBS^#*xZJQParWXIC}Wd6(P;xsvPn6?W9C$beJ^@t)vOg^@o4F%?c$y`@!%PLj=rm-
zrDgR1$n&(k7;C$9gP>xWz?eZomxDf0D>fO!6$>CJJA4O_O4|ec7j$m8IpVky1fH~S
z0?#mu;H`#IY94QVGIXcN-Y$f1@S%doMmw88R>*}ap<)y0mlBga$>R-ExbCguW(!PR
z$}nM4*FlCJSO)T78HUAL!Fa>Rk|dQZNvBWC3sCEf#O|i+1c^|w(68dLI>6PxJBm+)
zl&|MNIUB6SAeTr`q%uS4;*&^cr2RS2{v2q34zxc9+MfgM&w=*mK>Kr`4{8oXLz21G
z6`PP5X~}ZQjC>m^zJqfK8mZFzQXrzM6x9*aE92P;a1r8tK(gxccn(H769XxTdGJ;q
z_O?fY5iY;E4?-+jlvS3J=ZsCY>R>2L=NS;0*`xpzA;B>mM4=I&8U|*JMG=Hkx1Wr}
zd0RsoDS44ar~Nz!2jVqso@qe{ig#P!J1en7$SBPVDe*Z14XP#S|L;;hHI)|_O(Clp
z39S_8h-gXji@M`lb`_04Xa(coaEgR8FN}i0)3d204r{{h5_3!#;#G{(9XiJWn@DPz
zS_LsTbO`<EZAS7n%MtUpvV@bsO4Tg_C2<=HpBtcwVvx3J)7yCRmPT-P3&mlEt^(^|
zlt_l!qRl3cQrsTHi4N@`w&krAsr|B<Ggj<WNFF<f7MiaW%?Z$krF0sET!0Gbv<?Cg
z2dm^p&StP1s&1e<p}1*TmBCF_ce7<@MlKkZinkx7$HvBnq_PFt$l^qNaEs>+%T=Bd
z*=&_!d;T!Dq8a(Bj>0u=4`4J4#xYRVHE=JOqH-FzgEyJcOhHu38Sf8WYt!6onqFn^
zD6$u33~I!1)oetST}P5<vH`=G0het5Om$K3K)o!e9cIJsZ7>zE$W%yU&Y&dl4NkyR
zEP36&MqCvE;&-{I1Iar`{q~(9X`NJamTGHRbb!tvTv9$`8EBS+xE{bW3NYo4wTA}E
zJnK4oVO4g3H*lA{2cvt=VYjk2V7Sn5ps>t5bhay7DopO7*Ym}*iRl3$p4mtmVbf=C
z3*Lvg_m@5<FI#28Un*RN{tET1sQ@wbb+!P3K%0Ep)7g}N`&tdPi|~F%2|DT)SO&NQ
z%hu?kAM{gE4f;i3#Q?I@XB1UjXu+Fi=`saD3Us~AA>b9z!oXwELbzfBVg%U~$flGp
zs>lSiiSmd&8irR_H`w|It`K0QFkHn6fX|}ED=HKKJG0CI7spnd1mU_Cef1x6u8dLJ
zagoe?*c4pb02ScM(()?{)V<Kti#n@7cL#7?szPV7GMj86fot(alh`QD2mfj`N^cek
z#Biv7lt>x4rP5mzBLKju;2|?-<t&i2W?L2P=u8qQ%MmV1!bI%|`*9)OSg#IC4yb~@
zl9~#~Bt#QzF$rYZn-J*=CXU&3^{GArTCxPe30nmXRe|veiV+6(yjZ=7#{GKhK&yXG
zUjib}<)9m&d2!R0q!+=p;{bt+c_MQI6dxlSu4I8=V$YQn+0lxmM1v@ZSyC~TV4K2)
zEF7Te;+2Psj?k5$4KT*#NrF;1Zc40b7>Cc(73T}m3jR<7#Dz0zh{6no;OG)sUN!Qb
z!NxJ*1IIfj-e*AiHuxqw2E!!KhTeXe$22U!f-x`eN-n<sn+}ZOebHRxMKxNI<qKzZ
zmS$5BymHQ^^9=I>I3ZHwfq-kMS1bvpl0hRH0?buKSzt(|=7O{Z$+|lKHq$vc+B~j1
z*3mh#%<>4!L*9$1DT}W-@}QQgAthaFusZ1yK*s({HOCHVq8&If&Aot2?zK`}#9qVH
zhH(^Kkw<rpd{psvbjaw>jAb(Y!xHPZHGK$82zrEO5eK~u-YL$o=#p!=bP7h|3jeE0
zTGdp`FgXx?JKSw>?W*t&w8w=`7#&K#I*tng*b5EITIcf#2}Y#~7#WH76dEd?4<kqi
zhUd8U2wws3y`(y1+X8qWT>$t0GwKBY7j|8CkPz-tMe+ghI63<6Fqgl%%*=w(+SROK
zl<BM6`E?MU?(_R{9ykY$!<82PR@y9mhKCbeo9y%tTAmvNDbM@ud5Fw|h37MmcTKl_
z`>#CzAf60H3(2BR`5CSrygeumkk}bzx|?=4@DIc22>~#K>qb_!*BHwZyQUAf_j}hW
z?#P5Sg0B|~{3B&`3z%?KL$LBv%X_UY44&Yc!4DzaB%5JdS}yv%*aPtx1@uj5z}sKL
zzpwj;aOz~4S;wp3k(uD0e^>*5YgOqHHpa|}7&9Z#)h0&9rbh6YmX4Pz9q(WVtouw>
zTNXDs0!2a<Fo0*n^-2E;ab5469H{6Kksm)S2Wok|4xFEugWSU+SDbIV_D}8pF#(L-
zx27yaHN<b?y7u|D(g|pT(poy%5kLL;7y?^LkHZhxLg`coeGJ|cSVpMdKM__$cT2-P
zziWD@--km_mY?Pe7A~IrV*dMRcL!P7D5Ll5U&a^cd43PLMHtWVbbJw%xD5C!IOx0J
zc%5i#wFjn(S@xQ|g}ER7<jSA?@kjpAKPm(RBgD4MBgssw!t%b8ntakf70Uzh;@UVp
zdWi2Vj!Jd2Iun?^tRb<=zLWZ!ir6`h3cua^?NeUP@4*n4b%7->m-=7A_e{qOga80x
C24CC&

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..a426bd07904db15228cb5d4b18d279e5d180a6eb
GIT binary patch
literal 4879
zcmV+q6Y%UGiwFqi7LjKH|8sO@a9=SlaBFM;?L0|v8%c7!Nr9xU)kyM;chv%Cmf(mu
zc}t{56v>e|Es-)09p04@4WK|Y1axEj;GtQ0y|yPtvTLuHc7*K+N9@C4hy8bi_v45C
z^~=9~*f0P5upj>0nN`(|ZX8r|mt*50AkgT}%FN2EDzmx^_`SYA`Es+1z32xyLq9Ny
zv3FEWki!}=)GE;|Gf~m?nnQ=SEmB+8in{ZaGx0;`l@p_VCV(8doA&BClR-xFCB@2C
ziD@c(#B`oJF@|U872?n{ifO8*rDztMTfiL7t2O7|@-P2=?eC?DT{<WT9s2mbdR<zV
zl&W;3+lI6Yr!MUz!1IbGfqX^Xs7ePUCs~U5!j#|=(G$`W5Q5>XOC?gNAy7I{t<oos
z4dRrBX`c;hI;Bz3^<NB?&eDmMif$6gDiNumDiwWC%ISwv?nug4)cgxo+ha}9!)U^>
z3GLg{m5NikNQdzxtLMrjZ#f(EoL(bZwuahgZKHxFpr01i3b~m~CQEviB&}ps)zkw$
z7q3-p6F)8>S+c4X?}Bw3=b|LA#Zl-5A9qW&D$K<6!V02Ft6b_Gumz$jc}v|V3^}W6
z1#;-n_cXg&J4)nrgCweU#Znc+P>virk~K(P*G$W>L5Z&%+RqRjI;a&m%2$ru^@X$2
zopZi)ww##UEq8z6l&;bKf^}3QNYbOuJ8xd|)?V7Z=xl6keD*)^Vh6d)a&m80hPuSd
zflJy8269SQXr!Q5T?=BkoGIven}r@|hk#)hu)FmD1fl&*cMQD2oMI9IJqIdSfb>Ao
z_H2wI)-xR;T3#=J|2C8=(Dcxunw<lKtrrBska+}gT=lL8iw0oACFs$w=>@_=d$^_;
zz<r&W5&~Q31@9iY7vBBs820^}?05}6J@6ib_eb6vuO{WVoPbXX-Wm8z$}{pT90PAc
zhfuy9bOA^tpOGWz8DQGY*bj-b?uPSLANSb0#pT-VgSgk;o6xgnNh#<DTe@yBq0nyq
zs<0UC-&ZO&N@{btnL|+SJ}};Zd``X~e*-=j<je56Dn5ArQJc?AhW!ycUXYjJv+Dcc
zaaC5}y9h@`K#m05_a^iVit|`V7psS@FMu!duWtFGe8s(RzvK@aA;)|z@UJEJ<POjW
zG1dcL7v)EdvXDlAC6|3y-*2Trs{^5<IVJysXoU^XA0i%|(IXFnLxxx|4a+0)sC*V&
za~{lg5!`c0eh*x71zhtR^84}!5Xaw?zXfqD$=Bs5#O;URj&IB35F9aiLjG93AtE^m
zF`SmCSrqrnbMm~r0PedbFM=U%%b$Q12Ec)L<rP9Bie(vU&bD|0dG0)Cat(SiMF_u+
z!1a*0u0t+)5SP*~DaX0yqpG)F7p>6%hZxg`x5RvO-v;eC^hN+07tllini9}-06Hb0
z(*fwLfX+4PcFQN33%lJQEw|HMAMKj_ses-KKtB^uIRJeqpc{d{eoNkxx8)sqSAHx%
zf$yjCGtvKVBcA*n`Mb<3yYlY{c-HgO8sP5AIRVf6diy}$gO-x4%4OF2f(P&!x#AJ;
z%9<e5eX#Edm=O?ijR|OrwtXSkfiGNM`a0j0UkT{%3d|n}?mrap-*15bp@4tU0RK|J
zUpK)2NWlNN0sbQa|8WESrvm=72KZM3?s#yRvMh3F@hJ;ynoplQrTaLuMpUh4w`v$+
z4J#!A1HJ%F*kaPNdd*UGZJSge`}*LQL^53!nU|-%U{5uZ*ah94=<5yVdBb`E=+vSL
zl~$m6ST$L}qU0(>^cRCHit|K@nDl(!HZaj{L*bWaxjjIAMmN+wRcnOXbfBU@ohK>?
zoX@>X%~CWb*l~G+&;iprs`#fC{)UE}Zfgb6fJ9vn|GWF)oo-UUlK-XUh5Yltc}ARP
z;JgH`>O)txODz7`_&gN2tkhgV;~!e4&ph2=8Yn?j&Dt~y#4v?ozT`Ws7mFsbw!o2|
zcE6tB!)6{T_ex8S?F`d~WtYnm%35&lDYCTrP`Ilfx`i4K1lRqgf5OrOENh?-AWVq1
zXO&nLGRpPaWOjRsHDmzt3?U<8AUshEKm~Ha8izyf6Lqg-IUu#I&_O7k!EsEdQ!Dse
zRrYk`017Htr^VBmnT$iF`SkqU-OSX&O8V~V+}wP6E|XqanO>crok>ln((}`>)*5r`
zXNUH2#(np99zJmVE57W$YN|Q4qFP5;LxDNLq1|bR_U6<AoTVIk#fJ;V$?C;yRWV+e
z4!xupsuIUF9@NTtLuE6HL%SS$p0xy^efQUvSDo@eH!L{&$JofA-4XE6&ic-SH8{_R
z#-XviUgbiMN_@<Mw=Giw9uL+xWRT4uph~3P1}$tkLWhuVGH;rwutAC-Ng|QNbq_lv
zSWEYm=xNMtD}T=>#!=i>>t_k@jzFIl=zY>>+aSci_R7}!qa6^#H1ZBT7wDjj1T%u*
zlG{|6O6*TDj;wuFwBKm%AH*kyrc?w-d~vclK^A?R*bIAKV87pl9WTLTV{)tof%P84
zs#bL^X{%Y=QY##xC!G=)JrQ))_RceOR!=(3!B0ov5E+9Nk|KQ$i*6hlH>GqcRXe-|
z%V~&u<>;oAtLXU`x1?8*F|op~qbjzgOV`zEO*e3nk4Om+(Zegkn2Ve0OM+aHX`sD+
z8fQzxoMHe?O)22I@1~Tg9ZFM3<Ow!YX90z#rE7o=b4Eqgpo3I<?qcZsnKNl%BtTMv
z3rTQB+lWYhN_)XJ`yi4*VBqYoOG{GG*77(WVg+@7;`4|ksYN_x!#*~V;B%C8TS`qx
zAI8RoxyC0F{3JFGL&`Grqw$GbK#PQ=TC<K~6St&fQp#5dbXGISzN*_Moam5gtCP@&
zq-aM#@L_^WqNxxI>jfZ-yoxXh^WZ>>i20_4n23WiLfiOa?e@5IL-OKek7xvD4)R1d
z3TAA2a()6@#}|`;=1Qs5YOvUdAqVIaC265tJt*c<lTuALRTSZ-^m!FRXgm+*C&u`s
zq*(qdy?9e{XE~6>S_6&Y^zIR;H;!RBKJiL=<#k$j>G;HVQqL1?4oPq?f^KXK8;Vv?
zWASuqGLvGeKop_EzQ&o!%#3JaI=1C}fY~F+cp){J_81>e&Fv8+2iw5#s*?)`f)gHb
zfBW4xA-T>2FF=Fe?AWsiOy*|1_J@H03TLs&1AN5-bv*PfGu{m_;G?ZR?tm4@H_iya
zjdh7X=gLC3tkqN#%DGrN0pVa7Hi0OAH4#CLc`5`B6<sMbE6g~s%<uAGkq$}W?&j(<
z9HMRt=v`fZjQ6l0*YzD3N)6lG`Mwu8!IZ}|N4Pt!_hDEeVAey#xpBx+a^iklPaHC+
zxcNE2Ed-466$SIj<N5>xXCNw!p14w};1z@m#Blx0n|A<GG<1GaFqFMLETZuOJgB~8
z6?Gu#=13t2LZ4%xijpIh3Wp2=jaoqYdeU>OJ~~2Lr9Rx0oT*n}5ni7)=Q23Au~5D#
zEwFjd(rW-`b8CeZEioH<6TYTvq&_*Or=d~U>S>D^yNwy*C89vZivYZv3(ahTE2@V9
zD?mEa3aCR*+Rb@=+mDtvK>LQi5XDV+J^%P~@Xg)bot;g<9N<Ce(2aUxIL0{~Bd&sX
z;t+#JQnOMZD-03Gy=6)JSHv~H7e;$BZepDXGDKefZ_dan!8k#xv|C#=h+)>0JY(yY
zQQZ)`W6cSyDYTGg%Arfby_k=|bE+11y)W|Ip$Fj3^-^l4?%eK2$ju!;?=R;eBK9=Z
zG}WqPDtoGBsitI-Lc*a@f9whu2I4ucA+H&zdtb)cu)@%DJ9;}cv&C#^bS!l;bD_}>
z5Z#To^|x~;O<vuD!q}`rWTAuhbSU}aCX?9>(6P63MJc9Iq4723Ka~!T&-jaYg5}nz
za%dTZ)bH54cge-mo&k;DOe}XJ|3UCPOv44|jhG4910i0fkEj|ibGO(O>5hqEynp<J
z<d$)MwP_?@GGHCGC5X$1$Tmn?ehD!7r#9`=@Sd`Vbrx#b8q8sZZMHh&cr8~|gC@J(
z;3^@=VSxd8upo*Fe1hfPnrtzr+8&(`t^6edB6ZxbXt1uaVXlKw0=9c?x0Q*hbaI7l
zI-)7~fQa8_@g}Haq!SqBpTNayo8S!~LkAHpwQjZ_Fv=nnVkS#QLFq2$bnup<Z5q2U
zl8aDm7dXai+xDmj8MWIIik8r!d`Ybo*4f>gGOk4wdS2YxUfp~s?rzIH^=CUF*3C>L
z3wSYxWgGjVXGJ(~5yjLs@qmg8K5A<$D4{g>Oz(+5D8%<<R?7SSc0M+^yZUfub?ox@
z{RpiBzRRXV;;EDvZsos(1DN14nxOnwh}2it<Fc#AoA#X#vNB%Hhw1;R%``tg`q;El
z{sSgg`=P}{>}wcPZd?<){O8;@f9JWB8!J}6$|?a!6NH<rmyY^wc3@G6m15nFSKryu
zwc};K9k1cVb<7gnjuYj7W|mF+1`|)mY%rDomeF|wXjs#&Ef}%HXd3-r)9`85fu(<z
z?%*(uhZ#J~;(;~H<6!{~(<wZp-Iz)FHJs}#4Y|$h8qV_qLr9+XOFrLmlFzx4gS9h`
znAE0yG6S^JWS)0r=F-miExgdV7LKtxk`;$w)+skYW_{vgo!MdHF9eO<@0T{+>C(>m
z19_^`rCspbHq*J297s3iP8B!(GIyMX(Td*mHtouM%O4ff9VhctFe(=P7M||hsNm8r
z`K6udbZKw<rJe0`X+QBxJJ;#b-tkL2-|5m0_@!OwbZM9UGgzwArM>HyHr?scuGCke
z*WRr}-IeH7-|Y`(rW@|+bH=xHzy8i(Z4DN84K@q7CcR1yEknr?E?(Amvk$xK<yzB7
zecy<4@00%$KPBkP<@Ceg{v&!O@9p|}Lq`HzNZ#1&?iKRA(c9H^&)p_oo`kE7a-tXg
zmBgdX?%aezyc`zsYFNZ?hDE#<7V&ym#Idl5AB9C64~zJ5Sj5S&h)F-98#4?s5HUfb
zskafaZDK*1*$_0D2AcV|(D2Q1L7J7fX#UzoycZVnv#_?@3d?aPEXT)T5ub%c{BBsp
zY*@rxSj0kD#9~;)Qdq=tSj0+LL@gwuO}`-?qIk1+&b^C`+apaYtK|81ykRye$Hp_M
z@d<}s@U>-$Vd#c<N;m?ks)qdVXIoC`Tczvxd!W*o|JOh<+EeuX2B<XI?iWCJ{_N>{
zci1n02Jx3a)}iG*&7bS)T4XzmK+>gXDk`zOQXxyx=<SHKh<gBdQyQ<OMV8YzB*P$5
z(?c#qOENr0QBk!QoM0*{;jX}?Xyc~9s02BADO%rA7?toz<x<q$PZ&)$u$pdUb<7^Z
zu-y4>NzQ*JDw#&ULDI6xZII-cO+>Q|ME-q;$5YNVP=?r%cs#@TwhSBhDjv^gp&cJ@
z7vu5t>6D*A;|9kTwCE&3$|FvCx4Y5X-N=lE7HYP$7>w_>JHFfQ_-?ynM{IWt%xsHE
z%%pgd(+fvp%uY3ATWJyP#@isl;WuuIY{c@T&2ThkuV%a?AU0CWataLA_5}44By8N+
zxD@4nh)NBM=B23E-WU~~L0)6qE?L9Zu<J4^d5c@nLBg8}b>Uc4FGauht(8zth^?nf
z(RjMaMf1%zIOV;UK`XHSAv@F*O-QiNP68eZuFnKv5-G*GNMpR*OZnNH;x5ewC$h$+
z=n8M*l^W*-@rpauJRSW?X41{ly=_CM$Hu2}fM=!+Prlu=B{>sY7?M-89g3N$7Jg~F
zQIz|tA+~?5U8AnUYeJl9SE2SBM}=aw_Bze@b$ZJNM^6p^7Sr}qS7tUmZ*o%JbK!ZL
zC*?gKp7+5?c`uwm?vUGHPbk7v>I5R3RKKb83F7I*l4nl9Je1__35<P6^S2|5?5Bpe
zBMyy--MhHzrZL}6TWP#$0gV-k*o;YsYlf|nLYDo0@af+S;&MH>CU0Jnw^)&T3v#y*
z2~}!@D>OW%!xR|;+5LJ~Ui=E>#s710@hg@V^)%VpWyPr!6~3C_qy>eK)Ga1_r9|^!
z>xeSqxRRlT$fr^zwBUOx6+#QXr&1lX;Cm{SK?}bBmsLS?_oq}5v|xNX)j$izr&9?8
zlbd7H0fQVUM#1-lHex6TSpnUk5sMfw-tcj>^O~NqAlK*F#Tz;x9u)wRvWJfnRQwOP
zSNuN}EEC{oYKC4^EBLs()J4zY|HSRM|L$hi|1Ite(e|_ae;C;RuKi;9KmMBkO&LCH
zI3)Ro#($;fgN?fQuee`4{D*&|@B7-UtWpF8*ux3@3y?b25bx8YJh*b74s$hlXL6UT
z7Q>Re>>Q{MrHZR&jE-)tJz3sb&E9*kd3X5%nD9S;`KLen_}Y)@g{|c$*<hpG?V)e2
zEpKmbfO&>jH#gQ?gx55(_Go>3b9F7dzKZCPo#kyAkyoENuWdV*_+Q(pzzmX4002+Z
Bp=|&F

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json
new file mode 100644
index 00000000..a17872af
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json
@@ -0,0 +1,44 @@
+{
+    "n_steps": 1,
+    "cum_reward": 1.0,
+    "cum_raw_reward": 0,
+    "err_msg": null,
+    "stack_trace": null,
+    "stats.cum_steps": 2,
+    "stats.cum_n_token_goal": 10,
+    "stats.max_n_token_goal": 10,
+    "stats.cum_n_token_url": 23,
+    "stats.max_n_token_url": 23,
+    "stats.cum_n_token_focused_element_bid": 1,
+    "stats.max_n_token_focused_element_bid": 1,
+    "stats.cum_n_token_last_action": 0,
+    "stats.max_n_token_last_action": 0,
+    "stats.cum_n_token_last_action_error": 0,
+    "stats.max_n_token_last_action_error": 0,
+    "stats.cum_n_token_dom_txt": 1250,
+    "stats.max_n_token_dom_txt": 1250,
+    "stats.cum_n_token_axtree_txt": 71,
+    "stats.max_n_token_axtree_txt": 71,
+    "stats.cum_n_token_pruned_html": 651,
+    "stats.max_n_token_pruned_html": 651,
+    "stats.cum_n_retry_llm": 1,
+    "stats.max_n_retry_llm": 1,
+    "stats.cum_n_retry": 0.0,
+    "stats.max_n_retry": 0.0,
+    "stats.cum_busted_retry": 0,
+    "stats.max_busted_retry": 0,
+    "stats.cum_input_tokens": 1589,
+    "stats.max_input_tokens": 1589,
+    "stats.cum_output_tokens": 63,
+    "stats.max_output_tokens": 63,
+    "stats.cum_cost": 0.00027614999999999996,
+    "stats.max_cost": 0.00027614999999999996,
+    "stats.cum_n_token_agent_messages": 1641,
+    "stats.max_n_token_agent_messages": 1641,
+    "stats.cum_step_elapsed": 5.891982078552246,
+    "stats.max_step_elapsed": 5.891982078552246,
+    "stats.cum_agent_elapsed": 3.4504799842834473,
+    "stats.max_agent_elapsed": 3.4504799842834473,
+    "terminated": true,
+    "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/error_report_trial_1_of_3.md b/tests/data/error_analysis/error_report_trial_1_of_3.md
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/data/error_analysis/result_df_trial_1_of_3.csv b/tests/data/error_analysis/result_df_trial_1_of_3.csv
new file mode 100644
index 00000000..4095252c
--- /dev/null
+++ b/tests/data/error_analysis/result_df_trial_1_of_3.csv
@@ -0,0 +1,5 @@
+env.task_name,agent.agent_name,env.benchmark,index,exp_dir,agent.chat_model.model_name,agent.chat_model.max_total_tokens,agent.chat_model.max_input_tokens,agent.chat_model.max_new_tokens,agent.chat_model.temperature,agent.chat_model.vision_support,agent.chat_model.deployment_name,agent.flags.obs.use_html,agent.flags.obs.use_ax_tree,agent.flags.obs.use_tabs,agent.flags.obs.use_focused_element,agent.flags.obs.use_error_logs,agent.flags.obs.use_history,agent.flags.obs.use_past_error_logs,agent.flags.obs.use_action_history,agent.flags.obs.use_think_history,agent.flags.obs.use_diff,agent.flags.obs.html_type,agent.flags.obs.use_screenshot,agent.flags.obs.use_som,agent.flags.obs.extract_visible_tag,agent.flags.obs.extract_clickable_tag,agent.flags.obs.extract_coords,agent.flags.obs.filter_visible_elements_only,agent.flags.obs.openai_vision_detail,agent.flags.obs.filter_with_bid_only,agent.flags.obs.filter_som_only,agent.flags.action.action_set.subsets,agent.flags.action.action_set.multiaction,agent.flags.action.action_set.strict,agent.flags.action.action_set.retry_with_force,agent.flags.action.action_set.demo_mode,agent.flags.action.long_description,agent.flags.action.individual_examples,agent.flags.action.multi_actions,agent.flags.action.is_strict,agent.flags.use_plan,agent.flags.use_criticise,agent.flags.use_thinking,agent.flags.use_memory,agent.flags.use_concrete_example,agent.flags.use_abstract_example,agent.flags.use_hints,agent.flags.enable_chat,agent.flags.max_prompt_tokens,agent.flags.be_cautious,agent.flags.extra_instructions,agent.flags.add_missparsed_messages,agent.flags.max_trunc_itr,agent.flags.flag_group,agent.max_retry,env.task_seed,env.max_steps,env.headless,env.record_video,env.wait_for_user_message,env.viewport,env.slow_mo,env.storage_state,env.task_kwargs,exp_name,enable_debug,err_msg,stack_trace,order,logging_level,logging_level_stdout,exp_id,depends_on,save_screenshot,save_som,n_steps,cum_reward,cum_raw_reward,stats.cum_steps,stats.cum_n_token_goal,st
ats.max_n_token_goal,stats.cum_n_token_url,stats.max_n_token_url,stats.cum_n_token_focused_element_bid,stats.max_n_token_focused_element_bid,stats.cum_n_token_last_action,stats.max_n_token_last_action,stats.cum_n_token_last_action_error,stats.max_n_token_last_action_error,stats.cum_n_token_dom_txt,stats.max_n_token_dom_txt,stats.cum_n_token_axtree_txt,stats.max_n_token_axtree_txt,stats.cum_n_token_pruned_html,stats.max_n_token_pruned_html,stats.cum_n_retry_llm,stats.max_n_retry_llm,stats.cum_n_retry,stats.max_n_retry,stats.cum_busted_retry,stats.max_busted_retry,stats.cum_input_tokens,stats.max_input_tokens,stats.cum_output_tokens,stats.max_output_tokens,stats.cum_cost,stats.max_cost,stats.cum_n_token_agent_messages,stats.max_n_token_agent_messages,stats.cum_step_elapsed,stats.max_step_elapsed,stats.cum_agent_elapsed,stats.max_agent_elapsed,terminated,truncated,err_key
+miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,1,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,7,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,True,,,2,10,30,dd9e91e0-75ef-4bb4-9db1-f91f06848dcb,(),True,False,2,1.0,0,3,12,6,48,24,2,1,4,4,0,0,1902,952,400,201,650,326,2,1,0.0,0.0,0,0,2789,1404,128,65,0.00049515,0.00024839999999999997,2902,1459,6.860883951187134,5.8696064949035645,3.769465684890747,2.946484327316284,True,False,
+miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,2,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,20,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,True,,,3,10,30,187f0f01-a240-419c-a65e-0058a14f639d,(),True,False,3,1.0,0,4,27,9,72,24,3,1,8,4,0,0,2892,966,667,223,1014,340,3,1,0.0,0.0,0,0,4339,1464,225,84,0.00078585,0.0002646,4512,1517,3.0203144550323486,1.3659462928771973,3.8209800720214844,1.8219048976898193,True,False,
+miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,0,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,28,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,True,,,0,10,30,b403cfca-4647-48fb-98f2-57e94306a38a,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1250,1250,71,71,651,651,1,1,0.0,0.0,0,0,1589,1589,63,63,0.00027614999999999996,0.00027614999999999996,1641,1641,5.891982078552246,5.891982078552246,3.4504799842834473,3.4504799842834473,True,False,
+miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,3,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,14,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,True,,,1,10,30,4c89cb70-0bf8-42c2-be39-a9c1a39ffe8d,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1257,1257,75,75,658,658,1,1,0.0,0.0,0,0,1594,1594,64,64,0.00027749999999999997,0.00027749999999999997,1653,1653,5.879024505615234,5.879024505615234,3.029170036315918,3.029170036315918,True,False,
diff --git a/tests/data/error_analysis/study.pkl.gz b/tests/data/error_analysis/study.pkl.gz
new file mode 100644
index 0000000000000000000000000000000000000000..8611c7d37be0f3527cb0356f3d2250c33f769e5f
GIT binary patch
literal 3761
zcmV;i4o>kOiwFqO7LjKH|8sP8WO*)dYit1R8eMZ7NAb><EK9QdDSsw$WReg&Vb3~A
zwk#7!Y$wjgS!YL19#dPhyK}d5+S^(7L#In(NPrkAH4ju}DT)+OgcqQA3QwepDvIxq
z@Wij+2k;KvvvYg9t2@b#^S}dVYkIe*r>Cc<yQgP%-WmDc&(D?QPn;r6>V^(6Ds<hW
zK5N5UPzl1w>c(R9wuB$V=iZI)#An5XB-V-F4B|Ip@pRf&J_HpgfrY8d*Zf;74!)SY
zU%}Ta$fC&CC2bmu6InW~>4n+?*V@cwu{dDbB-Gp7qD~@&IGT#!IPHq8CLU1kzw&0}
z(`BSAbd|JDx|+;!KiDSgddNfKK)FiYAbzFv?{X=@GuMm4JbpjoUE1kkmrKuy$&j`|
zjU<HL;}@U(<OlfsQ0!l00pqS7M4rd}Fy0UcEb2MD+wPD1SVkCaZgy@#n_bkZm*U6}
zqYa1PXy@)j;kxJZwg-AYtn_(HzF(J|Qg!U+2Jk&o<ZE?7Cg7CtU~V%Odv6*+q4C}*
zpt>El9T?9fKBNA8O5s&jg#?&yiX-^k;3fb|rw+w21Id0wQ{U&l?r>nMc-Slmx!;9X
z+0G+Dn5Ehjhh<M@i2CdkACeu~%w0{9n~4+_Yc$|_90k?Gu7~sP`H>64NZAuX0uuyx
z1Dl6HJ0>x_jl$FQ(1+f1oI%6EK{cVfqdm0gFmsjkEB2=qock8&cJw-N0_gMM26IB{
zr+kzm0-d`K$~w(G>Jp|aOR;E37zj5)qL9a8TD9)5(AEuR^;#WJco0^hz+Cnu+KKJQ
z#pD(FSeUp8rM`fIeLP|Kyc1Br*=^@yfI(f;ZWDjCVh|j_v6ooWzCqW(qh#ybwBW^K
zL1X|&81p3BaEBW@ah&))e2<)vDXBKao*)E!L`5I;Y3O&8Q8l=4qUnrVw9TcH$6%f?
zcQB%Z4tJZN6PP9DDK1BtYq2$EMc}V=owPlNqIpcozLbFkab1kFKvx~sft9og&mqWS
z3ZH>x$V>)9+!Xs#D?*#!5EJ;`26?18O-pj_f~kbG*C$X(JlJ|N@qPSwV$YyVG;Ww(
zoOg^-!0QkOkHo#w`0?Bi#iRigFy)L#*yymdCI~*jJ4#1%oLH9LW<lT)AAPD#g8&>C
z8%)U<2G(urEcD~rw3xuyp*MXVd2;;YNFmi8*G3|Pi1BoyWBa-oznZ?K6FZUws6)M3
zg@Dl2LmGhboU*|vbO)_M?;BMTob8vA_IAx)w(Fbr%}x8(hW)C&0#fXQNl7LOf&{4J
zxQdJcIJ^&51MC7yEjc9Wu1SWv1C9;r&ROx$5R-K{_hK>jg1eTt%<+%}t18S+t5H;>
z=V}%s5LaSFf_=udhZ7|QA@zc|HY)bolvrS0D9t{fn&8;rrWWNO_|XnwA=<7ElkNAW
z0%Ok_qa7LS>);s<?|@0e;}pgwpaAY9q=1-|9j|tz%3y-*HKnz#{igj^ZNL3?ZPdQA
zVSjzo{>G;LPThXDZhy0Ge=A2SPUJ^r+SFV%_&V4G@EE%%9`**1PH#g>#a@G$V5*i}
z{l`?7F%eU83~dNm76}Phibq_F1QpN}tu%b8_wnbU>biXH8<yT~`ms2jB?ot*z6&w4
z(k3n_%7=Ke@2bPi)#c<Jnu}wq1(<{p^qWA2phvC;8URN#uniI)b)&Y|O}Hn4$oC0k
zH~Vy)OXXTfI4TKYWWZaJd=S@4@5R^3M*N+4B|cj#!@oOsWBbQq)RHR+G}>iU<IRnv
zezz#U9oOsiSN;P(S=35lY;0b;Q!16=f4K~xtbV2BsZ=ia9^|8>%={#s#3|xZQuH1Z
zRvGG&LX;Jwaw!HGVD2JVDj<@;I0G;e6u!2*Hc~62?6qB3+8^3KvVVeOwtt!*@z2CA
zb}2i~X9kV)>c4j9MZw>$0_>5D>;Oj<+@s)_f_oJl-vLhS04EjPSAbIr?k~Ut3LY%L
zLkb>N@Q8xb3Lf179#imm0iICsWC5O1@F4{sR`3x8Pb>JSf-?%9DbV%c7YcNbDfqa8
zPkhesi}wkiRPd~VYUfgVssNu>@JkA63RV<6ui&hL)dHMTa9+WM0=!Uw7ZqGo@REW{
z3SL(5%LVw1f?rYas|EP1g3l><r2wxEf!7p#Ucna>ysqGj3cjS^%LVvK0oDrehJwoq
zVpOp|h5$0UQwSO9g^-(>5Muu_A3&bP0D_DA$=*^%0u$C6NW?UV%Z6MC(#6|=e9a6s
z;x-}E5=T-r=c`)7ft&<#lZOUiH5a#k82t~YwNY1t1aK8|;gpeSa~@F5!ZkXxxc0Fc
zB>`|bHK?jhEf%#i0%R*w0g8Re>_!|-vCKCes(}h%aodu%7*QBP7Azhagw_K3wFuI<
zoan{8K%5>{r8$s`&@H?(1Oab=sAq=OL(b|34HWxifrdK{@#F?5tDa(=HW!P>wyEta
z@yu|l0gCR?aDQkXcx(_F<vE<MaA1$H1qoSbw1qPwkwF2?lYLB@RV%c|=oYO3u0NVW
z9qfo;%*uEQT?96<aHfxJL3?yd$C!r~DFKI)pBR8u_CK)7Ccc$*jZ9U~H42JLHWSlZ
zk=Xr11JHd7D>UjbtBIc6?|^$<4Nnnrxqphp4*H|!ksJ2(Q^Yc<%hWzB*q(PalQ<N!
zMLa#~RX}>LfgfZzNL1p`8NDZ1$g4BbN*oJm8@f8a6{$6SW)&Grd4o*o$Y61JC-ZoQ
zF|#+wLLF>Z`oT8aOJ|_?9dv`Ns%RUuqA^zv|I`4i23AbNtn+6pVH?$vwsU%$TFrGb
z(P1!w7Dljv?MJXI6>H<JjJV+tA=1_l53J>h$7NKPt@f$sbce_fJ3hZhkWRP^?`oYn
zmPdkp1|5#-C{TxVJ?kxfB+mA!0cdrcx0MmYompx(4dz;4VU#59n8&x7<#C2uJCws0
zB$VfTMG5;zl(5g2`|ZqHb$lQmTMVh0&(z4XPr1HI^Uc<P%V_d6j=3Zk1)34;*3$;5
z_VGv~Nfwe}oEThzf~8Jx^AHynE5iojg&VR)x94i9ieN!uu7<}28Qy)Q=5a6bGHQ~0
zR#=p~JPNa6A;H7As1H7!4FE1#V@*_sJdN#JYBJPZ7Ld9ww=CeL(KYI3WR={(v3Pcl
zJ)65!!>lTU1hcU)$U9jL_mR12^l8A}q*99nNHS9RQoY5rPAziDbQ~Rk&6!m|iEr8(
zb(=Ccr4tgcrmGRK@8kqJTY(358gZJ`@JU8ry$YX&cAJLiLFv!}ZbEJ#7c`QY$1GYS
zxj2!l3V7}?DhN5I3dEObXlhpAr0__bj4tGGfknu(HY>5oq6jXMC1)l~0Q~W6cah6X
zNGR00n+gx+r{Q_C$#LB1D>K8Y`U%OgEQw66IyR+Ffo;DcmrP+*;DD=SSMokGw>nF-
z$6Pn(-fJv~K&!+8mB@r@(q<O8-b^yKy&OzRs<3sVTq=pNp4_b(ye7C&iyBcgZq*U9
z+(Qp4#6+(jWOx(NH}h1>JMt(N+Y{H*KI#K1hBmA1+;1&US9q=GJHI(Vmio?OY%fb~
zKyeQYZsv=j{BC?U(fXWg+fK@jdz)!did(mol}Wq9m!x<#U);K%;z63-`hW31TD<ys
zOr98#p|7<ftqv9i=EkktSUa>>$v>@ZOFXn~fx*iC;8upwZO`ELFu_;uQSkdJ?n{ZE
zv9|}AsFep$_R6-iy0btOj{~<J{lUv`Hm}VL`zSD)IZR9r5Wge8@SVti60#^HtLi)S
zguHX{GUDP7;+8J0y!R@<D#?#fTV>e;FQ3XI#_adNUaqB^kT5s#L$4TvBqvD`r#jSV
zMr}R$vJJ$L5i$q!!LItGDh~F)Me2d)urQXISgl8Qq2Crk@lFmJ4J2!L$~LirFe5)S
z6{EOU4C5O3GGvBcT)Tr06duZwEyY5N;B|&rjN^EFU)lO!;ogYFC<#N@$=`{^9zD52
z0*}WbD+P&lD*|b8(7Hnm>ICryj50FBZi_*FHRh2JHaiBeyu|~yehXefzO$ly9y+#V
z_F6`b^sbKRkkrG51hy1c=4R(EXtPyqZceXOwb^-XZb?^nQ%LbrkH%e1&47mAm6Mb-
z#Nq7;qDF(QtL1YfikKC*V)^{~>n}gw`-~`d<F54mL%$gP=XaNX_5Me1)iytVa`|_6
zOMxM#VY}^fD>9kE9Go=fHh6M(rO84)*@NwS&1MDx*>Bu_{u<<u{mV%BoztEocEfhW
zUU?@<M;X2^6Um;D)WxCn3%b5W{Xk}+duA)uxyo!Ttr7P!1C$brT^#iF)D7lF>oaE`
ztIphBzBTh&L}%)JZRWzv?Cj;a>g5X;W?p~o+6>+ZIv0!mUN?m8SEi$R>}U~kY7f$j
zj@&?HYO9s`B^0ASPyM>J*~)wcGVDWX9V*2pfgOvpON|TjRnuIWpKVkxE-qNs;>C;e
z7cVVdT3DQ~RxixY&CSxe`HRr}k+iw)aFaNAcY`(1wB&sSu{blce0e7D&GS9N<RLhp
zyReiX@qR^vHG>L%h&`O~t*x3plBvxIzGzkFtp$rTW{v7adSPLH$y%x|ROuySi5T;g
zEX*!6s*NSE>_;;U28(`PiryzXj~;v;hf>5*cMV03#1!o0VZ+f(E=9z(2{E0+=s{?~
zMn0~cZhZuHR2N5JTM8U>i+U8IfNL^&kL4g)YsxDKI-b(V7SGtPZi*?o?&YsT*r6DE
zVcpAJd9X+A39+|#b0W4oVhr1%Yu5n}k23nz#XfZ{CD{UP0GaRW?VyP_6Cengt2*9l
zLCR5i7637+^=pv3HqrV7!D+Dvxq(~M#Y8d$ywVh(69)pamb+KA0X`&Ac(E!nTEB(%
zeQ8_2hgbXCtv{erT7Q(~pP<}m{aHT$MV5cXKFfItT7Q#Oe@Aw$Pi2#TKsl_7yZ6$?
bt_SL3^Ip2xeXm{6$f*1mc0^=Rvnc=o6lFOi

literal 0
HcmV?d00001

diff --git a/tests/data/error_analysis/summary_df_trial_1_of_3.csv b/tests/data/error_analysis/summary_df_trial_1_of_3.csv
new file mode 100644
index 00000000..545cfc29
--- /dev/null
+++ b/tests/data/error_analysis/summary_df_trial_1_of_3.csv
@@ -0,0 +1,2 @@
+agent.agent_name,env.benchmark,avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost
+GenericAgent-gpt-4o-mini,miniwob,1.0,0.0,1.75,4/4,0,0.0018

From 000893d0845efcb1ce03f742e05671183715bc5c Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Wed, 22 Jan 2025 12:46:12 -0500
Subject: [PATCH 07/25] quick parsing to run from cli

---
 .../analyze/error_analysis/pipeline.py        | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
index 53021297..f3e19923 100644
--- a/src/agentlab/analyze/error_analysis/pipeline.py
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -72,3 +72,23 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
             raise FileExistsError(f"{analysis_path} already exists")
         with analysis_path.open("w") as f:
             json.dump(error_analysis, f)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("exp_dir", type=str)
+
+    args = parser.parse_args()
+    exp_dir = Path(args.exp_dir)
+
+    pipeline = ErrorAnalysisPipeline(
+        exp_dir=exp_dir,
+        filter=None,
+        episode_summarizer=EpisodeSummarizer(),
+        step_summarizer=ChangeSummarizer(),
+        analyzer=Analyzer("prompt"),
+    )
+
+    pipeline.run_analysis()

From 4727a9e18755890e1f7cb726d3ec8642739d214d Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Wed, 22 Jan 2025 12:49:33 -0500
Subject: [PATCH 08/25] even more parsing and making imports absolute

---
 src/agentlab/analyze/error_analysis/pipeline.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
index f3e19923..4a961b76 100644
--- a/src/agentlab/analyze/error_analysis/pipeline.py
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -6,10 +6,9 @@
 
 from bgym import ExpResult
 
+from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer, EpisodeSummarizer
 from agentlab.analyze.inspect_results import yield_all_exp_results
 
-from .summarizer import ChangeSummarizer, EpisodeSummarizer
-
 
 @dataclass
 class Analyzer:
@@ -78,7 +77,8 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
     import argparse
 
     parser = argparse.ArgumentParser()
-    parser.add_argument("exp_dir", type=str)
+    parser.add_argument("-e", "--exp_dir", type=str)
+    parser.add_argument("-f", "--filter", type=str, default=None)
 
     args = parser.parse_args()
     exp_dir = Path(args.exp_dir)

From 42f0362282bd7da087ff4b8b02d95a3b244a9f23 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Fri, 24 Jan 2025 14:40:10 -0500
Subject: [PATCH 09/25] .

---
 .../analyze/error_analysis/base_idea.py       | 287 ++++++++++++++++++
 .../analyze/error_analysis/pipeline.py        |  33 +-
 .../analyze/error_analysis/summarizer.py      | 250 +++------------
 .../error_analysis/summarizer_prompts.py      | 202 ++++++++++++
 .../analyze/error_analysis/test_summarizer.py |  22 ++
 5 files changed, 559 insertions(+), 235 deletions(-)
 create mode 100644 src/agentlab/analyze/error_analysis/base_idea.py
 create mode 100644 src/agentlab/analyze/error_analysis/summarizer_prompts.py
 create mode 100644 tests/analyze/error_analysis/test_summarizer.py

diff --git a/src/agentlab/analyze/error_analysis/base_idea.py b/src/agentlab/analyze/error_analysis/base_idea.py
new file mode 100644
index 00000000..5d4827d4
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/base_idea.py
@@ -0,0 +1,287 @@
+from dataclasses import dataclass
+
+from bgym import ExpResult, StepInfo
+
+CHANGE_SUMMARIZER_PROMPT = """
+You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, 
+you will receive the following pieces of information:
+
+1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
+2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
+3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
+4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
+5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
+
+YOUR TASK (each step):
+A) SUMMARIZE THE CHANGE
+   - Describe what visibly changed between the previous observation (or diff) and the current observation. 
+     For example, did a new panel open, did the form reset, did nothing happen, etc.?
+
+B) ASSESS THE ACTION
+   - Decide whether the agent's action seems helpful or correct given the user's main goal, 
+     or if it appears incorrect/unhelpful. 
+   - Briefly explain why.
+
+OUTPUT FORMAT (per step):
+Return your analysis as a JSON-like structure, for example:
+
+{{
+  "changeSummary": "A new search results panel appeared on the right side.",
+  "actionAssessment": "Correct",
+  "explanation": "Clicking 'Search' was appropriate to display the results."
+}}
+
+Or for an incorrect action:
+
+{{
+  "changeSummary": "The page reloaded but the date fields were reset to defaults.",
+  "actionAssessment": "Incorrect",
+  "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
+  "suggestion": "Correct the date format or check for error messages."
+}}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Goal: {goal}
+
+LLM Plan: {plan}
+
+Previous Observation: {past_observation}
+
+Current Observation: {current_observation}
+
+Past summaries: {past_summaries}
+
+Action: {action}
+"""
+
+ERROR_CLASSIFICATION_PROMPT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
+Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
+followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), 
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. AGENT ERRORS
+These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
+
+   - Navigation & Planning Errors
+     The agent cannot construct or execute a correct sequence of actions to reach its goal 
+     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+   - Interaction Execution Errors
+     The agent enters data in the wrong format, forgets to click "Submit" after typing, 
+     repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+   - Information Processing Errors
+     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
+     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+   - Observation & Action Errors
+     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+     or misaligns its actions (clicks the wrong element or stale link).
+
+2. LANGUAGE MODEL ERRORS
+These errors result from the model's inability to correctly interpret or reason about the task at a higher level, 
+independent of the low-level web interactions.
+
+   - Task Understanding Errors
+     The agent misreads or misunderstands the user's objective (goal interpretation), 
+     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+   - Reasoning Failures
+     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
+     or fails to prioritize important subtasks when handling complex goals.
+
+3. BENCHMARK & ENVIRONMENT ERRORS
+These errors are external to the agent's logic and the language model's reasoning, 
+arising from flaws in the system, network, or evaluation framework itself.
+
+   - System Errors
+     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
+
+   - Benchmark Design Errors
+     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), 
+     or inflexible evaluation systems that fail to account for valid alternative solutions.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+   
+2. Planning / Thought History
+   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
+
+3. Current Observation (HTML / AX Tree Snippet)
+   - The webpage structure or state that the agent sees at a given point in time.
+
+4. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+5. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
+   • Context: The agent correctly finds a cheaper product meeting the user's criteria, 
+     but the benchmark expects a more expensive product and marks the solution as wrong.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid 
+     and does not allow an alternative correct solution.
+
+2) EXAMPLE B (Agent Error - Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Agent Error - Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
+     without adaptation ("Action Repetition").
+
+3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
+   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" 
+     The query is ambiguous because "Upitts" is not a standard location. 
+     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), 
+     leading the agent astray due to unclear context.
+
+4) EXAMPLE D (Language Model Error - Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
+     that are older than 30 days and add a comment saying 'I can help fix this.'" 
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Language Model Error - Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
+     it focused on creating a new issue. This is a misinterpretation of the instructions, 
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The planning and thought history
+   - The action history
+   - The current HTML or AX Tree observation
+   - The user goal
+
+2. Decide if the failure is:
+   - An Agent Error (which subcategory/subcategories),
+   - A Language Model Error (which subcategory/subcategories),
+   - A Benchmark/Environment Error (which subcategory/subcategories),
+   - Or a combination thereof (multi-label if needed).
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
+
+Output Format Example:
+{{
+  "errorCategory": ["Agent Error - Navigation & Planning"],
+  "explanation": "The agent opened the wrong GitLab page and never recovered..."
+}}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Overall goal: {goal}
+
+LLM Plan and thought history: {plan}
+
+Current Observation: {current_observation}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+"""
+
+
+def _diff(past_obs, current_obs):
+    """TODO: Implement the diff function.
+
+    Should return a diff of current_obs compared to past_obs, unless there are too many changes.
+    """
+    raise ValueError("Not implemented yet.")
+
+
+@dataclass
+class ChangeSummarizer:
+
+    llm: callable  # language model, called with the formatted prompt string
+    obs_formatter: callable  # turns a raw observation into the text placed in the prompt
+    use_diff: bool = False  # if True, the current observation is replaced by _diff(past, current)
+
+    def summarize(
+        self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str]
+    ) -> str:
+        """Return the llm's output for a prompt describing the effect of `action`."""
+        past_obs_message = self.obs_formatter(past_obs)
+        current_obs_message = self.obs_formatter(current_obs)
+
+        goal = past_obs["goal"]  # Use goal object from agentlab
+        # Outsource everything to formatter
+        plan = past_obs["plan"]
+        if self.use_diff:
+            current_obs_message = _diff(past_obs_message, current_obs_message)
+
+        return self.llm(
+            self.make_prompt(
+                past_obs_message, action, current_obs_message, past_summaries, goal, plan
+            )
+        )
+
+    def make_prompt(
+        self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
+    ):
+        """Fill CHANGE_SUMMARIZER_PROMPT with the details of a single step."""
+        return CHANGE_SUMMARIZER_PROMPT.format(
+            goal=goal,
+            plan=plan,
+            past_observation=past_obs_message,
+            current_observation=current_obs_message,
+            past_summaries=past_summaries,
+            action=action,
+        )
+
+
+@dataclass
+class EpisodeAnalysis:
+    analysis: str  # complete analysis of the episode
+    summary: str  # short summary of the analysis
+    categories: dict[str, float]  # score per category, e.g. type of error or difficulty level
+
+
+@dataclass
+class EpisodeSummarizer:
+
+    change_summarizer: ChangeSummarizer = None  # optional per-step summarizer; not used yet
+
+    def summarize(self, exp_results: list[ExpResult], change_summaries: list[str]) -> EpisodeAnalysis:
+        """TODO: run the change summarizer on every step or reuse pre-computed summaries (stub)."""
+        pass
+
+
+@dataclass
+class EpisodeErrorSummarizer(EpisodeSummarizer):
+
+    change_summarizer: ChangeSummarizer = None  # re-declares the inherited field, same default
+
+    def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan):
+        """Fill ERROR_CLASSIFICATION_PROMPT with the episode-level details."""
+        return ERROR_CLASSIFICATION_PROMPT.format(
+            goal=goal,
+            plan=plan,
+            current_observation=current_observation,
+            historical_summaries=historical_summaries,
+            action_history=action_history,
+        )
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
index 4a961b76..305d00b4 100644
--- a/src/agentlab/analyze/error_analysis/pipeline.py
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -23,7 +23,6 @@ def __call__(self, *args, **kwds):
 class ErrorAnalysisPipeline:
     exp_dir: Path
     filter: str = None
-    step_summarizer: ChangeSummarizer = None
     episode_summarizer: EpisodeSummarizer = None
     analyzer: Analyzer = None
 
@@ -38,26 +37,10 @@ def run_analysis(self):
         filtered_results = self.filter_exp_results()
 
         for exp_result in filtered_results:
-            step_analysis = self.analyze_step(exp_result)
-            episode_analysis = self.analyze_episode(exp_result, step_analysis)
-            error_analysis = self.analyze_errors(exp_result, episode_analysis, step_analysis)
+            episode_summary = self.episode_summarizer(exp_result)
+            error_analysis = self.analyze_errors(exp_result, episode_summary)
             self.save_analysis(exp_result, error_analysis)
 
-    def analyze_step(self, exp_result: ExpResult) -> list[str]:
-        step_summaries = []  # type: list[str]
-        # this assumes that there is always an extra step at the end of the episode
-        # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
-        # TODO:(thibault) make some checks
-        for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
-            step_summaries.append(
-                self.step_summarizer.summarize(step, step.action, next_step, step_summaries)
-            )
-        return step_summaries
-
-    def analyze_episode(self, exp_result: ExpResult, step_analysis: list[str]) -> str:
-        episode_summary = self.episode_summarizer.summarize(exp_result, step_analysis)
-        return episode_summary
-
     def analyze_errors(
         self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
     ) -> str:
@@ -82,10 +65,20 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
 
     args = parser.parse_args()
     exp_dir = Path(args.exp_dir)
+    filter = args.filter
+
+    import openai
+
+    from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
+
+    llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model()
+
+    step_summarizer = ChangeSummarizer(llm, lambda x: x)
+    episode_summarizer = EpisodeSummarizer()
 
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
-        filter=None,
+        filter=filter,
         episode_summarizer=EpisodeSummarizer(),
         step_summarizer=ChangeSummarizer(),
         analyzer=Analyzer("prompt"),
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index b3760216..7c5f9b03 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -1,209 +1,12 @@
 from dataclasses import dataclass
 
-from bgym import StepInfo
+from bgym import ExpResult, StepInfo
 
-CHANGE_SUMMARIZER_PROMPT = """
-You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, 
-you will receive the following pieces of information:
-
-1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
-2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
-3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
-4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
-5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
-
-YOUR TASK (each step):
-A) SUMMARIZE THE CHANGE
-   - Describe what visibly changed between the previous observation (or diff) and the current observation. 
-     For example, did a new panel open, did the form reset, did nothing happen, etc.?
-
-B) ASSESS THE ACTION
-   - Decide whether the agent's action seems helpful or correct given the user's main goal, 
-     or if it appears incorrect/unhelpful. 
-   - Briefly explain why.
-
-OUTPUT FORMAT (per step):
-Return your analysis as a JSON-like structure, for example:
-
-{
-  "changeSummary": "A new search results panel appeared on the right side.",
-  "actionAssessment": "Correct",
-  "explanation": "Clicking 'Search' was appropriate to display the results."
-}
-
-Or for an incorrect action:
-
-{
-  "changeSummary": "The page reloaded but the date fields were reset to defaults.",
-  "actionAssessment": "Incorrect",
-  "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
-  "suggestion": "Correct the date format or check for error messages."
-}
-
-Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
-
-Goal: {goal}
-
-LLM Plan: {plan}
-
-Previous Observation: {past_observation}
-
-Current Observation: {current_observation}
-
-Past summaries: {past_summaries}
-
-Action: {action}
-"""
-
-ERROR_CLASSIFICATION_PROMPT = """
-You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
-Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
-followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), 
-a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
-
---------------------------------------------------------------------------------
-TAXONOMY DEFINITIONS
---------------------------------------------------------------------------------
-
-1. AGENT ERRORS
-These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
-
-   - Navigation & Planning Errors
-     The agent cannot construct or execute a correct sequence of actions to reach its goal 
-     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
-
-   - Interaction Execution Errors
-     The agent enters data in the wrong format, forgets to click "Submit" after typing, 
-     repeats the same failing action without adaptation, or loses track of the changing webpage state.
-
-   - Information Processing Errors
-     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
-     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
-
-   - Observation & Action Errors
-     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
-     or misaligns its actions (clicks the wrong element or stale link).
-
-2. LANGUAGE MODEL ERRORS
-These errors result from the model's inability to correctly interpret or reason about the task at a higher level, 
-independent of the low-level web interactions.
-
-   - Task Understanding Errors
-     The agent misreads or misunderstands the user's objective (goal interpretation), 
-     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
-
-   - Reasoning Failures
-     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
-     or fails to prioritize important subtasks when handling complex goals.
-
-3. BENCHMARK & ENVIRONMENT ERRORS
-These errors are external to the agent's logic and the language model's reasoning, 
-arising from flaws in the system, network, or evaluation framework itself.
-
-   - System Errors
-     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
-
-   - Benchmark Design Errors
-     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), 
-     or inflexible evaluation systems that fail to account for valid alternative solutions.
-
---------------------------------------------------------------------------------
-INPUT DESCRIPTION
---------------------------------------------------------------------------------
-
-You will receive the following for each scenario:
-1. User Goal
-   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
-   
-2. Planning / Thought History
-   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
-
-3. Current Observation (HTML / AX Tree Snippet)
-   - The webpage structure or state that the agent sees at a given point in time.
-
-4. Historical change summaries
-   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
-
-5. Action History
-   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
-     along with immediate outcomes or errors.
-
-Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
-
---------------------------------------------------------------------------------
-FEW-SHOT CLASSIFICATION EXAMPLES
---------------------------------------------------------------------------------
-
-1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
-   • Context: The agent correctly finds a cheaper product meeting the user's criteria, 
-     but the benchmark expects a more expensive product and marks the solution as wrong.
-   • Classification: ["Benchmark Design Error"]
-   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid 
-     and does not allow an alternative correct solution.
-
-2) EXAMPLE B (Agent Error - Interaction Execution)
-   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
-     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
-   • Classification: ["Agent Error - Interaction Execution"]
-   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
-     without adaptation ("Action Repetition").
-
-3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
-   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" 
-     The query is ambiguous because "Upitts" is not a standard location. 
-     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
-   • Classification: ["Benchmark Design Error"]
-   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), 
-     leading the agent astray due to unclear context.
-
-4) EXAMPLE D (Language Model Error - Task Understanding)
-   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
-     that are older than 30 days and add a comment saying 'I can help fix this.'" 
-     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
-     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
-   • Classification: ["Language Model Error - Task Understanding"]
-   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
-     it focused on creating a new issue. This is a misinterpretation of the instructions, 
-     not a mechanical error in clicking or input format.
-
---------------------------------------------------------------------------------
-CLASSIFICATION TASK
---------------------------------------------------------------------------------
-
-1. Read through:
-   - The planning and thought history
-   - The action history
-   - The current HTML or AX Tree observation
-   - The user goal
-
-2. Decide if the failure is:
-   - An Agent Error (which subcategory/subcategories),
-   - A Language Model Error (which subcategory/subcategories),
-   - A Benchmark/Environment Error (which subcategory/subcategories),
-   - Or a combination thereof (multi-label if needed).
-
-3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
-
-4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
-
-Output Format Example:
-{
-  "errorCategory": ["Agent Error - Navigation & Planning"],
-  "explanation": "The agent opened the wrong GitLab page and never recovered..."
-}
-
-Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
-
-Overall goal: {goal}
-
-LLM Plan and thought history: {plan}
-
-Current Observation: {current_observation}
-
-Historical change summaries: {historical_summaries}
-
-Action history: {action_history}
-"""
+from agentlab.analyze.error_analysis.summarizer_prompts import (
+    CHANGE_SUMMARIZER_PROMPT,
+    ERROR_CLASSIFICATION_PROMPT,
+)
+from agentlab.analyze.inspect_results import summarize
 
 
 def _diff(past_obs, current_obs):
@@ -218,25 +21,31 @@ def _diff(past_obs, current_obs):
 class ChangeSummarizer:
 
     llm: callable  # language model
-    obs_formatter: callable
+    obs_formatter: callable = lambda x: x.get("axtree_txt", "No AXTREE available")
     use_diff: bool = False
 
-    def summarize(
-        self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str]
-    ) -> str:
+    def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
         """Produces, a summary of the effect of an action."""
-        past_obs_message = self.obs_formatter(past_obs)
-        current_obs_message = self.obs_formatter(current_obs)
+        obs_message = self.obs_formatter(obs.obs)
+        next_obs_message = self.obs_formatter(next_obs.obs)
 
-        goal = past_obs["goal"]  # Use goal object from agentlab
+        action = obs.action
+
+        goal = obs.obs["goal"]  # Use goal object from agentlab
+        # TODO(thibault): switch to 'goal_object'
         # Outsource everything to formatter
-        plan = past_obs["plan"]
+
         if self.use_diff:
-            current_obs_message = _diff(past_obs_message, current_obs_message)
+            next_obs_message = _diff(obs_message, next_obs_message)
 
         return self.llm(
             self.make_prompt(
-                past_obs_message, action, current_obs_message, past_summaries, goal, plan
+                obs_message,
+                action,
+                next_obs_message,
+                past_summaries,
+                goal,
+                obs.obs.get("plan", "No plan available"),
             )
         )
 
@@ -266,9 +75,20 @@ class EpisodeSummarizer:
 
     change_summarizer: ChangeSummarizer = None
 
-    def summarize(episode: list[StepInfo]) -> EpisodeAnalysis:
+    def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
+
+    def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
-        pass
+        summaries = self.make_change_summaries(exp_results)
+
+    def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
+        summaries = []  # type: list[str]
+        # this assumes that there is always an extra step at the end of the episode
+        # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
+        # TODO:(thibault) make some checks or w/e
+        for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
+            summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
+        return summaries
 
 
 @dataclass
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
new file mode 100644
index 00000000..382c2805
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -0,0 +1,202 @@
+CHANGE_SUMMARIZER_PROMPT = """
+You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, 
+you will receive the following pieces of information:
+
+1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
+2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
+3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
+4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
+5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
+
+YOUR TASK (each step):
+A) SUMMARIZE THE CHANGE
+   - Describe what visibly changed between the previous observation (or diff) and the current observation. 
+     For example, did a new panel open, did the form reset, did nothing happen, etc.?
+
+B) ASSESS THE ACTION
+   - Decide whether the agent's action seems helpful or correct given the user's main goal, 
+     or if it appears incorrect/unhelpful. 
+   - Briefly explain why.
+
+OUTPUT FORMAT (per step):
+Return your analysis as a JSON-like structure, for example:
+
+{{
+  "changeSummary": "A new search results panel appeared on the right side.",
+  "actionAssessment": "Correct",
+  "explanation": "Clicking 'Search' was appropriate to display the results."
+}}
+
+Or for an incorrect action:
+
+{{
+  "changeSummary": "The page reloaded but the date fields were reset to defaults.",
+  "actionAssessment": "Incorrect",
+  "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
+  "suggestion": "Correct the date format or check for error messages."
+}}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Goal: {goal}
+
+LLM Plan: {plan}
+
+Previous Observation: {past_observation}
+
+Current Observation: {current_observation}
+
+Past summaries: {past_summaries}
+
+Action: {action}
+"""
+
+ERROR_CLASSIFICATION_PROMPT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
+Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
+followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), 
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. AGENT ERRORS
+These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
+
+   - Navigation & Planning Errors
+     The agent cannot construct or execute a correct sequence of actions to reach its goal 
+     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+   - Interaction Execution Errors
+     The agent enters data in the wrong format, forgets to click "Submit" after typing, 
+     repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+   - Information Processing Errors
+     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
+     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+   - Observation & Action Errors
+     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+     or misaligns its actions (clicks the wrong element or stale link).
+
+2. LANGUAGE MODEL ERRORS
+These errors result from the model's inability to correctly interpret or reason about the task at a higher level, 
+independent of the low-level web interactions.
+
+   - Task Understanding Errors
+     The agent misreads or misunderstands the user's objective (goal interpretation), 
+     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+   - Reasoning Failures
+     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
+     or fails to prioritize important subtasks when handling complex goals.
+
+3. BENCHMARK & ENVIRONMENT ERRORS
+These errors are external to the agent's logic and the language model's reasoning, 
+arising from flaws in the system, network, or evaluation framework itself.
+
+   - System Errors
+     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
+
+   - Benchmark Design Errors
+     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), 
+     or inflexible evaluation systems that fail to account for valid alternative solutions.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+   
+2. Planning / Thought History
+   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
+
+3. Current Observation (HTML / AX Tree Snippet)
+   - The webpage structure or state that the agent sees at a given point in time.
+
+4. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+5. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
+   • Context: The agent correctly finds a cheaper product meeting the user's criteria, 
+     but the benchmark expects a more expensive product and marks the solution as wrong.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid 
+     and does not allow an alternative correct solution.
+
+2) EXAMPLE B (Agent Error - Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Agent Error - Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
+     without adaptation ("Action Repetition").
+
+3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
+   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" 
+     The query is ambiguous because "Upitts" is not a standard location. 
+     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
+   • Classification: ["Benchmark Design Error"]
+   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), 
+     leading the agent astray due to unclear context.
+
+4) EXAMPLE D (Language Model Error - Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
+     that are older than 30 days and add a comment saying 'I can help fix this.'" 
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Language Model Error - Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
+     it focused on creating a new issue. This is a misinterpretation of the instructions, 
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The planning and thought history
+   - The action history
+   - The current HTML or AX Tree observation
+   - The user goal
+
+2. Decide if the failure is:
+   - An Agent Error (which subcategory/subcategories),
+   - A Language Model Error (which subcategory/subcategories),
+   - A Benchmark/Environment Error (which subcategory/subcategories),
+   - Or a combination thereof (multi-label if needed).
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
+
+Output Format Example:
+{{
+  "errorCategory": ["Agent Error - Navigation & Planning"],
+  "explanation": "The agent opened the wrong GitLab page and never recovered..."
+}}
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Overall goal: {goal}
+
+LLM Plan and thought history: {plan}
+
+Current Observation: {current_observation}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+"""
diff --git a/tests/analyze/error_analysis/test_summarizer.py b/tests/analyze/error_analysis/test_summarizer.py
new file mode 100644
index 00000000..e9fe0ecc
--- /dev/null
+++ b/tests/analyze/error_analysis/test_summarizer.py
@@ -0,0 +1,22 @@
+from pathlib import Path
+
+import pytest
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+
+@pytest.fixture(scope="module")
+def exp_results() -> list[ExpResult]:
+    # Recorded experiments live in tests/data/error_analysis, three levels above this file.
+    return list(yield_all_exp_results(Path(__file__).parent.parent.parent / "data/error_analysis"))
+
+
+def test_change_summarizer(exp_results: list[ExpResult]):
+    """With an echo LLM, summarize() on the first two steps should return a str."""
+    steps = exp_results[0].steps_info
+    first_step, second_step = steps[0], steps[1]
+    echo_summarizer = ChangeSummarizer(llm=lambda prompt: prompt)
+    result = echo_summarizer.summarize(first_step, second_step, [])
+    assert isinstance(result, str)

From 394999bac9ffde98432e62e776fbb489d5e1e432 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Fri, 24 Jan 2025 14:41:10 -0500
Subject: [PATCH 10/25] chat_models can take str as input

---
 src/agentlab/llm/chat_api.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py
index 7392e666..fb65c3dd 100644
--- a/src/agentlab/llm/chat_api.py
+++ b/src/agentlab/llm/chat_api.py
@@ -4,7 +4,7 @@
 import time
 from dataclasses import dataclass
 from functools import partial
-from typing import Optional
+from typing import Optional, Union
 
 import openai
 from huggingface_hub import InferenceClient
@@ -13,7 +13,7 @@
 import agentlab.llm.tracking as tracking
 from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs
 from agentlab.llm.huggingface_utils import HFBaseChatModel
-from agentlab.llm.llm_utils import AIMessage, Discussion
+from agentlab.llm.llm_utils import AIMessage, Discussion, HumanMessage
 
 
 def make_system_message(content: str) -> dict:
@@ -261,7 +261,13 @@ def __init__(
             **client_args,
         )
 
-    def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float = None) -> dict:
+    def __call__(
+        self, messages: Union[str, list[dict]], n_samples: int = 1, temperature: float = None
+    ) -> dict:
+
+        if isinstance(messages, str):
+            messages = [HumanMessage(messages)]
+
         # Initialize retry tracking attributes
         self.retries = 0
         self.success = False

From e0e786cfbc9530866a17ea662a41add384af4e51 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Fri, 24 Jan 2025 14:41:19 -0500
Subject: [PATCH 11/25] typing

---
 src/agentlab/llm/llm_configs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/agentlab/llm/llm_configs.py b/src/agentlab/llm/llm_configs.py
index ec608686..16920336 100644
--- a/src/agentlab/llm/llm_configs.py
+++ b/src/agentlab/llm/llm_configs.py
@@ -16,7 +16,7 @@
     "test",
 ]
 
-CHAT_MODEL_ARGS_DICT = {
+CHAT_MODEL_ARGS_DICT = {  # type: dict[str, Union[AzureModelArgs, OpenAIModelArgs, SelfHostedModelArgs, OpenRouterModelArgs]]
     "openai/gpt-4o-mini-2024-07-18": OpenAIModelArgs(
         model_name="gpt-4o-mini-2024-07-18",
         max_total_tokens=128_000,

From 46d2c8c10646dc034584c80a68829e1d46dd40ae Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Tue, 28 Jan 2025 11:09:27 -0500
Subject: [PATCH 12/25] keep this here bc it's going to pop back up

---
 .../summary_df.csv                                              | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv

diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv
new file mode 100644
index 00000000..85b34311
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv
@@ -0,0 +1,2 @@
+avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost
+1.0,0.0,1.0,1/1,0,0.0003

From 8a882ad58d23438319952b4f7802568473964bd8 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Tue, 28 Jan 2025 11:09:53 -0500
Subject: [PATCH 13/25] pipeline mvp

---
 .../analyze/error_analysis/pipeline.py        | 24 ++++------
 .../analyze/error_analysis/summarizer.py      | 44 ++++++++++++++++---
 .../error_analysis/summarizer_prompts.py      | 14 +-----
 3 files changed, 47 insertions(+), 35 deletions(-)

diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
index 305d00b4..62c313aa 100644
--- a/src/agentlab/analyze/error_analysis/pipeline.py
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -6,7 +6,11 @@
 
 from bgym import ExpResult
 
-from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer, EpisodeSummarizer
+from agentlab.analyze.error_analysis.summarizer import (
+    ChangeSummarizer,
+    EpisodeErrorSummarizer,
+    EpisodeSummarizer,
+)
 from agentlab.analyze.inspect_results import yield_all_exp_results
 
 
@@ -24,7 +28,6 @@ class ErrorAnalysisPipeline:
     exp_dir: Path
     filter: str = None
     episode_summarizer: EpisodeSummarizer = None
-    analyzer: Analyzer = None
 
     def filter_exp_results(self) -> Generator[ExpResult, None, None]:
         # TODO:(thibault) improve filtering
@@ -37,23 +40,16 @@ def run_analysis(self):
         filtered_results = self.filter_exp_results()
 
         for exp_result in filtered_results:
-            episode_summary = self.episode_summarizer(exp_result)
-            error_analysis = self.analyze_errors(exp_result, episode_summary)
+            error_analysis = self.episode_summarizer(exp_result)
             self.save_analysis(exp_result, error_analysis)
 
-    def analyze_errors(
-        self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
-    ) -> str:
-        error_analysis = self.analyzer(exp_result, episode_analysis, step_analysis)
-        return error_analysis
-
     def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
         """Save the analysis to json"""
         analysis_path = exp_result.exp_dir / "error_analysis.json"
         if not exists_ok and analysis_path.exists():
             raise FileExistsError(f"{analysis_path} already exists")
         with analysis_path.open("w") as f:
-            json.dump(error_analysis, f)
+            json.dump(error_analysis, f, indent=4)
 
 
 if __name__ == "__main__":
@@ -67,8 +63,6 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
     exp_dir = Path(args.exp_dir)
     filter = args.filter
 
-    import openai
-
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
     llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model()
@@ -79,9 +73,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
         filter=filter,
-        episode_summarizer=EpisodeSummarizer(),
-        step_summarizer=ChangeSummarizer(),
-        analyzer=Analyzer("prompt"),
+        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm), llm),
     )
 
     pipeline.run_analysis()
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index 7c5f9b03..5c1fc343 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -7,6 +7,7 @@
     ERROR_CLASSIFICATION_PROMPT,
 )
 from agentlab.analyze.inspect_results import summarize
+from agentlab.llm.llm_utils import json_parser
 
 
 def _diff(past_obs, current_obs):
@@ -21,7 +22,7 @@ def _diff(past_obs, current_obs):
 class ChangeSummarizer:
 
     llm: callable  # language model
-    obs_formatter: callable = lambda x: x.get("axtree_txt", "No AXTREE available")
+    obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available")
     use_diff: bool = False
 
     def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
@@ -74,12 +75,25 @@ class EpisodeAnalysis:
 class EpisodeSummarizer:
 
     change_summarizer: ChangeSummarizer = None
+    llm: callable = None
+    parser: callable = lambda x: json_parser(x)[0]
 
     def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
 
     def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
+        # A final reward of 1 is treated as success: return early, before any LLM call.
+        if exp_results.steps_info[-1].reward == 1:
+            return {"analysis": "Success", "summaries": {}}
+        # NOTE(review): both return paths yield a plain dict, not the annotated EpisodeAnalysis.
         summaries = self.make_change_summaries(exp_results)
+        prompt = self.make_prompt(exp_results, summaries)
+        raw_analysis = self.llm(prompt)["content"]
+        analysis = self.parser(raw_analysis)
+        return {
+            "analysis": analysis,
+            "summaries": {i: self.parser(a) for i, a in enumerate(summaries)},
+        }
 
     def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
         summaries = []  # type: list[str]
@@ -87,7 +101,9 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
         # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
         # TODO:(thibault) make some checks or w/e
         for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
-            summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
+            summaries.append(
+                self.change_summarizer.summarize(step, next_step, summaries)["content"]
+            )
         return summaries
 
 
@@ -96,12 +112,26 @@ class EpisodeErrorSummarizer(EpisodeSummarizer):
 
     change_summarizer: ChangeSummarizer = None
 
-    def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan):
+    def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
         """TODO: Implement the prompt."""
+        goal = exp_results.steps_info[0].obs["goal"]
+
+        txt_summaries = "\n".join(summaries)
+
+        thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]]
+        actions = [step.action for step in exp_results.steps_info[:-1]]
+        # Keep one error string per step: joining them into a single string here would
+        # make the zip() below iterate over single characters instead of per-step errors.
+        action_errors = [step.obs["last_action_error"] for step in exp_results.steps_info[1:]]
+
+        txt_actions = "\n".join(
+            [
+                f"Thoughts: {thought}\nAction: {action}\nAction Error: {action_error}"
+                for action, thought, action_error in zip(actions, thoughts, action_errors)
+            ]
+        )
         return ERROR_CLASSIFICATION_PROMPT.format(
             goal=goal,
-            plan=plan,
-            current_observation=current_observation,
-            historical_summaries=historical_summaries,
-            action_history=action_history,
+            historical_summaries=txt_summaries,
+            action_history=txt_actions,
         )
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
index 382c2805..a37be0a9 100644
--- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -110,17 +110,11 @@
 You will receive the following for each scenario:
 1. User Goal
    - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
-   
-2. Planning / Thought History
-   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
 
-3. Current Observation (HTML / AX Tree Snippet)
-   - The webpage structure or state that the agent sees at a given point in time.
-
-4. Historical change summaries
+2. Historical change summaries
    - A list of summaries of changes in the observation that the agent has seen during the course of actions.
 
-5. Action History
+3. Action History
    - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
      along with immediate outcomes or errors.
 
@@ -192,10 +186,6 @@
 
 Overall goal: {goal}
 
-LLM Plan and thought history: {plan}
-
-Current Observation: {current_observation}
-
 Historical change summaries: {historical_summaries}
 
 Action history: {action_history}

From 3fab5b4fc1d75a899f30a8b1d8263021b41dad8b Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Tue, 28 Jan 2025 11:10:21 -0500
Subject: [PATCH 14/25] added a specific tab and viz for it in xray

---
 src/agentlab/analyze/agent_xray.py | 46 ++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 9764898c..7466db87 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -1,4 +1,5 @@
 import base64
+import json
 import os
 import traceback
 from copy import deepcopy
@@ -30,6 +31,32 @@
 TASK_SEED_KEY = "env.task_seed"
 
 
+def dict_to_markdown(data, level=1):
+    """
+    Render a (possibly nested) dictionary as Markdown, one header per key.
+
+    Parameters:
+        data (dict): The dictionary to render.
+        level (int): Number of '#' characters used for this nesting depth (default is 1).
+
+    Returns:
+        str: Headers for every key; nested dicts recurse one level deeper, any
+            other value is emitted on its own line indented by four spaces.
+    """
+    heading_prefix = "#" * level
+    chunks = []
+
+    for key, value in data.items():
+        # Every key gets a header at the current depth, whatever its value is.
+        chunks.append(f"{heading_prefix} {key}\n")
+        if isinstance(value, dict):
+            chunks.append(dict_to_markdown(value, level + 1))
+        else:
+            chunks.append(f"    {value}\n")
+
+    return "".join(chunks)
+
+
 def display_table(df: pd.DataFrame):
     df = df.copy()
     df.columns = clean_column_names(df.columns)
@@ -358,6 +385,9 @@ def run_gradio(results_dir: Path):
             with gr.Tab("Task Error") as tab_error:
                 task_error = gr.Markdown()
 
+            with gr.Tab("Error Analysis") as tab_error_analysis:
+                error_analysis = gr.Markdown()
+
             with gr.Tab("Logs") as tab_logs:
                 logs = gr.Code(language=None, **code_args)
 
@@ -485,6 +515,7 @@ def run_gradio(results_dir: Path):
         tab_axtree.select(fn=update_axtree, outputs=axtree_code)
         tab_chat.select(fn=update_chat_messages, outputs=chat_messages)
         tab_error.select(fn=update_task_error, outputs=task_error)
+        tab_error_analysis.select(fn=update_error_analysis, outputs=error_analysis)
         tab_logs.select(fn=update_logs, outputs=logs)
         tab_stats.select(fn=update_stats, outputs=stats)
         tab_agent_info_html.select(fn=update_agent_info_html, outputs=agent_info_html)
@@ -612,6 +643,20 @@ def update_task_error():
         return "No Task Error"
 
 
+def update_error_analysis():
+    """Load the selected experiment's error_analysis.json and render it as Markdown."""
+    global info
+    try:
+        error_analysis = info.exp_result.exp_dir / "error_analysis.json"
+        with error_analysis.open("r") as f:
+            json_data = json.load(f)
+    except FileNotFoundError:
+        return "No Error Analysis Found"
+    except json.JSONDecodeError as e:
+        return f"Invalid error_analysis.json: {e}"
+    return dict_to_markdown(json_data)
+
+
 def update_logs():
     global info
     try:
@@ -1200,3 +1245,4 @@ def main():
 
 if __name__ == "__main__":
     main()
+    main()

From 2be23e5270ac3eb7ef99d675c3a9303548d4c733 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Tue, 28 Jan 2025 11:12:37 -0500
Subject: [PATCH 15/25] added formatting options

---
 src/agentlab/analyze/error_analysis/pipeline.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
index 62c313aa..4330cce2 100644
--- a/src/agentlab/analyze/error_analysis/pipeline.py
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -52,6 +52,10 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
             json.dump(error_analysis, f, indent=4)
 
 
+AXTREE_FORMATTER = lambda x: x.get("axtree_txt", "No AXTREE available")
+HTML_FORMATTER = lambda x: x.get("pruned_html", "No HTML available")
+
+
 if __name__ == "__main__":
     import argparse
 
@@ -73,7 +77,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
         filter=filter,
-        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm), llm),
+        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, HTML_FORMATTER), llm),
     )
 
     pipeline.run_analysis()

From 41f8f69f1719954f934a49dc03a79bc0a1db0b1b Mon Sep 17 00:00:00 2001
From: Megh Thakkar <Megh-Thakkar@users.noreply.github.com>
Date: Wed, 29 Jan 2025 13:22:53 -0500
Subject: [PATCH 16/25] Update summarizer_prompts.py

---
 .../error_analysis/summarizer_prompts.py      | 106 +++++++-----------
 1 file changed, 39 insertions(+), 67 deletions(-)

diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
index a37be0a9..2b893d6e 100644
--- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -53,55 +53,37 @@
 
 ERROR_CLASSIFICATION_PROMPT = """
 You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
-Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
-followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), 
+Below are the high-level definitions of each category,
+followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.), 
 a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
 
 --------------------------------------------------------------------------------
 TAXONOMY DEFINITIONS
 --------------------------------------------------------------------------------
 
-1. AGENT ERRORS
-These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
+1. Navigation & Planning Errors
+  The agent cannot construct or execute a correct sequence of actions to reach its goal 
+  (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
 
-   - Navigation & Planning Errors
-     The agent cannot construct or execute a correct sequence of actions to reach its goal 
-     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+2. Interaction Execution Errors
+  The agent enters data in the wrong format, forgets to click "Submit" after typing, 
+  repeats the same failing action without adaptation, or loses track of the changing webpage state.
 
-   - Interaction Execution Errors
-     The agent enters data in the wrong format, forgets to click "Submit" after typing, 
-     repeats the same failing action without adaptation, or loses track of the changing webpage state.
+3. Information Processing Errors
+  The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
+  misconstrues relationships between pieces of information, or fails to validate data against task requirements.
 
-   - Information Processing Errors
-     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
-     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+4. Observation & Action Errors
+  The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+  or misaligns its actions (clicks the wrong element or stale link).
 
-   - Observation & Action Errors
-     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
-     or misaligns its actions (clicks the wrong element or stale link).
+5. Task Understanding Errors
+  The agent misreads or misunderstands the user's objective (goal interpretation), 
+  loses crucial context (context loss), or performs actions beyond or short of the intended scope.
 
-2. LANGUAGE MODEL ERRORS
-These errors result from the model's inability to correctly interpret or reason about the task at a higher level, 
-independent of the low-level web interactions.
-
-   - Task Understanding Errors
-     The agent misreads or misunderstands the user's objective (goal interpretation), 
-     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
-
-   - Reasoning Failures
-     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
-     or fails to prioritize important subtasks when handling complex goals.
-
-3. BENCHMARK & ENVIRONMENT ERRORS
-These errors are external to the agent's logic and the language model's reasoning, 
-arising from flaws in the system, network, or evaluation framework itself.
-
-   - System Errors
-     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
-
-   - Benchmark Design Errors
-     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), 
-     or inflexible evaluation systems that fail to account for valid alternative solutions.
+6. Reasoning Failures
+  The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
+  or fails to prioritize important subtasks when handling complex goals.
 
 --------------------------------------------------------------------------------
 INPUT DESCRIPTION
@@ -124,34 +106,19 @@
 FEW-SHOT CLASSIFICATION EXAMPLES
 --------------------------------------------------------------------------------
 
-1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
-   • Context: The agent correctly finds a cheaper product meeting the user's criteria, 
-     but the benchmark expects a more expensive product and marks the solution as wrong.
-   • Classification: ["Benchmark Design Error"]
-   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid 
-     and does not allow an alternative correct solution.
-
-2) EXAMPLE B (Agent Error - Interaction Execution)
+1) EXAMPLE A (Interaction Execution)
    • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
      Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
-   • Classification: ["Agent Error - Interaction Execution"]
+   • Classification: ["Interaction Execution"]
    • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
      without adaptation ("Action Repetition").
 
-3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
-   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" 
-     The query is ambiguous because "Upitts" is not a standard location. 
-     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
-   • Classification: ["Benchmark Design Error"]
-   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), 
-     leading the agent astray due to unclear context.
-
-4) EXAMPLE D (Language Model Error - Task Understanding)
+2) EXAMPLE B (Task Understanding)
    • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
      that are older than 30 days and add a comment saying 'I can help fix this.'" 
      The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
      with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
-   • Classification: ["Language Model Error - Task Understanding"]
+   • Classification: ["Task Understanding"]
    • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
      it focused on creating a new issue. This is a misinterpretation of the instructions, 
      not a mechanical error in clicking or input format.
@@ -166,23 +133,28 @@
    - The current HTML or AX Tree observation
    - The user goal
 
-2. Decide if the failure is:
-   - An Agent Error (which subcategory/subcategories),
-   - A Language Model Error (which subcategory/subcategories),
-   - A Benchmark/Environment Error (which subcategory/subcategories),
-   - Or a combination thereof (multi-label if needed).
+2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies.
+   If the task is successful, you can keep the error category as blank.
 
 3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
 
-4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
+Output format example for an unsuccessful interaction:
+{{
+  "explanation": "The agent opened the wrong GitLab page and never recovered...",
+  "success": False,
+  "errorCategory": ["Navigation & Planning"],
+}}
 
-Output Format Example:
+Output format example for a successful interaction:
 {{
-  "errorCategory": ["Agent Error - Navigation & Planning"],
-  "explanation": "The agent opened the wrong GitLab page and never recovered..."
+  "explanation": "The agent opened the correct GitLab page and ...",
+  "success": True,
+  "errorCategory": [],
 }}
 
-Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+Please follow this structure at every step. Keep your responses concise and clear. 
+
+Below are the details for the interaction.
 
 Overall goal: {goal}
 

From 6163b47a7caf527bd08ee820f33a4cabb6d94b9f Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Wed, 29 Jan 2025 19:30:10 +0000
Subject: [PATCH 17/25] parse LLM outputs as XML tags instead of JSON

---
 .../analyze/error_analysis/pipeline.py        |  4 +-
 .../analyze/error_analysis/summarizer.py      | 29 ++++++++++----
 .../error_analysis/summarizer_prompts.py      | 39 +++++++++----------
 3 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
index 4330cce2..887a0ba3 100644
--- a/src/agentlab/analyze/error_analysis/pipeline.py
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -69,7 +69,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
 
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
-    llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"].make_model()
+    llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model()
 
     step_summarizer = ChangeSummarizer(llm, lambda x: x)
     episode_summarizer = EpisodeSummarizer()
@@ -77,7 +77,7 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
         filter=filter,
-        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, HTML_FORMATTER), llm),
+        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm),
     )
 
     pipeline.run_analysis()
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index 5c1fc343..7df5e754 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -7,7 +7,7 @@
     ERROR_CLASSIFICATION_PROMPT,
 )
 from agentlab.analyze.inspect_results import summarize
-from agentlab.llm.llm_utils import json_parser
+from agentlab.llm.llm_utils import json_parser, parse_html_tags
 
 
 def _diff(past_obs, current_obs):
@@ -39,7 +39,7 @@ def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]
         if self.use_diff:
             next_obs_message = _diff(obs_message, next_obs_message)
 
-        return self.llm(
+        return self.parse(self.llm(
             self.make_prompt(
                 obs_message,
                 action,
@@ -48,7 +48,7 @@ def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]
                 goal,
                 obs.obs.get("plan", "No plan available"),
             )
-        )
+        )['content'])
 
     def make_prompt(
         self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
@@ -63,6 +63,10 @@ def make_prompt(
             action=action,
         )
 
+    def parse(self, raw_output: str) -> dict:
+        parsed_result = parse_html_tags(raw_output, keys=["changeSummary", "actionAssessment", "explanation", "suggestion"])[0]
+        return parsed_result
+
 
 @dataclass
 class EpisodeAnalysis:
@@ -83,13 +87,13 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
     def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
 
-        if exp_results.steps_info[-1].reward == 1:
-            return {"analysis": "Success", "summaries": {}}
+        # if exp_results.steps_info[-1].reward == 1:
+        #     return {"analysis": "Success", "summaries": {}}
 
         summaries = self.make_change_summaries(exp_results)
         prompt = self.make_prompt(exp_results, summaries)
         raw_analysis = self.llm(prompt)["content"]
-        analysis = self.parser(raw_analysis)
+        analysis = self.parse(raw_analysis)
         return {
             "analysis": analysis,
             "summaries": {i: self.parser(a) for i, a in enumerate(summaries)},
@@ -102,10 +106,13 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
         # TODO:(thibault) make some checks or w/e
         for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
             summaries.append(
-                self.change_summarizer.summarize(step, next_step, summaries)["content"]
+                self.change_summarizer.summarize(step, next_step, summaries)
             )
         return summaries
 
+    def parse(self, raw_output: str) -> dict:
+        parsed_result = parse_html_tags(raw_output, keys=["explanation", "success", "errorCategory"])[0]
+        return parsed_result
 
 @dataclass
 class EpisodeErrorSummarizer(EpisodeSummarizer):
@@ -116,7 +123,13 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
         """TODO: Implement the prompt."""
         goal = exp_results.steps_info[0].obs["goal"]
 
-        txt_summaries = "\n".join(summaries)
+        def format_summary(summary):
+            res = ''
+            for key, value in summary.items():
+                res += f"{key}: {value}\n"
+            return res
+
+        txt_summaries = "\n".join([format_summary(summary) for summary in summaries])
 
         thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]]
         actions = [step.action for step in exp_results.steps_info[:-1]]
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
index 2b893d6e..807f1a2c 100644
--- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -21,20 +21,19 @@
 OUTPUT FORMAT (per step):
 Return your analysis as a JSON-like structure, for example:
 
-{{
-  "changeSummary": "A new search results panel appeared on the right side.",
-  "actionAssessment": "Correct",
-  "explanation": "Clicking 'Search' was appropriate to display the results."
-}}
+<changeSummary>A new search results panel appeared on the right side.</changeSummary>
+<actionAssessment>Correct</actionAssessment>
+<explanation>Clicking 'Search' was appropriate to display the results.</explanation>
 
 Or for an incorrect action:
 
-{{
-  "changeSummary": "The page reloaded but the date fields were reset to defaults.",
-  "actionAssessment": "Incorrect",
-  "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
-  "suggestion": "Correct the date format or check for error messages."
-}}
+<changeSummary>The page reloaded but the date fields were reset to defaults.</changeSummary>
+<actionAssessment>Incorrect</actionAssessment>
+<explanation>The agent should have fixed the date format first instead of re-clicking 'Show report'.</explanation>
+<suggestion>Correct the date format or check for error messages.</suggestion>
+
+
+Please use single quotes '' to quote elements from the page, so as not to create parsing issues.
 
 Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
 
@@ -139,19 +138,17 @@
 3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
 
 Output format example for an unsuccessful interaction:
-{{
-  "explanation": "The agent opened the wrong GitLab page and never recovered...",
-  "success": False,
-  "errorCategory": ["Navigation & Planning"],
-}}
+
+<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
+<success>False</success>
+<errorCategory>["Navigation & Planning"]</errorCategory>
 
 Output format example for a successful interaction:
-{{
-  "explanation": "The agent opened the correct GitLab page and ...",
-  "success": True,
-  "errorCategory": [],
-}}
 
+<explanation>The agent opened the correct GitLab page and ...</explanation>
+<success>True</success>
+<errorCategory>[]</errorCategory>
+  
 Please follow this structure at every step. Keep your responses concise and clear. 
 
 Below are the details for the interaction.

From a1f3416275ac8c5c616ba506db536ac6a5af598c Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Wed, 29 Jan 2025 19:33:49 +0000
Subject: [PATCH 18/25] fix: do not re-parse already parsed change summaries

---
 src/agentlab/analyze/error_analysis/summarizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index 7df5e754..fb2b47fe 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -96,7 +96,7 @@ def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         analysis = self.parse(raw_analysis)
         return {
             "analysis": analysis,
-            "summaries": {i: self.parser(a) for i, a in enumerate(summaries)},
+            "summaries": {i: a for i, a in enumerate(summaries)},
         }
 
     def make_change_summaries(self, exp_result: ExpResult) -> list[str]:

From a455d0d431b92491486f4b20591a455eff40f888 Mon Sep 17 00:00:00 2001
From: Leo Boisvert <leo.boisvert@hotmail.ca>
Date: Tue, 4 Feb 2025 20:50:40 +0000
Subject: [PATCH 19/25] add error analysis prediction validation script

---
 .../validate_analysis_predictions.py          | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 src/agentlab/analyze/error_analysis/validate_analysis_predictions.py

diff --git a/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py
new file mode 100644
index 00000000..af5613bb
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+from agentlab.analyze.inspect_results import (
+    load_result_df,
+)
+import json
+
+
+def get_aggregate_statistics(exp_dir: Path):
+    """Get aggregate statistics for the experiment results."""
+    results = load_result_df(exp_dir, filter=filter)
+
+
+if __name__ == "__main__":
+    path = Path(
+        "/mnt/colab_public/data/ui_copilot/thibault/tmlr_exps/2024-10-23_14-17-47_5_agents_on_workarena_l1"
+    )
+    results = load_result_df(path).reset_index()
+    results = results.loc[results["agent.chat_model.model_name"].str.contains("anthropic")]
+    success_predictions = []
+    for dir in results["exp_dir"]:
+        error_analysis = Path(dir) / "error_analysis.json"
+        if error_analysis.exists():
+            with open(error_analysis, "r") as f:
+                error_analysis = json.load(f)
+            task_success_prediction_str = error_analysis["analysis"]["success"]
+            task_success_prediction = True if task_success_prediction_str == "True" else False
+            success_predictions.append(task_success_prediction)
+        else:
+            success_predictions.append(None)
+    results["success_predictions"] = success_predictions
+    a = 1

From 82dbabad0289b56a02f6d5085ade2d1dbe9119aa Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Thu, 6 Feb 2025 11:01:40 -0500
Subject: [PATCH 20/25] pin black below 25 and reformat summarizer.py

---
 requirements.txt                              |  2 +-
 .../analyze/error_analysis/summarizer.py      | 37 +++++++++++--------
 2 files changed, 22 insertions(+), 17 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index c598b342..a59d4a4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-black[jupyter]>=24.2.0
+black[jupyter]>=24.2.0,<25
 blacken-docs
 pre-commit
 pytest==7.3.2
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index fb2b47fe..14ab10ba 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -39,16 +39,18 @@ def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]
         if self.use_diff:
             next_obs_message = _diff(obs_message, next_obs_message)
 
-        return self.parse(self.llm(
-            self.make_prompt(
-                obs_message,
-                action,
-                next_obs_message,
-                past_summaries,
-                goal,
-                obs.obs.get("plan", "No plan available"),
-            )
-        )['content'])
+        return self.parse(
+            self.llm(
+                self.make_prompt(
+                    obs_message,
+                    action,
+                    next_obs_message,
+                    past_summaries,
+                    goal,
+                    obs.obs.get("plan", "No plan available"),
+                )
+            )["content"]
+        )
 
     def make_prompt(
         self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
@@ -64,7 +66,9 @@ def make_prompt(
         )
 
     def parse(self, raw_output: str) -> dict:
-        parsed_result = parse_html_tags(raw_output, keys=["changeSummary", "actionAssessment", "explanation", "suggestion"])[0]
+        parsed_result = parse_html_tags(
+            raw_output, keys=["changeSummary", "actionAssessment", "explanation", "suggestion"]
+        )[0]
         return parsed_result
 
 
@@ -105,15 +109,16 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
         # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
         # TODO:(thibault) make some checks or w/e
         for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
-            summaries.append(
-                self.change_summarizer.summarize(step, next_step, summaries)
-            )
+            summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
         return summaries
 
     def parse(self, raw_output: str) -> dict:
-        parsed_result = parse_html_tags(raw_output, keys=["explanation", "success", "errorCategory"])[0]
+        parsed_result = parse_html_tags(
+            raw_output, keys=["explanation", "success", "errorCategory"]
+        )[0]
         return parsed_result
 
+
 @dataclass
 class EpisodeErrorSummarizer(EpisodeSummarizer):
 
@@ -124,7 +129,7 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
         goal = exp_results.steps_info[0].obs["goal"]
 
         def format_summary(summary):
-            res = ''
+            res = ""
             for key, value in summary.items():
                 res += f"{key}: {value}\n"
             return res

From 5fbbe57ae52e58d1f1f7ea1734cbcd95c25417a5 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Thu, 20 Feb 2025 15:41:53 -0500
Subject: [PATCH 21/25] add agentlab-analyze script, joblib parallel runs, drop thoughts from prompt

---
 pyproject.toml                                |  1 +
 .../analyze/error_analysis/pipeline.py        | 41 +++++++++++++++----
 .../analyze/error_analysis/summarizer.py      | 19 +++++----
 3 files changed, 45 insertions(+), 16 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2a1e06c3..782b1f26 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,3 +57,4 @@ exclude = '''
 [project.scripts]
 agentlab-assistant = "agentlab.ui_assistant:main"
 agentlab-xray = "agentlab.analyze.agent_xray:main"
+agentlab-analyze = "agentlab.analyze.error_analysis.pipeline:main"
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
index 887a0ba3..f726891f 100644
--- a/src/agentlab/analyze/error_analysis/pipeline.py
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -23,6 +23,11 @@ def __call__(self, *args, **kwds):
         return "analysis"
 
 
+def analyze(exp_result, episode_summarizer, save_analysis_func):
+    error_analysis = episode_summarizer(exp_result)
+    save_analysis_func(exp_result, error_analysis)
+
+
 @dataclass
 class ErrorAnalysisPipeline:
     exp_dir: Path
@@ -36,12 +41,21 @@ def filter_exp_results(self) -> Generator[ExpResult, None, None]:
             if self.filter is None or self.filter in str(exp_result.exp_dir):
                 yield exp_result
 
-    def run_analysis(self):
+    def run_analysis(self, parallel=False, jobs=-1):
         filtered_results = self.filter_exp_results()
 
-        for exp_result in filtered_results:
-            error_analysis = self.episode_summarizer(exp_result)
-            self.save_analysis(exp_result, error_analysis)
+        if parallel:
+            import joblib
+
+            joblib.Parallel(n_jobs=jobs, backend="threading")(
+                joblib.delayed(analyze)(exp_result, self.episode_summarizer, self.save_analysis)
+                for exp_result in filtered_results
+            )
+
+        else:
+            for exp_result in filtered_results:
+                error_analysis = self.episode_summarizer(exp_result)
+                self.save_analysis(exp_result, error_analysis)
 
     def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
         """Save the analysis to json"""
@@ -56,28 +70,37 @@ def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=T
 HTML_FORMATTER = lambda x: x.get("pruned_html", "No HTML available")
 
 
-if __name__ == "__main__":
+def main():
     import argparse
 
     parser = argparse.ArgumentParser()
     parser.add_argument("-e", "--exp_dir", type=str)
     parser.add_argument("-f", "--filter", type=str, default=None)
+    parser.add_argument("-p", "--parallel", action="store_true")
+    parser.add_argument("-j", "--jobs", type=int, default=-1)
 
     args = parser.parse_args()
+
+    assert args.exp_dir is not None, "Please provide an exp_dir, e.g., -e /path/to/exp_dir"
+
     exp_dir = Path(args.exp_dir)
     filter = args.filter
+    parallel = args.parallel
+    jobs = args.jobs
 
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
     llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model()
 
-    step_summarizer = ChangeSummarizer(llm, lambda x: x)
-    episode_summarizer = EpisodeSummarizer()
-
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
         filter=filter,
         episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm),
     )
 
-    pipeline.run_analysis()
+    pipeline.run_analysis(parallel=parallel, jobs=jobs)
+
+
+if __name__ == "__main__":
+
+    main()
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index 14ab10ba..0b667cd8 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -6,8 +6,8 @@
     CHANGE_SUMMARIZER_PROMPT,
     ERROR_CLASSIFICATION_PROMPT,
 )
-from agentlab.analyze.inspect_results import summarize
 from agentlab.llm.llm_utils import json_parser, parse_html_tags
+from agentlab.llm.tracking import set_tracker
 
 
 def _diff(past_obs, current_obs):
@@ -94,14 +94,20 @@ def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         # if exp_results.steps_info[-1].reward == 1:
         #     return {"analysis": "Success", "summaries": {}}
 
-        summaries = self.make_change_summaries(exp_results)
+        with set_tracker("summary") as summaries_tracker:
+            summaries = self.make_change_summaries(exp_results)
         prompt = self.make_prompt(exp_results, summaries)
-        raw_analysis = self.llm(prompt)["content"]
+
+        with set_tracker("analysis") as analysis_tracker:
+            raw_analysis = self.llm(prompt)["content"]
         analysis = self.parse(raw_analysis)
-        return {
+        res = {
             "analysis": analysis,
             "summaries": {i: a for i, a in enumerate(summaries)},
         }
+        res.update(analysis_tracker.stats)
+        res.update(summaries_tracker.stats)
+        return res
 
     def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
         summaries = []  # type: list[str]
@@ -136,7 +142,6 @@ def format_summary(summary):
 
         txt_summaries = "\n".join([format_summary(summary) for summary in summaries])
 
-        thoughts = [step.agent_info.think for step in exp_results.steps_info[:-1]]
         actions = [step.action for step in exp_results.steps_info[:-1]]
         action_errors = "\n".join(
             [step.obs["last_action_error"] for step in exp_results.steps_info[1:]]
@@ -144,8 +149,8 @@ def format_summary(summary):
 
         txt_actions = "\n".join(
             [
-                f"Thoughts: {thought}\nAction: {action}\nAction Error: {action_error}"
-                for action, thought, action_error in zip(actions, thoughts, action_errors)
+                f"Action: {action}\nAction Error: {action_error}"
+                for action, action_error in zip(actions, action_errors)
             ]
         )
         return ERROR_CLASSIFICATION_PROMPT.format(

From 3a3d602f83f363a3606ddee7de29b41b031c6d87 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Thu, 20 Feb 2025 16:05:12 -0500
Subject: [PATCH 22/25] add task_info to the error classification prompt

---
 .../analyze/error_analysis/summarizer.py         | 12 +++++++-----
 .../analyze/error_analysis/summarizer_prompts.py | 16 +++++-----------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index 0b667cd8..00a17e09 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -91,8 +91,8 @@ def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
     def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
 
-        # if exp_results.steps_info[-1].reward == 1:
-        #     return {"analysis": "Success", "summaries": {}}
+        if exp_results.steps_info[-1].reward == 1:
+            return {"analysis": "Success", "summaries": {}}
 
         with set_tracker("summary") as summaries_tracker:
             summaries = self.make_change_summaries(exp_results)
@@ -119,9 +119,7 @@ def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
         return summaries
 
     def parse(self, raw_output: str) -> dict:
-        parsed_result = parse_html_tags(
-            raw_output, keys=["explanation", "success", "errorCategory"]
-        )[0]
+        parsed_result = parse_html_tags(raw_output, keys=["explanation", "errorCategory"])[0]
         return parsed_result
 
 
@@ -153,8 +151,12 @@ def format_summary(summary):
                 for action, action_error in zip(actions, action_errors)
             ]
         )
+
+        extra_info = exp_results.steps_info[-1].task_info
+
         return ERROR_CLASSIFICATION_PROMPT.format(
             goal=goal,
             historical_summaries=txt_summaries,
             action_history=txt_actions,
+            extra_info=extra_info,
         )
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
index 807f1a2c..91ea8d9c 100644
--- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -132,30 +132,24 @@
    - The current HTML or AX Tree observation
    - The user goal
 
-2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies.
-   If the task is successful, you can keep the error category as blank.
+2. Decide the error category, or a combination thereof, under which the reason for failure lies.
 
 3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
 
-Output format example for an unsuccessful interaction:
+Output format example for an interaction:
 
 <explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
-<success>False</success>
 <errorCategory>["Navigation & Planning"]</errorCategory>
 
-Output format example for a successful interaction:
-
-<explanation>The agent opened the correct GitLab page and ...</explanation>
-<success>True</success>
-<errorCategory>[]</errorCategory>
-  
 Please follow this structure at every step. Keep your responses concise and clear. 
 
-Below are the details for the interaction.
+Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant.
 
 Overall goal: {goal}
 
 Historical change summaries: {historical_summaries}
 
 Action history: {action_history}
+
+Extra information: {extra_info}
 """

From 5bf1bac08a99840843f213e14703b2b412be0d43 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Thu, 20 Feb 2025 16:14:12 -0500
Subject: [PATCH 23/25] add flag to guess success instead of using the oracle reward

---
 .../analyze/error_analysis/pipeline.py        |   6 +-
 .../analyze/error_analysis/summarizer.py      |  15 ++-
 .../error_analysis/summarizer_prompts.py      | 113 ++++++++++++++++++
 3 files changed, 130 insertions(+), 4 deletions(-)

diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
index f726891f..32e5e9df 100644
--- a/src/agentlab/analyze/error_analysis/pipeline.py
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -78,6 +78,7 @@ def main():
     parser.add_argument("-f", "--filter", type=str, default=None)
     parser.add_argument("-p", "--parallel", action="store_true")
     parser.add_argument("-j", "--jobs", type=int, default=-1)
+    parser.add_argument("-g", "--guess_success", action="store_true")
 
     args = parser.parse_args()
 
@@ -87,6 +88,7 @@ def main():
     filter = args.filter
     parallel = args.parallel
     jobs = args.jobs
+    guess_success = args.guess_success
 
     from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
 
@@ -95,7 +97,9 @@ def main():
     pipeline = ErrorAnalysisPipeline(
         exp_dir=exp_dir,
         filter=filter,
-        episode_summarizer=EpisodeErrorSummarizer(ChangeSummarizer(llm, AXTREE_FORMATTER), llm),
+        episode_summarizer=EpisodeErrorSummarizer(
+            ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=guess_success
+        ),
     )
 
     pipeline.run_analysis(parallel=parallel, jobs=jobs)
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index 00a17e09..e184583d 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -5,6 +5,7 @@
 from agentlab.analyze.error_analysis.summarizer_prompts import (
     CHANGE_SUMMARIZER_PROMPT,
     ERROR_CLASSIFICATION_PROMPT,
+    ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT,
 )
 from agentlab.llm.llm_utils import json_parser, parse_html_tags
 from agentlab.llm.tracking import set_tracker
@@ -85,14 +86,16 @@ class EpisodeSummarizer:
     change_summarizer: ChangeSummarizer = None
     llm: callable = None
     parser: callable = lambda x: json_parser(x)[0]
+    guess_success: bool = False
 
     def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
 
     def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis:
         """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
 
-        if exp_results.steps_info[-1].reward == 1:
-            return {"analysis": "Success", "summaries": {}}
+        if not self.guess_success:
+            if exp_results.steps_info[-1].reward == 1:
+                return {"analysis": "Success", "summaries": {}}
 
         with set_tracker("summary") as summaries_tracker:
             summaries = self.make_change_summaries(exp_results)
@@ -154,7 +157,13 @@ def format_summary(summary):
 
         extra_info = exp_results.steps_info[-1].task_info
 
-        return ERROR_CLASSIFICATION_PROMPT.format(
+        prompt = (
+            ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT
+            if self.guess_success
+            else ERROR_CLASSIFICATION_PROMPT
+        )
+
+        return prompt.format(
             goal=goal,
             historical_summaries=txt_summaries,
             action_history=txt_actions,
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
index 91ea8d9c..a0df9fc9 100644
--- a/src/agentlab/analyze/error_analysis/summarizer_prompts.py
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -50,6 +50,119 @@
 Action: {action}
 """
 
+ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
+Below are the high-level definitions of each category,
+followed by an explanation of the inputs of the interaction you will receive (user goal, change summaries, action history, etc.), 
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. Navigation & Planning Errors
+  The agent cannot construct or execute a correct sequence of actions to reach its goal 
+  (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+2. Interaction Execution Errors
+  The agent enters data in the wrong format, forgets to click "Submit" after typing, 
+  repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+3. Information Processing Errors
+  The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
+  misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+4. Observation & Action Errors
+  The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+  or misaligns its actions (clicks the wrong element or stale link).
+
+5. Task Understanding Errors
+  The agent misreads or misunderstands the user's objective (goal interpretation), 
+  loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+6. Reasoning Failures
+  The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
+  or fails to prioritize important subtasks when handling complex goals.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+
+2. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+3. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
+     without adaptation ("Action Repetition").
+
+2) EXAMPLE B (Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
+     that are older than 30 days and add a comment saying 'I can help fix this.'" 
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
+     it focused on creating a new issue. This is a misinterpretation of the instructions, 
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The user goal
+   - The historical change summaries
+   - The action history
+   - The extra information, if present
+
+2. If you think the task was unsuccessful, decide which category, or combination of categories, best explains the failure.
+   If the task was successful, leave the error category list empty.
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+Output format example for an unsuccessful interaction:
+
+<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
+<success>False</success>
+<errorCategory>["Navigation & Planning"]</errorCategory>
+
+Output format example for a successful interaction:
+
+<explanation>The agent opened the correct GitLab page and ...</explanation>
+<success>True</success>
+<errorCategory>[]</errorCategory>
+  
+Please follow this structure at every step. Keep your responses concise and clear. 
+
+Below are the details for the interaction. Extra information contains additional details provided by the environment; it might not always be present or relevant.
+
+Overall goal: {goal}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+
+Extra information: {extra_info}
+"""
+
+
 ERROR_CLASSIFICATION_PROMPT = """
 You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
 Below are the high-level definitions of each category,

From 097213029f9575b88568ccf8eb127b9c4145b82e Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Thu, 20 Feb 2025 16:31:47 -0500
Subject: [PATCH 24/25] darglint

---
 .../analyze/error_analysis/base_idea.py       | 287 ------------------
 .../analyze/error_analysis/summarizer.py      |   7 +
 2 files changed, 7 insertions(+), 287 deletions(-)
 delete mode 100644 src/agentlab/analyze/error_analysis/base_idea.py

diff --git a/src/agentlab/analyze/error_analysis/base_idea.py b/src/agentlab/analyze/error_analysis/base_idea.py
deleted file mode 100644
index 5d4827d4..00000000
--- a/src/agentlab/analyze/error_analysis/base_idea.py
+++ /dev/null
@@ -1,287 +0,0 @@
-from dataclasses import dataclass
-
-from bgym import ExpResult, StepInfo
-
-CHANGE_SUMMARIZER_PROMPT = """
-You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, 
-you will receive the following pieces of information:
-
-1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
-2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
-3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
-4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
-5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
-
-YOUR TASK (each step):
-A) SUMMARIZE THE CHANGE
-   - Describe what visibly changed between the previous observation (or diff) and the current observation. 
-     For example, did a new panel open, did the form reset, did nothing happen, etc.?
-
-B) ASSESS THE ACTION
-   - Decide whether the agent's action seems helpful or correct given the user's main goal, 
-     or if it appears incorrect/unhelpful. 
-   - Briefly explain why.
-
-OUTPUT FORMAT (per step):
-Return your analysis as a JSON-like structure, for example:
-
-{
-  "changeSummary": "A new search results panel appeared on the right side.",
-  "actionAssessment": "Correct",
-  "explanation": "Clicking 'Search' was appropriate to display the results."
-}
-
-Or for an incorrect action:
-
-{
-  "changeSummary": "The page reloaded but the date fields were reset to defaults.",
-  "actionAssessment": "Incorrect",
-  "explanation": "The agent should have fixed the date format first instead of re-clicking 'Show report'.",
-  "suggestion": "Correct the date format or check for error messages."
-}
-
-Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
-
-Goal: {goal}
-
-LLM Plan: {plan}
-
-Previous Observation: {past_observation}
-
-Current Observation: {current_observation}
-
-Past summaries: {past_summaries}
-
-Action: {action}
-"""
-
-ERROR_CLASSIFICATION_PROMPT = """
-You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
-Below are the high-level definitions of each top-level category (Agent Errors, Language Model Errors, and Benchmark/Environment Errors),
-followed by an explanation of the inputs you will receive (planning history, chain of thought, etc.), 
-a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
-
---------------------------------------------------------------------------------
-TAXONOMY DEFINITIONS
---------------------------------------------------------------------------------
-
-1. AGENT ERRORS
-These errors arise when agents interact with web interfaces and fail due to limitations in perception, navigation, or manipulation.
-
-   - Navigation & Planning Errors
-     The agent cannot construct or execute a correct sequence of actions to reach its goal 
-     (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
-
-   - Interaction Execution Errors
-     The agent enters data in the wrong format, forgets to click "Submit" after typing, 
-     repeats the same failing action without adaptation, or loses track of the changing webpage state.
-
-   - Information Processing Errors
-     The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
-     misconstrues relationships between pieces of information, or fails to validate data against task requirements.
-
-   - Observation & Action Errors
-     The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
-     or misaligns its actions (clicks the wrong element or stale link).
-
-2. LANGUAGE MODEL ERRORS
-These errors result from the model's inability to correctly interpret or reason about the task at a higher level, 
-independent of the low-level web interactions.
-
-   - Task Understanding Errors
-     The agent misreads or misunderstands the user's objective (goal interpretation), 
-     loses crucial context (context loss), or performs actions beyond or short of the intended scope.
-
-   - Reasoning Failures
-     The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
-     or fails to prioritize important subtasks when handling complex goals.
-
-3. BENCHMARK & ENVIRONMENT ERRORS
-These errors are external to the agent's logic and the language model's reasoning, 
-arising from flaws in the system, network, or evaluation framework itself.
-
-   - System Errors
-     Network failures, API downtime, or dynamic web changes that break the agent's assumptions (e.g., layout shifts).
-
-   - Benchmark Design Errors
-     Ambiguous or contradictory task specifications, incorrect validation criteria (where correct solutions are flagged as failures), 
-     or inflexible evaluation systems that fail to account for valid alternative solutions.
-
---------------------------------------------------------------------------------
-INPUT DESCRIPTION
---------------------------------------------------------------------------------
-
-You will receive the following for each scenario:
-1. User Goal
-   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
-   
-2. Planning / Thought History
-   - The internal reasoning or plan the agent considered. May include branches of logic or key decision points.
-
-3. Current Observation (HTML / AX Tree Snippet)
-   - The webpage structure or state that the agent sees at a given point in time.
-
-4. Historical change summaries
-   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
-
-5. Action History
-   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
-     along with immediate outcomes or errors.
-
-Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
-
---------------------------------------------------------------------------------
-FEW-SHOT CLASSIFICATION EXAMPLES
---------------------------------------------------------------------------------
-
-1) EXAMPLE A (Benchmark Error - Benchmark Design Error)
-   • Context: The agent correctly finds a cheaper product meeting the user's criteria, 
-     but the benchmark expects a more expensive product and marks the solution as wrong.
-   • Classification: ["Benchmark Design Error"]
-   • Justification: The agent's solution is objectively valid, but the evaluation framework is too rigid 
-     and does not allow an alternative correct solution.
-
-2) EXAMPLE B (Agent Error - Interaction Execution)
-   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
-     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
-   • Classification: ["Agent Error - Interaction Execution"]
-   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
-     without adaptation ("Action Repetition").
-
-3) EXAMPLE C (Benchmark Error - Benchmark Design Error)
-   • Context: The user asks, "Where is the nearest In-N-Out to Upitts?" 
-     The query is ambiguous because "Upitts" is not a standard location. 
-     The agent flounders, eventually returning "No In-N-Out found," which is incorrect for the region.
-   • Classification: ["Benchmark Design Error"]
-   • Justification: The task goal is poorly specified ("Upitts" is ambiguous or unrealistic), 
-     leading the agent astray due to unclear context.
-
-4) EXAMPLE D (Language Model Error - Task Understanding)
-   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
-     that are older than 30 days and add a comment saying 'I can help fix this.'" 
-     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
-     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
-   • Classification: ["Language Model Error - Task Understanding"]
-   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
-     it focused on creating a new issue. This is a misinterpretation of the instructions, 
-     not a mechanical error in clicking or input format.
-
---------------------------------------------------------------------------------
-CLASSIFICATION TASK
---------------------------------------------------------------------------------
-
-1. Read through:
-   - The planning and thought history
-   - The action history
-   - The current HTML or AX Tree observation
-   - The user goal
-
-2. Decide if the failure is:
-   - An Agent Error (which subcategory/subcategories),
-   - A Language Model Error (which subcategory/subcategories),
-   - A Benchmark/Environment Error (which subcategory/subcategories),
-   - Or a combination thereof (multi-label if needed).
-
-3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
-
-4. If the agent succeeds (no error), label the errorCategory accordingly as "Success".
-
-Output Format Example:
-{
-  "errorCategory": ["Agent Error - Navigation & Planning"],
-  "explanation": "The agent opened the wrong GitLab page and never recovered..."
-}
-
-Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
-
-Overall goal: {goal}
-
-LLM Plan and thought history: {plan}
-
-Current Observation: {current_observation}
-
-Historical change summaries: {historical_summaries}
-
-Action history: {action_history}
-"""
-
-
-def _diff(past_obs, current_obs):
-    """TODO: Implement the diff function.
-
-    Returns a diff version of current_obs compares to past_obs, unless there is too many changes.
-    """
-    raise ValueError("Not implemented yet.")
-
-
-@dataclass
-class ChangeSummarizer:
-
-    llm: callable  # language model
-    obs_formatter: callable
-    use_diff: bool = False
-
-    def summarize(
-        self, past_obs: dict, action: str, current_obs: dict, past_summaries: list[str]
-    ) -> str:
-        """Produces, a summary of the effect of an action."""
-        past_obs_message = self.obs_formatter(past_obs)
-        current_obs_message = self.obs_formatter(current_obs)
-
-        goal = past_obs["goal"]  # Use goal object from agentlab
-        # Outsource everything to formatter
-        plan = past_obs["plan"]
-        if self.use_diff:
-            current_obs_message = _diff(past_obs_message, current_obs_message)
-
-        return self.llm(
-            self.make_prompt(
-                past_obs_message, action, current_obs_message, past_summaries, goal, plan
-            )
-        )
-
-    def make_prompt(
-        self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
-    ):
-        """TODO: Implement the prompt."""
-        return CHANGE_SUMMARIZER_PROMPT.format(
-            goal=goal,
-            plan=plan,
-            past_observation=past_obs_message,
-            current_observation=current_obs_message,
-            past_summaries=past_summaries,
-            action=action,
-        )
-
-
-@dataclass
-class EpisodeAnalysis:
-    analysis: str  # complete analysis of the episode
-    summary: str  # short summary of the analysis
-    categories: dict[str, float]  # score for each category e.g. type of error or difficulty levels
-
-
-@dataclass
-class EpisodeSummarizer:
-
-    change_summarizer: ChangeSummarizer = None
-
-    def summarize(exp_results: list[ExpResult], change_summaries: list[str]) -> EpisodeAnalysis:
-        """Run Change Summarizer for every step in the episode or extract a pre-computed one."""
-        pass
-
-
-@dataclass
-class EpisodeErrorSummarizer(EpisodeSummarizer):
-
-    change_summarizer: ChangeSummarizer = None
-
-    def make_prompt(self, current_observation, action_history, historical_summaries, goal, plan):
-        """TODO: Implement the prompt."""
-        return ERROR_CLASSIFICATION_PROMPT.format(
-            goal=goal,
-            plan=plan,
-            current_observation=current_observation,
-            historical_summaries=historical_summaries,
-            action_history=action_history,
-        )
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
index e184583d..2919f052 100644
--- a/src/agentlab/analyze/error_analysis/summarizer.py
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -15,6 +15,13 @@ def _diff(past_obs, current_obs):
     """TODO: Implement the diff function.
 
     Returns a diff version of current_obs compares to past_obs, unless there is too many changes.
+
+    Args:
+        past_obs: The past observation.
+        current_obs: The current observation.
+
+    Raises:
+        ValueError: Not implemented yet.
     """
     raise ValueError("Not implemented yet.")
 

From 46d10754c3d9220b48d07319982f90ab7f5e2042 Mon Sep 17 00:00:00 2001
From: ThibaultLSDC <thibault.de.chezelles@gmail.com>
Date: Thu, 20 Feb 2025 17:00:22 -0500
Subject: [PATCH 25/25] tests

---
 tests/analyze/error_analysis/test_pipeline.py | 26 ++-----------------
 .../analyze/error_analysis/test_summarizer.py | 12 +++++++--
 2 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/tests/analyze/error_analysis/test_pipeline.py b/tests/analyze/error_analysis/test_pipeline.py
index f9570c2b..a2f6295d 100644
--- a/tests/analyze/error_analysis/test_pipeline.py
+++ b/tests/analyze/error_analysis/test_pipeline.py
@@ -16,7 +16,7 @@ def summarize(
 
 
 class MockEpisodeSummarizer:
-    def summarize(self, exp_result: ExpResult, step_analysis: list[str]) -> str:
+    def __call__(self, exp_result: ExpResult) -> str:
         return f"Agent did actions {', '.join(step.action for step in exp_result.steps_info if step.action)}"
 
 
@@ -33,8 +33,6 @@ def pipeline() -> ErrorAnalysisPipeline:
         exp_dir=exp_dir,
         filter=None,
         episode_summarizer=MockEpisodeSummarizer(),
-        step_summarizer=MockStepSummarizer(),
-        analyzer=MockAnalyzer(),
     )
 
 
@@ -49,30 +47,10 @@ def test_yield_with_filter(pipeline: ErrorAnalysisPipeline):
     pipeline.filter = None
 
 
-def test_analyze_step(pipeline: ErrorAnalysisPipeline):
-    exp_result = next(pipeline.filter_exp_results())
-    step_analysis = pipeline.analyze_step(exp_result)
-
-    assert len(exp_result.steps_info) == len(step_analysis) + 1
-    assert step_analysis[0] == f"Agent took action {exp_result.steps_info[0].action} at step 0"
-
-
-def test_analyze_episode(pipeline: ErrorAnalysisPipeline):
-    exp_result = next(pipeline.filter_exp_results())
-    step_analysis = pipeline.analyze_step(exp_result)
-    episode_analysis = pipeline.analyze_episode(exp_result, step_analysis)
-
-    for step_info in exp_result.steps_info:
-        if step_info.action:
-            assert step_info.action in episode_analysis
-
-
 def test_save_analysis(pipeline: ErrorAnalysisPipeline):
     exp_result = next(pipeline.filter_exp_results())
-    step_analysis = pipeline.analyze_step(exp_result)
-    episode_analysis = pipeline.analyze_episode(exp_result, step_analysis)
-    error_analysis = pipeline.analyze_errors(exp_result, episode_analysis, step_analysis)
 
+    error_analysis = pipeline.episode_summarizer(exp_result)
     pipeline.save_analysis(exp_result, error_analysis, exists_ok=False)
 
     assert (exp_result.exp_dir / "error_analysis.json").exists()
diff --git a/tests/analyze/error_analysis/test_summarizer.py b/tests/analyze/error_analysis/test_summarizer.py
index e9fe0ecc..83418496 100644
--- a/tests/analyze/error_analysis/test_summarizer.py
+++ b/tests/analyze/error_analysis/test_summarizer.py
@@ -13,10 +13,18 @@ def exp_results() -> list[ExpResult]:
     return list(yield_all_exp_results(exp_dir))
 
 
+@pytest.mark.pricy
 def test_change_summarizer(exp_results: list[ExpResult]):
-    summarizer = ChangeSummarizer(llm=lambda x: x)
+    summarizer = ChangeSummarizer(llm=lambda x: {"content": x})
     step = exp_results[0].steps_info[0]
     next_step = exp_results[0].steps_info[1]
     past_summaries = []
     summary = summarizer.summarize(step, next_step, past_summaries)
-    assert isinstance(summary, str)
+    assert isinstance(summary, dict)
+
+
+if __name__ == "__main__":
+    exp_res = list(
+        yield_all_exp_results(Path(__file__).parent.parent.parent / "data/error_analysis")
+    )
+    test_change_summarizer(exp_res)