diff --git a/pyproject.toml b/pyproject.toml
index 2a1e06c3..782b1f26 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,3 +57,4 @@ exclude = '''
 [project.scripts]
 agentlab-assistant = "agentlab.ui_assistant:main"
 agentlab-xray = "agentlab.analyze.agent_xray:main"
+agentlab-analyze = "agentlab.analyze.error_analysis.pipeline:main"
diff --git a/requirements.txt b/requirements.txt
index c598b342..a59d4a4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-black[jupyter]>=24.2.0
+black[jupyter]>=24.2.0,<25
 blacken-docs
 pre-commit
 pytest==7.3.2
diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
index e21ada58..f34d630c 100644
--- a/src/agentlab/agents/generic_agent/agent_configs.py
+++ b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -257,7 +257,7 @@
 )
 
 AGENT_4o_MINI = GenericAgentArgs(
-    chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],
+    chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],
     flags=FLAGS_GPT_4o,
 )
 AGENT_CLAUDE_SONNET_35 = GenericAgentArgs(
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 9764898c..7466db87 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -1,4 +1,5 @@
 import base64
+import json
 import os
 import traceback
 from copy import deepcopy
@@ -30,6 +31,32 @@
 TASK_SEED_KEY = "env.task_seed"
 
 
+def dict_to_markdown(data, level=1):
+    """
+    Render a (possibly nested) dictionary as Markdown.
+
+    Every key becomes a header whose depth matches its nesting level. A nested
+    dictionary is rendered recursively one level deeper; any other value is
+    written on the following line, indented by four spaces.
+
+    Parameters:
+        data (dict): The dictionary to convert.
+        level (int): Number of '#' used for the headers of this dictionary (default is 1).
+
+    Returns:
+        str: The formatted Markdown string ("" for an empty dictionary).
+    """
+    header_prefix = "#" * level
+    chunks = []
+    for name, content in data.items():
+        chunks.append(f"{header_prefix} {name}\n")
+        if isinstance(content, dict):
+            chunks.append(dict_to_markdown(content, level + 1))
+        else:
+            chunks.append(f"    {content}\n")
+    return "".join(chunks)
+
+
 def display_table(df: pd.DataFrame):
     df = df.copy()
     df.columns = clean_column_names(df.columns)
@@ -358,6 +385,9 @@ def run_gradio(results_dir: Path):
             with gr.Tab("Task Error") as tab_error:
                 task_error = gr.Markdown()
 
+            with gr.Tab("Error Analysis") as tab_error_analysis:
+                error_analysis = gr.Markdown()
+
             with gr.Tab("Logs") as tab_logs:
                 logs = gr.Code(language=None, **code_args)
 
@@ -485,6 +515,7 @@ def run_gradio(results_dir: Path):
         tab_axtree.select(fn=update_axtree, outputs=axtree_code)
         tab_chat.select(fn=update_chat_messages, outputs=chat_messages)
         tab_error.select(fn=update_task_error, outputs=task_error)
+        tab_error_analysis.select(fn=update_error_analysis, outputs=error_analysis)
         tab_logs.select(fn=update_logs, outputs=logs)
         tab_stats.select(fn=update_stats, outputs=stats)
         tab_agent_info_html.select(fn=update_agent_info_html, outputs=agent_info_html)
@@ -612,6 +643,20 @@ def update_task_error():
         return "No Task Error"
 
 
+def update_error_analysis():
+    """Return the selected episode's error_analysis.json rendered as Markdown."""
+    global info
+    try:
+        error_analysis = info.exp_result.exp_dir / "error_analysis.json"
+        if not error_analysis.exists():
+            return "No Error Analysis Found"
+        return dict_to_markdown(json.loads(error_analysis.read_text()))
+    except FileNotFoundError:
+        return "No Error Analysis"
+    except json.JSONDecodeError as e:  # e.g. a truncated file that is still being written
+        return f"Error Analysis file is not valid JSON: {e}"
+
+
 def update_logs():
     global info
     try:
@@ -1200,3 +1245,3 @@ def main():
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/src/agentlab/analyze/error_analysis/__init__.py b/src/agentlab/analyze/error_analysis/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
new file mode 100644
index 00000000..32e5e9df
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -0,0 +1,110 @@
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Generator
+
+from bgym import ExpResult
+
+from agentlab.analyze.error_analysis.summarizer import (
+    ChangeSummarizer,
+    EpisodeErrorSummarizer,
+    EpisodeSummarizer,
+)
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+
+@dataclass
+class Analyzer:  # placeholder: ignores its inputs and always returns the string "analysis"
+    prompt: str  # the only dataclass field (required constructor argument)
+    llm = None  # no annotation, so this is a plain class attribute, not a dataclass field
+
+    def __call__(self, *args, **kwds):  # accepts anything; the arguments are unused
+        return "analysis"
+
+
+def analyze(exp_result, episode_summarizer, save_analysis_func):
+    """Summarize one experiment and persist the result (unit of work for parallel runs)."""
+    save_analysis_func(exp_result, episode_summarizer(exp_result))
+
+
+@dataclass
+class ErrorAnalysisPipeline:
+    """Runs an episode summarizer over the experiments found under `exp_dir`."""
+
+    exp_dir: Path
+    filter: str = None  # keep only experiments whose directory path contains this substring
+    episode_summarizer: EpisodeSummarizer = None
+
+    def filter_exp_results(self) -> Generator[ExpResult, None, None]:
+        """Yield the experiments selected by `filter` (all of them when it is None)."""
+        # TODO:(thibault) improve filtering
+        for candidate in yield_all_exp_results(self.exp_dir):
+            if self.filter is None or self.filter in str(candidate.exp_dir):
+                yield candidate
+
+    def run_analysis(self, parallel=False, jobs=-1):
+        """Summarize and save every selected experiment, optionally with joblib threads."""
+        selected = self.filter_exp_results()
+        if parallel:
+            import joblib
+
+            joblib.Parallel(n_jobs=jobs, backend="threading")(
+                joblib.delayed(analyze)(exp_result, self.episode_summarizer, self.save_analysis)
+                for exp_result in selected
+            )
+            return
+        for exp_result in selected:
+            analyze(exp_result, self.episode_summarizer, self.save_analysis)
+
+    def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
+        """Dump the analysis to `error_analysis.json` inside the experiment directory."""
+        destination = exp_result.exp_dir / "error_analysis.json"
+        if not exists_ok and destination.exists():
+            raise FileExistsError(f"{destination} already exists")
+        with destination.open("w") as f:
+            json.dump(error_analysis, f, indent=4)
+
+
+AXTREE_FORMATTER = lambda x: x.get("axtree_txt", "No AXTREE available")  # AXTree text or fallback
+HTML_FORMATTER = lambda x: x.get("pruned_html", "No HTML available")  # pruned HTML or fallback
+
+
+def main():
+    """Command-line entry point: run the error analysis over an experiment directory."""
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    # `required=True` lets argparse report a missing directory with a usage message,
+    # and unlike an `assert` the check is not stripped when running with `python -O`.
+    parser.add_argument(
+        "-e", "--exp_dir", type=Path, required=True, help="e.g., -e /path/to/exp_dir"
+    )
+    parser.add_argument("-f", "--filter", type=str, default=None)
+    parser.add_argument("-p", "--parallel", action="store_true")
+    parser.add_argument("-j", "--jobs", type=int, default=-1)
+    parser.add_argument("-g", "--guess_success", action="store_true")
+
+    args = parser.parse_args()
+
+    # Deferred import: argument errors and `--help` are handled before the LLM
+    # configuration is loaded.
+    from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
+
+    # One model instance is shared by the per-step and the per-episode summarizers.
+    llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model()
+
+    pipeline = ErrorAnalysisPipeline(
+        exp_dir=args.exp_dir,
+        filter=args.filter,
+        episode_summarizer=EpisodeErrorSummarizer(
+            ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=args.guess_success
+        ),
+    )
+
+    pipeline.run_analysis(parallel=args.parallel, jobs=args.jobs)
+
+
+if __name__ == "__main__":
+
+    main()
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
new file mode 100644
index 00000000..2919f052
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -0,0 +1,178 @@
+from dataclasses import dataclass
+
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.summarizer_prompts import (
+    CHANGE_SUMMARIZER_PROMPT,
+    ERROR_CLASSIFICATION_PROMPT,
+    ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT,
+)
+from agentlab.llm.llm_utils import json_parser, parse_html_tags
+from agentlab.llm.tracking import set_tracker
+
+
+def _diff(past_obs, current_obs):
+    """Placeholder for the observation diff; currently always raises.
+
+    TODO: return a diff of current_obs compared to past_obs, unless there are too many changes.
+
+    Args:
+        past_obs: The past observation (unused for now).
+        current_obs: The current observation (unused for now).
+
+    Raises:
+        ValueError: Always, since the function is not implemented yet.
+    """
+    raise ValueError("Not implemented yet.")
+
+
+@dataclass
+class ChangeSummarizer:
+    """Asks an LLM to describe and assess the effect of a single agent action."""
+
+    llm: callable  # called with the prompt; the reply must expose a "content" item
+    obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available")
+    use_diff: bool = False  # when True, the next observation is replaced by `_diff(...)`
+
+    def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
+        """Summarize the effect of the action taken at `obs`.
+
+        Args:
+            obs: Step in which the action was taken; provides the goal, plan and action.
+            next_obs: Step observed after the action was executed.
+            past_summaries: Summaries of the previous steps, inserted into the prompt.
+
+        Returns:
+            The parsed LLM answer (see `parse`).
+        """
+        before = self.obs_formatter(obs.obs)
+        after = self.obs_formatter(next_obs.obs)
+        action = obs.action
+        # TODO(thibault): switch to 'goal_object'
+        # Outsource everything to formatter
+        goal = obs.obs["goal"]  # Use goal object from agentlab
+
+        if self.use_diff:
+            after = _diff(before, after)
+
+        plan = obs.obs.get("plan", "No plan available")
+        prompt = self.make_prompt(before, action, after, past_summaries, goal, plan)
+        reply = self.llm(prompt)
+        return self.parse(reply["content"])
+
+    def make_prompt(
+        self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
+    ):
+        """Fill CHANGE_SUMMARIZER_PROMPT with the goal, plan, observations and action."""
+        fields = {
+            "goal": goal,
+            "plan": plan,
+            "past_observation": past_obs_message,
+            "current_observation": current_obs_message,
+            "past_summaries": past_summaries,
+            "action": action,
+        }
+        return CHANGE_SUMMARIZER_PROMPT.format(**fields)
+
+    def parse(self, raw_output: str) -> dict:
+        """Return the first element of parse_html_tags' result for the expected tags."""
+        expected_tags = ["changeSummary", "actionAssessment", "explanation", "suggestion"]
+        return parse_html_tags(raw_output, keys=expected_tags)[0]
+
+
+@dataclass
+class EpisodeAnalysis:  # result container for an episode-level analysis
+    analysis: str  # full text of the episode analysis
+    summary: str  # short summary of that analysis
+    categories: dict[str, float]  # per-category score, e.g. error type or difficulty level
+
+
+@dataclass
+class EpisodeSummarizer:
+    """Summarizes every step of an episode, then asks the LLM to analyze the episode."""
+
+    change_summarizer: ChangeSummarizer = None
+    llm: callable = None
+    parser: callable = lambda x: json_parser(x)[0]
+    guess_success: bool = False  # if True, rewarded episodes are analyzed by the LLM too
+
+    def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ...
+
+    def __call__(self, exp_results: ExpResult) -> dict:
+        """Run the change summarizer on every step, then analyze the whole episode.
+
+        Returns a dict holding the parsed "analysis", the per-step "summaries" and the
+        stats of both trackers; without `guess_success`, rewarded episodes skip the LLM.
+        """
+        if not self.guess_success and exp_results.steps_info[-1].reward == 1:
+            return {"analysis": "Success", "summaries": {}}
+
+        with set_tracker("summary") as summaries_tracker:
+            summaries = self.make_change_summaries(exp_results)
+        prompt = self.make_prompt(exp_results, summaries)
+        with set_tracker("analysis") as analysis_tracker:
+            raw_analysis = self.llm(prompt)["content"]
+        res = {"analysis": self.parse(raw_analysis), "summaries": dict(enumerate(summaries))}
+        res.update(analysis_tracker.stats)
+        res.update(summaries_tracker.stats)
+        return res
+
+    def make_change_summaries(self, exp_result: ExpResult) -> list[str]:
+        """Summarize each (step, next step) pair, passing along the earlier summaries."""
+        summaries = []  # type: list[str]
+        # this assumes that there is always an extra step at the end of the episode
+        # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
+        # TODO:(thibault) make some checks or w/e
+        for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
+            summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
+        return summaries
+
+    def parse(self, raw_output: str) -> dict:
+        """Extract the answer tags; "success" is only expected when `guess_success` is set."""
+        keys = ["explanation", "errorCategory"] + (["success"] if self.guess_success else [])
+        return parse_html_tags(raw_output, keys=keys)[0]
+
+
+@dataclass
+class EpisodeErrorSummarizer(EpisodeSummarizer):
+    """Episode summarizer whose prompt asks the LLM to classify the agent's error."""
+
+    change_summarizer: ChangeSummarizer = None
+
+    def make_prompt(self, exp_results: ExpResult, summaries: list[str]):
+        """Build the classification prompt from the goal, summaries, actions and task info."""
+        goal = exp_results.steps_info[0].obs["goal"]
+
+        def format_summary(summary):
+            return "".join(f"{key}: {value}\n" for key, value in summary.items())
+
+        txt_summaries = "\n".join([format_summary(summary) for summary in summaries])
+
+        actions = [step.action for step in exp_results.steps_info[:-1]]
+        # Keep one error per action in a list (the error of an action is read from the
+        # step that follows it): joining them into a single string would make zip()
+        # below pair each action with a single character of that string.
+        action_errors = [step.obs["last_action_error"] for step in exp_results.steps_info[1:]]
+
+        txt_actions = "\n".join(
+            [
+                f"Action: {action}\nAction Error: {action_error}"
+                for action, action_error in zip(actions, action_errors)
+            ]
+        )
+
+        # Task info of the final step, passed to the prompt as extra information.
+        extra_info = exp_results.steps_info[-1].task_info
+
+        prompt = (
+            ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT
+            if self.guess_success
+            else ERROR_CLASSIFICATION_PROMPT
+        )
+
+        return prompt.format(
+            goal=goal,
+            historical_summaries=txt_summaries,
+            action_history=txt_actions,
+            extra_info=extra_info,
+        )
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
new file mode 100644
index 00000000..a0df9fc9
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -0,0 +1,268 @@
+CHANGE_SUMMARIZER_PROMPT = """
+You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, 
+you will receive the following pieces of information:
+
+1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
+2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
+3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
+4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
+5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
+
+YOUR TASK (each step):
+A) SUMMARIZE THE CHANGE
+   - Describe what visibly changed between the previous observation (or diff) and the current observation. 
+     For example, did a new panel open, did the form reset, did nothing happen, etc.?
+
+B) ASSESS THE ACTION
+   - Decide whether the agent's action seems helpful or correct given the user's main goal, 
+     or if it appears incorrect/unhelpful. 
+   - Briefly explain why.
+
+OUTPUT FORMAT (per step):
+Return your analysis using the following tags, for example:
+
+<changeSummary>A new search results panel appeared on the right side.</changeSummary>
+<actionAssessment>Correct</actionAssessment>
+<explanation>Clicking 'Search' was appropriate to display the results.</explanation>
+
+Or for an incorrect action:
+
+<changeSummary>The page reloaded but the date fields were reset to defaults.</changeSummary>
+<actionAssessment>Incorrect</actionAssessment>
+<explanation>The agent should have fixed the date format first instead of re-clicking 'Show report'.</explanation>
+<suggestion>Correct the date format or check for error messages.</suggestion>
+
+
+Please use single quotes '' to quote elements from the page, so as not to create parsing issues.
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Goal: {goal}
+
+LLM Plan: {plan}
+
+Previous Observation: {past_observation}
+
+Current Observation: {current_observation}
+
+Past summaries: {past_summaries}
+
+Action: {action}
+"""
+
+ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
+Below are the high-level definitions of each category,
+followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.), 
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. Navigation & Planning Errors
+  The agent cannot construct or execute a correct sequence of actions to reach its goal 
+  (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+2. Interaction Execution Errors
+  The agent enters data in the wrong format, forgets to click "Submit" after typing, 
+  repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+3. Information Processing Errors
+  The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
+  misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+4. Observation & Action Errors
+  The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+  or misaligns its actions (clicks the wrong element or stale link).
+
+5. Task Understanding Errors
+  The agent misreads or misunderstands the user's objective (goal interpretation), 
+  loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+6. Reasoning Failures
+  The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
+  or fails to prioritize important subtasks when handling complex goals.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+
+2. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+3. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
+     without adaptation ("Action Repetition").
+
+2) EXAMPLE B (Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
+     that are older than 30 days and add a comment saying 'I can help fix this.'" 
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
+     it focused on creating a new issue. This is a misinterpretation of the instructions, 
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The planning and thought history
+   - The action history
+   - The current HTML or AX Tree observation
+   - The user goal
+
+2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies.
+   If the task is successful, you can keep the error category as blank.
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+Output format example for an unsuccessful interaction:
+
+<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
+<success>False</success>
+<errorCategory>["Navigation & Planning"]</errorCategory>
+
+Output format example for a successful interaction:
+
+<explanation>The agent opened the correct GitLab page and ...</explanation>
+<success>True</success>
+<errorCategory>[]</errorCategory>
+  
+Please follow this structure at every step. Keep your responses concise and clear. 
+
+Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant.
+
+Overall goal: {goal}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+
+Extra information: {extra_info}
+"""
+
+
+ERROR_CLASSIFICATION_PROMPT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. 
+Below are the high-level definitions of each category,
+followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.), 
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. Navigation & Planning Errors
+  The agent cannot construct or execute a correct sequence of actions to reach its goal 
+  (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+2. Interaction Execution Errors
+  The agent enters data in the wrong format, forgets to click "Submit" after typing, 
+  repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+3. Information Processing Errors
+  The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), 
+  misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+4. Observation & Action Errors
+  The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+  or misaligns its actions (clicks the wrong element or stale link).
+
+5. Task Understanding Errors
+  The agent misreads or misunderstands the user's objective (goal interpretation), 
+  loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+6. Reasoning Failures
+  The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, 
+  or fails to prioritize important subtasks when handling complex goals.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+   - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+
+2. Historical change summaries
+   - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+3. Action History
+   - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) 
+     along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Interaction Execution)
+   • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
+     Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+   • Classification: ["Interaction Execution"]
+   • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action 
+     without adaptation ("Action Repetition").
+
+2) EXAMPLE B (Task Understanding)
+   • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' 
+     that are older than 30 days and add a comment saying 'I can help fix this.'" 
+     The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue 
+     with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+   • Classification: ["Task Understanding"]
+   • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, 
+     it focused on creating a new issue. This is a misinterpretation of the instructions, 
+     not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+   - The planning and thought history
+   - The action history
+   - The current HTML or AX Tree observation
+   - The user goal
+
+2. Decide the error category, or a combination thereof, under which the reason for failure lies.
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+Output format example for an interaction:
+
+<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
+<errorCategory>["Navigation & Planning"]</errorCategory>
+
+Please follow this structure at every step. Keep your responses concise and clear. 
+
+Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant.
+
+Overall goal: {goal}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+
+Extra information: {extra_info}
+"""
diff --git a/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py
new file mode 100644
index 00000000..af5613bb
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+from agentlab.analyze.inspect_results import (
+    load_result_df,
+)
+import json
+
+
+def get_aggregate_statistics(exp_dir: Path):
+    """Return the result dataframe of the experiments in `exp_dir`; aggregation is still TODO."""
+    return load_result_df(exp_dir)
+
+
+if __name__ == "__main__":
+    path = Path(
+        "/mnt/colab_public/data/ui_copilot/thibault/tmlr_exps/2024-10-23_14-17-47_5_agents_on_workarena_l1"
+    )
+    results = load_result_df(path).reset_index()
+    results = results.loc[results["agent.chat_model.model_name"].str.contains("anthropic")]
+    success_predictions = []
+    for exp_dir in results["exp_dir"]:
+        analysis_path = Path(exp_dir) / "error_analysis.json"
+        if not analysis_path.exists():
+            # No analysis file was written for this experiment.
+            success_predictions.append(None)
+            continue
+        with open(analysis_path, "r") as f:
+            error_analysis = json.load(f)
+        # The prediction is stored as text, so compare it with the literal string "True".
+        success_predictions.append(error_analysis["analysis"]["success"] == "True")
+    results["success_predictions"] = success_predictions
+    a = 1
diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py
index 096aae00..567e0798 100644
--- a/src/agentlab/llm/chat_api.py
+++ b/src/agentlab/llm/chat_api.py
@@ -4,7 +4,7 @@
 import time
 from dataclasses import dataclass
 from functools import partial
-from typing import Optional
+from typing import Optional, Union
 
 import openai
 from huggingface_hub import InferenceClient
@@ -13,7 +13,7 @@
 import agentlab.llm.tracking as tracking
 from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs
 from agentlab.llm.huggingface_utils import HFBaseChatModel
-from agentlab.llm.llm_utils import AIMessage, Discussion
+from agentlab.llm.llm_utils import AIMessage, Discussion, HumanMessage
 
 
 def make_system_message(content: str) -> dict:
@@ -268,7 +268,13 @@ def __init__(
             **client_args,
         )
 
-    def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float = None) -> dict:
+    def __call__(
+        self, messages: Union[str, list[dict]], n_samples: int = 1, temperature: float = None
+    ) -> dict:
+
+        if isinstance(messages, str):
+            messages = [HumanMessage(messages)]
+
         # Initialize retry tracking attributes
         self.retries = 0
         self.success = False
diff --git a/tests/analyze/error_analysis/test_pipeline.py b/tests/analyze/error_analysis/test_pipeline.py
new file mode 100644
index 00000000..a2f6295d
--- /dev/null
+++ b/tests/analyze/error_analysis/test_pipeline.py
@@ -0,0 +1,63 @@
+from pathlib import Path
+
+import pytest
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.pipeline import ErrorAnalysisPipeline
+
+exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis"
+
+
+class MockStepSummarizer:  # LLM-free stub; not referenced by the tests in this module
+    def summarize(
+        self, step: StepInfo, action: str, next_step: StepInfo, step_summaries: list[str]
+    ) -> str:  # only `action` and the number of earlier summaries are used
+        return f"Agent took action {action} at step {len(step_summaries)}"
+
+
+class MockEpisodeSummarizer:
+    def __call__(self, exp_result: ExpResult) -> str:  # LLM-free stand-in for the summarizer
+        return "Agent did actions " + ", ".join(s.action for s in exp_result.steps_info if s.action)
+
+
+class MockAnalyzer:
+    def __call__(
+        self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
+    ) -> dict:  # returns a dict wrapping the episode analysis
+        return {"error": "analysis", "episode": episode_analysis}
+
+
+@pytest.fixture(scope="module")
+def pipeline() -> ErrorAnalysisPipeline:
+    """Pipeline over the bundled test experiments, with a mock summarizer and no filter."""
+    # Module scope: the same instance is shared by every test of this module, so a
+    # test that changes it must restore it.
+    summarizer = MockEpisodeSummarizer()
+    return ErrorAnalysisPipeline(exp_dir=exp_dir, filter=None, episode_summarizer=summarizer)
+
+
+def test_yield_no_filter(pipeline: ErrorAnalysisPipeline):
+    assert sum(1 for _ in pipeline.filter_exp_results()) == 4  # no filter: nothing is dropped
+
+
+def test_yield_with_filter(pipeline: ErrorAnalysisPipeline, monkeypatch):
+    """Only the experiments whose path contains the pattern are yielded."""
+    # monkeypatch restores the shared (module-scoped) pipeline even if the assert fails.
+    monkeypatch.setattr(pipeline, "filter", "click-dialog")
+    assert len(list(pipeline.filter_exp_results())) == 2
+
+
+def test_save_analysis(pipeline: ErrorAnalysisPipeline):
+    """save_analysis writes error_analysis.json into the experiment directory."""
+    exp_result = next(pipeline.filter_exp_results())
+    analysis_path = exp_result.exp_dir / "error_analysis.json"
+    error_analysis = pipeline.episode_summarizer(exp_result)
+    try:
+        pipeline.save_analysis(exp_result, error_analysis, exists_ok=False)
+        assert analysis_path.exists()
+    finally:
+        analysis_path.unlink(missing_ok=True)  # never leave the file in the shared test data
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])  # run through pytest so the `pipeline` fixture gets injected
diff --git a/tests/analyze/error_analysis/test_summarizer.py b/tests/analyze/error_analysis/test_summarizer.py
new file mode 100644
index 00000000..83418496
--- /dev/null
+++ b/tests/analyze/error_analysis/test_summarizer.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+import pytest
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+
+@pytest.fixture(scope="module")
+def exp_results() -> list[ExpResult]:
+    data_root = Path(__file__).parent.parent.parent / "data/error_analysis"
+    return [*yield_all_exp_results(data_root)]  # materialize once per module
+
+
+@pytest.mark.pricy
+def test_change_summarizer(exp_results: list[ExpResult]):
+    """Smoke-test ChangeSummarizer on the first transition using an echo stub as the LLM."""
+    change_summarizer = ChangeSummarizer(llm=lambda x: {"content": x})
+    first_episode = exp_results[0]
+    current, following = first_episode.steps_info[0], first_episode.steps_info[1]
+    result = change_summarizer.summarize(current, following, [])
+    assert isinstance(result, dict)
+
+
+if __name__ == "__main__":
+    # Manual entry point: run the summarizer smoke test without going through pytest.
+    data_root = Path(__file__).parent.parent.parent / "data/error_analysis"
+    loaded_results = list(yield_all_exp_results(data_root))
+    test_change_summarizer(loaded_results)
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl
new file mode 100644
index 00000000..b2856641
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz
new file mode 100644
index 00000000..482f9b3d
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz
new file mode 100644
index 00000000..00267af0
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz
new file mode 100644
index 00000000..52c5209d
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz
new file mode 100644
index 00000000..b00c7bcf
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz
new file mode 100644
index 00000000..d7f75bd6
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json
new file mode 100644
index 00000000..34e6f226
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json
@@ -0,0 +1,44 @@
+{
+    "n_steps": 3,
+    "cum_reward": 1.0,
+    "cum_raw_reward": 0,
+    "err_msg": null,
+    "stack_trace": null,
+    "stats.cum_steps": 4,
+    "stats.cum_n_token_goal": 27,
+    "stats.max_n_token_goal": 9,
+    "stats.cum_n_token_url": 72,
+    "stats.max_n_token_url": 24,
+    "stats.cum_n_token_focused_element_bid": 3,
+    "stats.max_n_token_focused_element_bid": 1,
+    "stats.cum_n_token_last_action": 8,
+    "stats.max_n_token_last_action": 4,
+    "stats.cum_n_token_last_action_error": 0,
+    "stats.max_n_token_last_action_error": 0,
+    "stats.cum_n_token_dom_txt": 2892,
+    "stats.max_n_token_dom_txt": 966,
+    "stats.cum_n_token_axtree_txt": 667,
+    "stats.max_n_token_axtree_txt": 223,
+    "stats.cum_n_token_pruned_html": 1014,
+    "stats.max_n_token_pruned_html": 340,
+    "stats.cum_n_retry_llm": 3,
+    "stats.max_n_retry_llm": 1,
+    "stats.cum_n_retry": 0.0,
+    "stats.max_n_retry": 0.0,
+    "stats.cum_busted_retry": 0,
+    "stats.max_busted_retry": 0,
+    "stats.cum_input_tokens": 4339,
+    "stats.max_input_tokens": 1464,
+    "stats.cum_output_tokens": 225,
+    "stats.max_output_tokens": 84,
+    "stats.cum_cost": 0.00078585,
+    "stats.max_cost": 0.0002646,
+    "stats.cum_n_token_agent_messages": 4512,
+    "stats.max_n_token_agent_messages": 1517,
+    "stats.cum_step_elapsed": 3.0203144550323486,
+    "stats.max_step_elapsed": 1.3659462928771973,
+    "stats.cum_agent_elapsed": 3.8209800720214844,
+    "stats.max_agent_elapsed": 1.8219048976898193,
+    "terminated": true,
+    "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl
new file mode 100644
index 00000000..6bdd8639
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz
new file mode 100644
index 00000000..45522b9e
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz
new file mode 100644
index 00000000..1d3d08b1
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz
new file mode 100644
index 00000000..18c107bd
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz
new file mode 100644
index 00000000..d55bd69a
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json
new file mode 100644
index 00000000..6f351629
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json
@@ -0,0 +1,44 @@
+{
+    "n_steps": 2,
+    "cum_reward": 1.0,
+    "cum_raw_reward": 0,
+    "err_msg": null,
+    "stack_trace": null,
+    "stats.cum_steps": 3,
+    "stats.cum_n_token_goal": 12,
+    "stats.max_n_token_goal": 6,
+    "stats.cum_n_token_url": 48,
+    "stats.max_n_token_url": 24,
+    "stats.cum_n_token_focused_element_bid": 2,
+    "stats.max_n_token_focused_element_bid": 1,
+    "stats.cum_n_token_last_action": 4,
+    "stats.max_n_token_last_action": 4,
+    "stats.cum_n_token_last_action_error": 0,
+    "stats.max_n_token_last_action_error": 0,
+    "stats.cum_n_token_dom_txt": 1902,
+    "stats.max_n_token_dom_txt": 952,
+    "stats.cum_n_token_axtree_txt": 400,
+    "stats.max_n_token_axtree_txt": 201,
+    "stats.cum_n_token_pruned_html": 650,
+    "stats.max_n_token_pruned_html": 326,
+    "stats.cum_n_retry_llm": 2,
+    "stats.max_n_retry_llm": 1,
+    "stats.cum_n_retry": 0.0,
+    "stats.max_n_retry": 0.0,
+    "stats.cum_busted_retry": 0,
+    "stats.max_busted_retry": 0,
+    "stats.cum_input_tokens": 2789,
+    "stats.max_input_tokens": 1404,
+    "stats.cum_output_tokens": 128,
+    "stats.max_output_tokens": 65,
+    "stats.cum_cost": 0.00049515,
+    "stats.max_cost": 0.00024839999999999997,
+    "stats.cum_n_token_agent_messages": 2902,
+    "stats.max_n_token_agent_messages": 1459,
+    "stats.cum_step_elapsed": 6.860883951187134,
+    "stats.max_step_elapsed": 5.8696064949035645,
+    "stats.cum_agent_elapsed": 3.769465684890747,
+    "stats.max_agent_elapsed": 2.946484327316284,
+    "terminated": true,
+    "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl
new file mode 100644
index 00000000..71da24d7
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz
new file mode 100644
index 00000000..6f8de674
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz
new file mode 100644
index 00000000..94b8701c
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz
new file mode 100644
index 00000000..636120ba
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json
new file mode 100644
index 00000000..351aa01c
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json
@@ -0,0 +1,44 @@
+{
+    "n_steps": 1,
+    "cum_reward": 1.0,
+    "cum_raw_reward": 0,
+    "err_msg": null,
+    "stack_trace": null,
+    "stats.cum_steps": 2,
+    "stats.cum_n_token_goal": 10,
+    "stats.max_n_token_goal": 10,
+    "stats.cum_n_token_url": 23,
+    "stats.max_n_token_url": 23,
+    "stats.cum_n_token_focused_element_bid": 1,
+    "stats.max_n_token_focused_element_bid": 1,
+    "stats.cum_n_token_last_action": 0,
+    "stats.max_n_token_last_action": 0,
+    "stats.cum_n_token_last_action_error": 0,
+    "stats.max_n_token_last_action_error": 0,
+    "stats.cum_n_token_dom_txt": 1257,
+    "stats.max_n_token_dom_txt": 1257,
+    "stats.cum_n_token_axtree_txt": 75,
+    "stats.max_n_token_axtree_txt": 75,
+    "stats.cum_n_token_pruned_html": 658,
+    "stats.max_n_token_pruned_html": 658,
+    "stats.cum_n_retry_llm": 1,
+    "stats.max_n_retry_llm": 1,
+    "stats.cum_n_retry": 0.0,
+    "stats.max_n_retry": 0.0,
+    "stats.cum_busted_retry": 0,
+    "stats.max_busted_retry": 0,
+    "stats.cum_input_tokens": 1594,
+    "stats.max_input_tokens": 1594,
+    "stats.cum_output_tokens": 64,
+    "stats.max_output_tokens": 64,
+    "stats.cum_cost": 0.00027749999999999997,
+    "stats.max_cost": 0.00027749999999999997,
+    "stats.cum_n_token_agent_messages": 1653,
+    "stats.max_n_token_agent_messages": 1653,
+    "stats.cum_step_elapsed": 5.879024505615234,
+    "stats.max_step_elapsed": 5.879024505615234,
+    "stats.cum_agent_elapsed": 3.029170036315918,
+    "stats.max_agent_elapsed": 3.029170036315918,
+    "terminated": true,
+    "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl
new file mode 100644
index 00000000..3399e40c
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz
new file mode 100644
index 00000000..ef19e47f
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz
new file mode 100644
index 00000000..2aac84fd
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz
new file mode 100644
index 00000000..a426bd07
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv
new file mode 100644
index 00000000..85b34311
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv
@@ -0,0 +1,2 @@
+avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost
+1.0,0.0,1.0,1/1,0,0.0003
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json
new file mode 100644
index 00000000..a17872af
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json
@@ -0,0 +1,44 @@
+{
+    "n_steps": 1,
+    "cum_reward": 1.0,
+    "cum_raw_reward": 0,
+    "err_msg": null,
+    "stack_trace": null,
+    "stats.cum_steps": 2,
+    "stats.cum_n_token_goal": 10,
+    "stats.max_n_token_goal": 10,
+    "stats.cum_n_token_url": 23,
+    "stats.max_n_token_url": 23,
+    "stats.cum_n_token_focused_element_bid": 1,
+    "stats.max_n_token_focused_element_bid": 1,
+    "stats.cum_n_token_last_action": 0,
+    "stats.max_n_token_last_action": 0,
+    "stats.cum_n_token_last_action_error": 0,
+    "stats.max_n_token_last_action_error": 0,
+    "stats.cum_n_token_dom_txt": 1250,
+    "stats.max_n_token_dom_txt": 1250,
+    "stats.cum_n_token_axtree_txt": 71,
+    "stats.max_n_token_axtree_txt": 71,
+    "stats.cum_n_token_pruned_html": 651,
+    "stats.max_n_token_pruned_html": 651,
+    "stats.cum_n_retry_llm": 1,
+    "stats.max_n_retry_llm": 1,
+    "stats.cum_n_retry": 0.0,
+    "stats.max_n_retry": 0.0,
+    "stats.cum_busted_retry": 0,
+    "stats.max_busted_retry": 0,
+    "stats.cum_input_tokens": 1589,
+    "stats.max_input_tokens": 1589,
+    "stats.cum_output_tokens": 63,
+    "stats.max_output_tokens": 63,
+    "stats.cum_cost": 0.00027614999999999996,
+    "stats.max_cost": 0.00027614999999999996,
+    "stats.cum_n_token_agent_messages": 1641,
+    "stats.max_n_token_agent_messages": 1641,
+    "stats.cum_step_elapsed": 5.891982078552246,
+    "stats.max_step_elapsed": 5.891982078552246,
+    "stats.cum_agent_elapsed": 3.4504799842834473,
+    "stats.max_agent_elapsed": 3.4504799842834473,
+    "terminated": true,
+    "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/error_report_trial_1_of_3.md b/tests/data/error_analysis/error_report_trial_1_of_3.md
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/data/error_analysis/result_df_trial_1_of_3.csv b/tests/data/error_analysis/result_df_trial_1_of_3.csv
new file mode 100644
index 00000000..4095252c
--- /dev/null
+++ b/tests/data/error_analysis/result_df_trial_1_of_3.csv
@@ -0,0 +1,5 @@
+env.task_name,agent.agent_name,env.benchmark,index,exp_dir,agent.chat_model.model_name,agent.chat_model.max_total_tokens,agent.chat_model.max_input_tokens,agent.chat_model.max_new_tokens,agent.chat_model.temperature,agent.chat_model.vision_support,agent.chat_model.deployment_name,agent.flags.obs.use_html,agent.flags.obs.use_ax_tree,agent.flags.obs.use_tabs,agent.flags.obs.use_focused_element,agent.flags.obs.use_error_logs,agent.flags.obs.use_history,agent.flags.obs.use_past_error_logs,agent.flags.obs.use_action_history,agent.flags.obs.use_think_history,agent.flags.obs.use_diff,agent.flags.obs.html_type,agent.flags.obs.use_screenshot,agent.flags.obs.use_som,agent.flags.obs.extract_visible_tag,agent.flags.obs.extract_clickable_tag,agent.flags.obs.extract_coords,agent.flags.obs.filter_visible_elements_only,agent.flags.obs.openai_vision_detail,agent.flags.obs.filter_with_bid_only,agent.flags.obs.filter_som_only,agent.flags.action.action_set.subsets,agent.flags.action.action_set.multiaction,agent.flags.action.action_set.strict,agent.flags.action.action_set.retry_with_force,agent.flags.action.action_set.demo_mode,agent.flags.action.long_description,agent.flags.action.individual_examples,agent.flags.action.multi_actions,agent.flags.action.is_strict,agent.flags.use_plan,agent.flags.use_criticise,agent.flags.use_thinking,agent.flags.use_memory,agent.flags.use_concrete_example,agent.flags.use_abstract_example,agent.flags.use_hints,agent.flags.enable_chat,agent.flags.max_prompt_tokens,agent.flags.be_cautious,agent.flags.extra_instructions,agent.flags.add_missparsed_messages,agent.flags.max_trunc_itr,agent.flags.flag_group,agent.max_retry,env.task_seed,env.max_steps,env.headless,env.record_video,env.wait_for_user_message,env.viewport,env.slow_mo,env.storage_state,env.task_kwargs,exp_name,enable_debug,err_msg,stack_trace,order,logging_level,logging_level_stdout,exp_id,depends_on,save_screenshot,save_som,n_steps,cum_reward,cum_raw_reward,stats.cum_steps,stats.cum_n_token_goal,st
ats.max_n_token_goal,stats.cum_n_token_url,stats.max_n_token_url,stats.cum_n_token_focused_element_bid,stats.max_n_token_focused_element_bid,stats.cum_n_token_last_action,stats.max_n_token_last_action,stats.cum_n_token_last_action_error,stats.max_n_token_last_action_error,stats.cum_n_token_dom_txt,stats.max_n_token_dom_txt,stats.cum_n_token_axtree_txt,stats.max_n_token_axtree_txt,stats.cum_n_token_pruned_html,stats.max_n_token_pruned_html,stats.cum_n_retry_llm,stats.max_n_retry_llm,stats.cum_n_retry,stats.max_n_retry,stats.cum_busted_retry,stats.max_busted_retry,stats.cum_input_tokens,stats.max_input_tokens,stats.cum_output_tokens,stats.max_output_tokens,stats.cum_cost,stats.max_cost,stats.cum_n_token_agent_messages,stats.max_n_token_agent_messages,stats.cum_step_elapsed,stats.max_step_elapsed,stats.cum_agent_elapsed,stats.max_agent_elapsed,terminated,truncated,err_key
+miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,1,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,7,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,True,,,2,10,30,dd9e91e0-75ef-4bb4-9db1-f91f06848dcb,(),True,False,2,1.0,0,3,12,6,48,24,2,1,4,4,0,0,1902,952,400,201,650,326,2,1,0.0,0.0,0,0,2789,1404,128,65,0.00049515,0.00024839999999999997,2902,1459,6.860883951187134,5.8696064949035645,3.769465684890747,2.946484327316284,True,False,
+miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,2,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,20,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,True,,,3,10,30,187f0f01-a240-419c-a65e-0058a14f639d,(),True,False,3,1.0,0,4,27,9,72,24,3,1,8,4,0,0,2892,966,667,223,1014,340,3,1,0.0,0.0,0,0,4339,1464,225,84,0.00078585,0.0002646,4512,1517,3.0203144550323486,1.3659462928771973,3.8209800720214844,1.8219048976898193,True,False,
+miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,0,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,28,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,True,,,0,10,30,b403cfca-4647-48fb-98f2-57e94306a38a,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1250,1250,71,71,651,651,1,1,0.0,0.0,0,0,1589,1589,63,63,0.00027614999999999996,0.00027614999999999996,1641,1641,5.891982078552246,5.891982078552246,3.4504799842834473,3.4504799842834473,True,False,
+miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,3,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,14,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,True,,,1,10,30,4c89cb70-0bf8-42c2-be39-a9c1a39ffe8d,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1257,1257,75,75,658,658,1,1,0.0,0.0,0,0,1594,1594,64,64,0.00027749999999999997,0.00027749999999999997,1653,1653,5.879024505615234,5.879024505615234,3.029170036315918,3.029170036315918,True,False,
diff --git a/tests/data/error_analysis/study.pkl.gz b/tests/data/error_analysis/study.pkl.gz
new file mode 100644
index 00000000..8611c7d3
Binary files /dev/null and b/tests/data/error_analysis/study.pkl.gz differ
diff --git a/tests/data/error_analysis/summary_df_trial_1_of_3.csv b/tests/data/error_analysis/summary_df_trial_1_of_3.csv
new file mode 100644
index 00000000..545cfc29
--- /dev/null
+++ b/tests/data/error_analysis/summary_df_trial_1_of_3.csv
@@ -0,0 +1,2 @@
+agent.agent_name,env.benchmark,avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost
+GenericAgent-gpt-4o-mini,miniwob,1.0,0.0,1.75,4/4,0,0.0018