diff --git a/pyproject.toml b/pyproject.toml
index 2a1e06c3..782b1f26 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,3 +57,4 @@ exclude = '''
[project.scripts]
agentlab-assistant = "agentlab.ui_assistant:main"
agentlab-xray = "agentlab.analyze.agent_xray:main"
+agentlab-analyze = "agentlab.analyze.error_analysis.pipeline:main"
diff --git a/requirements.txt b/requirements.txt
index c598b342..a59d4a4a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-black[jupyter]>=24.2.0
+black[jupyter]>=24.2.0,<25
blacken-docs
pre-commit
pytest==7.3.2
diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py
index e21ada58..f34d630c 100644
--- a/src/agentlab/agents/generic_agent/agent_configs.py
+++ b/src/agentlab/agents/generic_agent/agent_configs.py
@@ -257,7 +257,7 @@
)
AGENT_4o_MINI = GenericAgentArgs(
- chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"],
+ chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"],
flags=FLAGS_GPT_4o,
)
AGENT_CLAUDE_SONNET_35 = GenericAgentArgs(
diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py
index 9764898c..7466db87 100644
--- a/src/agentlab/analyze/agent_xray.py
+++ b/src/agentlab/analyze/agent_xray.py
@@ -1,4 +1,5 @@
import base64
+import json
import os
import traceback
from copy import deepcopy
@@ -30,6 +31,32 @@
TASK_SEED_KEY = "env.task_seed"
+def dict_to_markdown(data, level=1):
+    """
+    Convert a nested dictionary to a Markdown string with hierarchical headers.
+
+    Every key becomes a header whose depth follows its nesting depth. Nested
+    dictionaries are rendered recursively; any other value is written on the
+    line below its header.
+
+    Parameters:
+        data (dict): The dictionary to convert. A non-dict value is rendered as
+            a single text line instead of raising.
+        level (int): The current header level (default is 1). Markdown defines
+            only six header levels, so deeper nesting is clamped to level 6.
+
+    Returns:
+        str: The formatted Markdown string.
+    """
+    if not isinstance(data, dict):
+        # e.g. a JSON file whose top-level value is a plain string or a list
+        return f" {data}\n"
+
+    # seven or more '#' is not a Markdown header and would render as literal text
+    header = "#" * min(level, 6)
+    parts = []
+
+    for key, value in data.items():
+        # the header line is the same for nested dicts and for leaf values
+        parts.append(f"{header} {key}\n")
+        if isinstance(value, dict):
+            parts.append(dict_to_markdown(value, level + 1))
+        else:
+            parts.append(f" {value}\n")
+
+    # join once instead of growing a string inside the loop
+    return "".join(parts)
+
+
def display_table(df: pd.DataFrame):
df = df.copy()
df.columns = clean_column_names(df.columns)
@@ -358,6 +385,9 @@ def run_gradio(results_dir: Path):
with gr.Tab("Task Error") as tab_error:
task_error = gr.Markdown()
+ with gr.Tab("Error Analysis") as tab_error_analysis:
+ error_analysis = gr.Markdown()
+
with gr.Tab("Logs") as tab_logs:
logs = gr.Code(language=None, **code_args)
@@ -485,6 +515,7 @@ def run_gradio(results_dir: Path):
tab_axtree.select(fn=update_axtree, outputs=axtree_code)
tab_chat.select(fn=update_chat_messages, outputs=chat_messages)
tab_error.select(fn=update_task_error, outputs=task_error)
+ tab_error_analysis.select(fn=update_error_analysis, outputs=error_analysis)
tab_logs.select(fn=update_logs, outputs=logs)
tab_stats.select(fn=update_stats, outputs=stats)
tab_agent_info_html.select(fn=update_agent_info_html, outputs=agent_info_html)
@@ -612,6 +643,20 @@ def update_task_error():
return "No Task Error"
+def update_error_analysis():
+    """Render the selected experiment's ``error_analysis.json`` as Markdown.
+
+    Returns:
+        str: Markdown built from the saved analysis, or a short message when the
+            file is missing or does not contain valid JSON.
+    """
+    global info
+    try:
+        error_analysis = info.exp_result.exp_dir / "error_analysis.json"
+        if not error_analysis.exists():
+            return "No Error Analysis Found"
+        with error_analysis.open("r") as f:
+            json_data = json.load(f)
+    except FileNotFoundError:
+        return "No Error Analysis"
+    except json.JSONDecodeError as e:
+        # a truncated or hand-edited file must not crash the tab callback
+        return f"Invalid Error Analysis file: {e}"
+
+    if not isinstance(json_data, dict):
+        # dict_to_markdown iterates over .items(); show other payloads verbatim
+        return str(json_data)
+    return dict_to_markdown(json_data)
+
+
def update_logs():
global info
try:
@@ -1200,3 +1245,3 @@ def main():
if __name__ == "__main__":
main()
diff --git a/src/agentlab/analyze/error_analysis/__init__.py b/src/agentlab/analyze/error_analysis/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py
new file mode 100644
index 00000000..32e5e9df
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/pipeline.py
@@ -0,0 +1,110 @@
+import json
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Generator
+
+from bgym import ExpResult
+
+from agentlab.analyze.error_analysis.summarizer import (
+ ChangeSummarizer,
+ EpisodeErrorSummarizer,
+ EpisodeSummarizer,
+)
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+
+@dataclass
+class Analyzer:
+    """Placeholder analyzer that carries a prompt and returns a fixed result."""
+
+    prompt: str
+    # No annotation here, so this is a plain class attribute, not a dataclass field.
+    llm = None
+
+    def __call__(self, *args, **kwargs):
+        """Ignore every argument and return the constant string ``"analysis"``."""
+        result = "analysis"
+        return result
+
+
+def analyze(exp_result, episode_summarizer, save_analysis_func):
+    """Summarize one experiment result and hand the summary to the save callback.
+
+    Args:
+        exp_result: The experiment result to analyze.
+        episode_summarizer: Callable invoked as ``episode_summarizer(exp_result)``.
+        save_analysis_func: Callable invoked as ``save_analysis_func(exp_result, analysis)``.
+    """
+    save_analysis_func(exp_result, episode_summarizer(exp_result))
+
+
+@dataclass
+class ErrorAnalysisPipeline:
+    """Run an episode summarizer over experiment results and save each analysis.
+
+    Attributes:
+        exp_dir: Directory passed to ``yield_all_exp_results`` to find results.
+        filter: Optional substring; only results whose ``exp_dir`` path contains
+            it are analyzed. ``None`` keeps every result.
+        episode_summarizer: Callable applied to each ``ExpResult``; its return
+            value is what gets written to ``error_analysis.json``.
+    """
+
+    exp_dir: Path
+    filter: str = None
+    episode_summarizer: EpisodeSummarizer = None
+
+    def filter_exp_results(self) -> Generator[ExpResult, None, None]:
+        """Yield the experiment results under ``exp_dir`` that match ``filter``."""
+        # TODO:(thibault) improve filtering
+        for exp_result in yield_all_exp_results(self.exp_dir):
+            if self.filter is None or self.filter in str(exp_result.exp_dir):
+                yield exp_result
+
+    def run_analysis(self, parallel=False, jobs=-1):
+        """Analyze every filtered result and save each analysis next to it.
+
+        Args:
+            parallel: When True, dispatch the work through joblib's threading backend.
+            jobs: Passed to joblib as ``n_jobs``; only used when ``parallel`` is True.
+        """
+        filtered_results = self.filter_exp_results()
+
+        if parallel:
+            import joblib
+
+            joblib.Parallel(n_jobs=jobs, backend="threading")(
+                joblib.delayed(analyze)(exp_result, self.episode_summarizer, self.save_analysis)
+                for exp_result in filtered_results
+            )
+        else:
+            # same helper as the parallel branch so both paths cannot drift apart
+            for exp_result in filtered_results:
+                analyze(exp_result, self.episode_summarizer, self.save_analysis)
+
+    def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True):
+        """Save the analysis as ``error_analysis.json`` in the result's directory.
+
+        Args:
+            exp_result: Result whose ``exp_dir`` receives the file.
+            error_analysis: JSON-serializable analysis to write.
+            exists_ok: When False, refuse to overwrite an existing file.
+
+        Raises:
+            FileExistsError: If the file already exists and ``exists_ok`` is False.
+        """
+        analysis_path = exp_result.exp_dir / "error_analysis.json"
+        # Serialize first so a non-serializable analysis cannot leave a truncated file.
+        payload = json.dumps(error_analysis, indent=4)
+        # "x" creates the file exclusively, closing the gap between exists() and open().
+        mode = "w" if exists_ok else "x"
+        try:
+            with analysis_path.open(mode) as f:
+                f.write(payload)
+        except FileExistsError as e:
+            raise FileExistsError(f"{analysis_path} already exists") from e
+
+
+def AXTREE_FORMATTER(x):
+    """Return the ``axtree_txt`` entry of ``x``, or a placeholder when it is absent."""
+    return x.get("axtree_txt", "No AXTREE available")
+
+
+def HTML_FORMATTER(x):
+    """Return the ``pruned_html`` entry of ``x``, or a placeholder when it is absent."""
+    return x.get("pruned_html", "No HTML available")
+
+
+def main():
+    """Command-line entry point: run the error-analysis pipeline on an experiment directory.
+
+    Options:
+        -e/--exp_dir: Directory holding the experiment results (required).
+        -f/--filter: Only analyze results whose path contains this substring.
+        -p/--parallel: Run the analysis through joblib.
+        -j/--jobs: Number of joblib jobs (default -1).
+        -g/--guess_success: Use the prompt that also asks whether the episode succeeded.
+    """
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    # required=True makes argparse print usage and exit; an assert disappears under `python -O`
+    parser.add_argument(
+        "-e",
+        "--exp_dir",
+        type=Path,
+        required=True,
+        help="experiment directory, e.g. /path/to/exp_dir",
+    )
+    parser.add_argument("-f", "--filter", type=str, default=None)
+    parser.add_argument("-p", "--parallel", action="store_true")
+    parser.add_argument("-j", "--jobs", type=int, default=-1)
+    parser.add_argument("-g", "--guess_success", action="store_true")
+
+    args = parser.parse_args()
+
+    from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
+
+    llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model()
+
+    # read the options straight from `args` so no local shadows the builtin `filter`
+    pipeline = ErrorAnalysisPipeline(
+        exp_dir=args.exp_dir,
+        filter=args.filter,
+        episode_summarizer=EpisodeErrorSummarizer(
+            ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=args.guess_success
+        ),
+    )
+
+    pipeline.run_analysis(parallel=args.parallel, jobs=args.jobs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py
new file mode 100644
index 00000000..2919f052
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/summarizer.py
@@ -0,0 +1,178 @@
+from dataclasses import dataclass
+
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.summarizer_prompts import (
+ CHANGE_SUMMARIZER_PROMPT,
+ ERROR_CLASSIFICATION_PROMPT,
+ ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT,
+)
+from agentlab.llm.llm_utils import json_parser, parse_html_tags
+from agentlab.llm.tracking import set_tracker
+
+
+def _diff(past_obs, current_obs):
+    """TODO: Implement the diff function.
+
+    Intended to return a diff of current_obs compared to past_obs, unless there
+    are too many changes. For now it always raises.
+
+    Args:
+        past_obs: The past observation.
+        current_obs: The current observation.
+
+    Raises:
+        ValueError: Not implemented yet.
+    """
+    message = "Not implemented yet."
+    raise ValueError(message)
+
+
+@dataclass
+class ChangeSummarizer:
+    """Ask an LLM to describe and assess the effect of a single agent action.
+
+    Attributes:
+        llm: Callable that receives the prompt and returns a mapping with a "content" entry.
+        obs_formatter: Callable turning an observation dict into the text shown to the LLM.
+        use_diff: When True, the next observation is replaced by ``_diff(obs, next_obs)``.
+    """
+
+    llm: callable  # language model
+    obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available")
+    use_diff: bool = False
+
+    def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str:
+        """Produce a summary of the effect of the action taken at ``obs``.
+
+        Formats both observations, builds the prompt, queries the LLM and returns
+        whatever ``parse`` extracts from the response content.
+        """
+        before_txt = self.obs_formatter(obs.obs)
+        after_txt = self.obs_formatter(next_obs.obs)
+
+        action = obs.action
+
+        goal = obs.obs["goal"]  # Use goal object from agentlab
+        # TODO(thibault): switch to 'goal_object'
+        # Outsource everything to formatter
+
+        if self.use_diff:
+            after_txt = _diff(before_txt, after_txt)
+
+        plan = obs.obs.get("plan", "No plan available")
+        prompt = self.make_prompt(before_txt, action, after_txt, past_summaries, goal, plan)
+        response = self.llm(prompt)
+        return self.parse(response["content"])
+
+    def make_prompt(
+        self, past_obs_message, action, current_obs_message, past_summaries, goal, plan
+    ):
+        """Fill ``CHANGE_SUMMARIZER_PROMPT`` with the step's details."""
+        fields = {
+            "goal": goal,
+            "plan": plan,
+            "past_observation": past_obs_message,
+            "current_observation": current_obs_message,
+            "past_summaries": past_summaries,
+            "action": action,
+        }
+        return CHANGE_SUMMARIZER_PROMPT.format(**fields)
+
+    def parse(self, raw_output: str) -> dict:
+        """Extract the tagged answer fields from the raw LLM output."""
+        expected_keys = ["changeSummary", "actionAssessment", "explanation", "suggestion"]
+        return parse_html_tags(raw_output, keys=expected_keys)[0]
+
+
+@dataclass
+class EpisodeAnalysis:
+    """Container for the result of analyzing one episode."""
+
+    analysis: str  # complete analysis of the episode
+    summary: str  # short summary of the analysis
+    categories: dict[str, float]  # score for each category e.g. type of error or difficulty levels
+
+
+@dataclass
+class EpisodeSummarizer:
+    """Summarize every step of an episode, then ask an LLM to analyze the whole episode.
+
+    Attributes:
+        change_summarizer: Produces one summary per pair of consecutive steps.
+        llm: Callable that receives the prompt and returns a mapping with a "content" entry.
+        parser: Not used by the methods of this class.
+        guess_success: When False, episodes whose last reward is 1 are reported as
+            "Success" without calling the LLM; when True every episode is analyzed
+            and the "success" tag is also parsed from the answer.
+    """
+
+    change_summarizer: ChangeSummarizer = None
+    llm: callable = None
+    parser: callable = lambda x: json_parser(x)[0]
+    guess_success: bool = False
+
+    def make_prompt(self, exp_results: ExpResult, summaries: list[dict]): ...
+
+    def __call__(self, exp_results: ExpResult) -> dict:
+        """Run Change Summarizer for every step in the episode, then analyze the episode.
+
+        Returns:
+            dict: "analysis" (parsed LLM answer, or a fixed string when the LLM is
+                skipped), "summaries" (step index -> change summary) and, when the
+                LLM ran, the stats collected by both trackers.
+        """
+        steps = exp_results.steps_info
+        if not steps:
+            # nothing was recorded for this episode; indexing the last step would raise IndexError
+            return {"analysis": "No steps recorded", "summaries": {}}
+
+        if not self.guess_success and steps[-1].reward == 1:
+            return {"analysis": "Success", "summaries": {}}
+
+        with set_tracker("summary") as summaries_tracker:
+            summaries = self.make_change_summaries(exp_results)
+        prompt = self.make_prompt(exp_results, summaries)
+
+        with set_tracker("analysis") as analysis_tracker:
+            raw_analysis = self.llm(prompt)["content"]
+        analysis = self.parse(raw_analysis)
+
+        res = {
+            "analysis": analysis,
+            "summaries": dict(enumerate(summaries)),
+        }
+        # NOTE(review): both stats dicts are merged into `res`; if they share key names
+        # the summary stats overwrite the analysis stats. Confirm this is intended.
+        res.update(analysis_tracker.stats)
+        res.update(summaries_tracker.stats)
+        return res
+
+    def make_change_summaries(self, exp_result: ExpResult) -> list[dict]:
+        """Summarize each transition between consecutive steps, in order."""
+        summaries = []  # type: list[dict]
+        # this assumes that there is always an extra step at the end of the episode
+        # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info
+        # TODO:(thibault) make some checks or w/e
+        for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]):
+            # the summaries produced so far are passed along as context for the next one
+            summaries.append(self.change_summarizer.summarize(step, next_step, summaries))
+        return summaries
+
+    def parse(self, raw_output: str) -> dict:
+        """Extract the tagged answer fields from the raw LLM output."""
+        keys = ["explanation", "errorCategory"]
+        if self.guess_success:
+            # the success-or-not prompt also returns a verdict; without this key it is dropped
+            keys.insert(1, "success")
+        return parse_html_tags(raw_output, keys=keys)[0]
+
+
+@dataclass
+class EpisodeErrorSummarizer(EpisodeSummarizer):
+    """Episode summarizer whose prompt asks the LLM to classify the episode's error."""
+
+    change_summarizer: ChangeSummarizer = None
+
+    def make_prompt(self, exp_results: ExpResult, summaries: list[dict]):
+        """Build the error-classification prompt for one episode.
+
+        Args:
+            exp_results: Episode to describe; the goal comes from the first step and
+                the extra information from the last step's ``task_info``.
+            summaries: Per-step change summaries, each a mapping of field name to text.
+
+        Returns:
+            str: The filled prompt. The success-or-not variant is used when
+                ``guess_success`` is set.
+        """
+        goal = exp_results.steps_info[0].obs["goal"]
+
+        def format_summary(summary):
+            return "".join(f"{key}: {value}\n" for key, value in summary.items())
+
+        txt_summaries = "\n".join(format_summary(summary) for summary in summaries)
+
+        actions = [step.action for step in exp_results.steps_info[:-1]]
+        # Keep the errors as a list: joining them into one string first made the zip
+        # below pair each action with a single character of that string.
+        action_errors = [step.obs["last_action_error"] for step in exp_results.steps_info[1:]]
+
+        txt_actions = "\n".join(
+            f"Action: {action}\nAction Error: {action_error}"
+            for action, action_error in zip(actions, action_errors)
+        )
+
+        extra_info = exp_results.steps_info[-1].task_info
+
+        prompt = (
+            ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT
+            if self.guess_success
+            else ERROR_CLASSIFICATION_PROMPT
+        )
+
+        return prompt.format(
+            goal=goal,
+            historical_summaries=txt_summaries,
+            action_history=txt_actions,
+            extra_info=extra_info,
+        )
diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
new file mode 100644
index 00000000..a0df9fc9
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py
@@ -0,0 +1,268 @@
+CHANGE_SUMMARIZER_PROMPT = """
+You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website,
+you will receive the following pieces of information:
+
+1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'").
+2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries.
+3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet).
+4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'").
+5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available.
+
+YOUR TASK (each step):
+A) SUMMARIZE THE CHANGE
+ - Describe what visibly changed between the previous observation (or diff) and the current observation.
+ For example, did a new panel open, did the form reset, did nothing happen, etc.?
+
+B) ASSESS THE ACTION
+ - Decide whether the agent's action seems helpful or correct given the user's main goal,
+ or if it appears incorrect/unhelpful.
+ - Briefly explain why.
+
+OUTPUT FORMAT (per step):
+Return your analysis as a JSON-like structure, for example:
+
+<changeSummary>A new search results panel appeared on the right side.</changeSummary>
+<actionAssessment>Correct</actionAssessment>
+<explanation>Clicking 'Search' was appropriate to display the results.</explanation>
+
+Or for an incorrect action:
+
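+<changeSummary>The page reloaded but the date fields were reset to defaults.</changeSummary>
+<actionAssessment>Incorrect</actionAssessment>
+<explanation>The agent should have fixed the date format first instead of re-clicking 'Show report'.</explanation>
+<suggestion>Correct the date format or check for error messages.</suggestion>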
+
+
+Please use single quotes '' to quote elements from the page, so as not to create parsing issues.
+
+Please follow this structure at every step. Keep your responses concise and clear. Below are the details.
+
+Goal: {goal}
+
+LLM Plan: {plan}
+
+Current Observation: {past_observation}
+
+Next Observation: {current_observation}
+
+Past summaries: {past_summaries}
+
+Action: {action}
+"""
+
+ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy.
+Below are the high-level definitions of each category,
+followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.),
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. Navigation & Planning Errors
+ The agent cannot construct or execute a correct sequence of actions to reach its goal
+ (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+2. Interaction Execution Errors
+ The agent enters data in the wrong format, forgets to click "Submit" after typing,
+ repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+3. Information Processing Errors
+ The agent misreads or misinterprets visible data (e.g., extracting the wrong field values),
+ misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+4. Observation & Action Errors
+ The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+ or misaligns its actions (clicks the wrong element or stale link).
+
+5. Task Understanding Errors
+ The agent misreads or misunderstands the user's objective (goal interpretation),
+ loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+6. Reasoning Failures
+ The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps,
+ or fails to prioritize important subtasks when handling complex goals.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+ - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+
+2. Historical change summaries
+ - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+3. Action History
+ - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.)
+ along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Interaction Execution)
+ • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format.
+ Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+ • Classification: ["Interaction Execution"]
+ • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action
+ without adaptation ("Action Repetition").
+
+2) EXAMPLE B (Task Understanding)
+ • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted'
+ that are older than 30 days and add a comment saying 'I can help fix this.'"
+ The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue
+ with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+ • Classification: ["Task Understanding"]
+ • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues,
+ it focused on creating a new issue. This is a misinterpretation of the instructions,
+ not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+ - The planning and thought history
+ - The action history
+ - The current HTML or AX Tree observation
+ - The user goal
+
+2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies.
+ If the task is successful, you can keep the error category as blank.
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+Output format example for an unsuccessful interaction:
+
+<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
+<success>False</success>
+<errorCategory>["Navigation & Planning"]</errorCategory>
+
+Output format example for a successful interaction:
+
+<explanation>The agent opened the correct GitLab page and ...</explanation>
+<success>True</success>
+<errorCategory>[]</errorCategory>
+
+Please follow this structure at every step. Keep your responses concise and clear.
+
+Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant.
+
+Overall goal: {goal}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+
+Extra information: {extra_info}
+"""
+
+
+ERROR_CLASSIFICATION_PROMPT = """
+You are an expert evaluator that classifies web agent failures according to a predefined taxonomy.
+Below are the high-level definitions of each category,
+followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.),
+a set of labeled examples for reference (few-shot), and finally the classification task you must complete.
+
+--------------------------------------------------------------------------------
+TAXONOMY DEFINITIONS
+--------------------------------------------------------------------------------
+
+1. Navigation & Planning Errors
+ The agent cannot construct or execute a correct sequence of actions to reach its goal
+ (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms).
+
+2. Interaction Execution Errors
+ The agent enters data in the wrong format, forgets to click "Submit" after typing,
+ repeats the same failing action without adaptation, or loses track of the changing webpage state.
+
+3. Information Processing Errors
+ The agent misreads or misinterprets visible data (e.g., extracting the wrong field values),
+ misconstrues relationships between pieces of information, or fails to validate data against task requirements.
+
+4. Observation & Action Errors
+ The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded)
+ or misaligns its actions (clicks the wrong element or stale link).
+
+5. Task Understanding Errors
+ The agent misreads or misunderstands the user's objective (goal interpretation),
+ loses crucial context (context loss), or performs actions beyond or short of the intended scope.
+
+6. Reasoning Failures
+ The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps,
+ or fails to prioritize important subtasks when handling complex goals.
+
+--------------------------------------------------------------------------------
+INPUT DESCRIPTION
+--------------------------------------------------------------------------------
+
+You will receive the following for each scenario:
+1. User Goal
+ - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'").
+
+2. Historical change summaries
+ - A list of summaries of changes in the observation that the agent has seen during the course of actions.
+
+3. Action History
+ - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.)
+ along with immediate outcomes or errors.
+
+Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories.
+
+--------------------------------------------------------------------------------
+FEW-SHOT CLASSIFICATION EXAMPLES
+--------------------------------------------------------------------------------
+
+1) EXAMPLE A (Interaction Execution)
+ • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format.
+ Each time, the site resets to default dates. The agent never notices and keeps doing the same thing.
+ • Classification: ["Interaction Execution"]
+ • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action
+ without adaptation ("Action Repetition").
+
+2) EXAMPLE B (Task Understanding)
+ • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted'
+ that are older than 30 days and add a comment saying 'I can help fix this.'"
+ The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue
+ with label 'help wanted,' ignoring the user's actual request to find and comment on old issues.
+ • Classification: ["Task Understanding"]
+ • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues,
+ it focused on creating a new issue. This is a misinterpretation of the instructions,
+ not a mechanical error in clicking or input format.
+
+--------------------------------------------------------------------------------
+CLASSIFICATION TASK
+--------------------------------------------------------------------------------
+
+1. Read through:
+ - The planning and thought history
+ - The action history
+ - The current HTML or AX Tree observation
+ - The user goal
+
+2. Decide the error category, or a combination thereof, under which the reason for failure lies.
+
+3. Provide a brief explanation justifying your classification, referencing specific steps if helpful.
+
+Output format example for an interaction:
+
+<explanation>The agent opened the wrong GitLab page and never recovered...</explanation>
+<errorCategory>["Navigation & Planning"]</errorCategory>
+
+Please follow this structure at every step. Keep your responses concise and clear.
+
+Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant.
+
+Overall goal: {goal}
+
+Historical change summaries: {historical_summaries}
+
+Action history: {action_history}
+
+Extra information: {extra_info}
+"""
diff --git a/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py
new file mode 100644
index 00000000..af5613bb
--- /dev/null
+++ b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py
@@ -0,0 +1,31 @@
+from pathlib import Path
+from agentlab.analyze.inspect_results import (
+ load_result_df,
+)
+import json
+
+
+def get_aggregate_statistics(exp_dir: Path):
+    """Load the experiment results found under ``exp_dir``.
+
+    TODO: compute the aggregate statistics; for now the loaded results are returned as-is.
+
+    Args:
+        exp_dir: Directory passed to ``load_result_df``.
+
+    Returns:
+        The value returned by ``load_result_df(exp_dir)``.
+    """
+    # No `filter` variable exists in this module, so `filter=filter` forwarded the
+    # builtin `filter` function; the loaded results were also thrown away.
+    results = load_result_df(exp_dir)
+    return results
+
+
+if __name__ == "__main__":
+    # Ad-hoc check: attach the saved success predictions to the result table of one study.
+    path = Path(
+        "/mnt/colab_public/data/ui_copilot/thibault/tmlr_exps/2024-10-23_14-17-47_5_agents_on_workarena_l1"
+    )
+    results = load_result_df(path).reset_index()
+    results = results.loc[results["agent.chat_model.model_name"].str.contains("anthropic")]
+    success_predictions = []
+    for result_dir in results["exp_dir"]:
+        analysis_path = Path(result_dir) / "error_analysis.json"
+        if not analysis_path.exists():
+            # keep one entry per row so the list lines up with the dataframe
+            success_predictions.append(None)
+            continue
+        with open(analysis_path, "r") as f:
+            error_analysis = json.load(f)
+        # the prediction is stored as the string "True" / "False"
+        success_predictions.append(error_analysis["analysis"]["success"] == "True")
+    results["success_predictions"] = success_predictions
diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py
index 096aae00..567e0798 100644
--- a/src/agentlab/llm/chat_api.py
+++ b/src/agentlab/llm/chat_api.py
@@ -4,7 +4,7 @@
import time
from dataclasses import dataclass
from functools import partial
-from typing import Optional
+from typing import Optional, Union
import openai
from huggingface_hub import InferenceClient
@@ -13,7 +13,7 @@
import agentlab.llm.tracking as tracking
from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs
from agentlab.llm.huggingface_utils import HFBaseChatModel
-from agentlab.llm.llm_utils import AIMessage, Discussion
+from agentlab.llm.llm_utils import AIMessage, Discussion, HumanMessage
def make_system_message(content: str) -> dict:
@@ -268,7 +268,13 @@ def __init__(
**client_args,
)
- def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float = None) -> dict:
+ def __call__(
+ self, messages: Union[str, list[dict]], n_samples: int = 1, temperature: float = None
+ ) -> dict:
+
+ if isinstance(messages, str):
+ messages = [HumanMessage(messages)]
+
# Initialize retry tracking attributes
self.retries = 0
self.success = False
diff --git a/tests/analyze/error_analysis/test_pipeline.py b/tests/analyze/error_analysis/test_pipeline.py
new file mode 100644
index 00000000..a2f6295d
--- /dev/null
+++ b/tests/analyze/error_analysis/test_pipeline.py
@@ -0,0 +1,63 @@
+from pathlib import Path
+
+import pytest
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.pipeline import ErrorAnalysisPipeline
+
+exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis"
+
+
+class MockStepSummarizer:
+    """Stand-in step summarizer that reports the action and the step index."""
+
+    def summarize(
+        self, step: StepInfo, action: str, next_step: StepInfo, step_summaries: list[str]
+    ) -> str:
+        """Return a fixed-format sentence; the step index is the number of prior summaries."""
+        step_index = len(step_summaries)
+        return f"Agent took action {action} at step {step_index}"
+
+
+class MockEpisodeSummarizer:
+    """Stand-in episode summarizer that lists the non-empty actions of an episode."""
+
+    def __call__(self, exp_result: ExpResult) -> str:
+        """Return the episode's truthy actions joined by a comma and a space."""
+        actions = [step.action for step in exp_result.steps_info if step.action]
+        return f"Agent did actions {', '.join(actions)}"
+
+
+class MockAnalyzer:
+    """Stand-in analyzer that wraps the episode analysis in a fixed dict."""
+
+    def __call__(
+        self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str]
+    ) -> dict:
+        """Return a dict holding a constant "error" entry and the given episode analysis."""
+        # annotated as `str` before, although a dict is what is returned
+        return {"error": "analysis", "episode": episode_analysis}
+
+
+@pytest.fixture(scope="module")
+def pipeline() -> ErrorAnalysisPipeline:
+    """Module-wide pipeline over the test data, unfiltered, with the mock summarizer."""
+    summarizer = MockEpisodeSummarizer()
+    return ErrorAnalysisPipeline(exp_dir=exp_dir, filter=None, episode_summarizer=summarizer)
+
+
+def test_yield_no_filter(pipeline: ErrorAnalysisPipeline):
+    """Without a filter every experiment result in the test data is yielded."""
+    results = list(pipeline.filter_exp_results())
+    assert len(results) == 4
+
+
+def test_yield_with_filter(pipeline: ErrorAnalysisPipeline):
+    """Only results whose path contains the filter substring are yielded."""
+    pattern = "click-dialog"
+    pipeline.filter = pattern
+    try:
+        assert len(list(pipeline.filter_exp_results())) == 2
+    finally:
+        # the fixture is module-scoped: reset it even when the assertion fails,
+        # otherwise the tests that run afterwards see a filtered pipeline
+        pipeline.filter = None
+
+
+def test_save_analysis(pipeline: ErrorAnalysisPipeline):
+    """save_analysis writes error_analysis.json into the experiment directory."""
+    exp_result = next(pipeline.filter_exp_results())
+    analysis_path = exp_result.exp_dir / "error_analysis.json"
+    # a file left behind by an interrupted run would make exists_ok=False fail spuriously
+    analysis_path.unlink(missing_ok=True)
+
+    try:
+        error_analysis = pipeline.episode_summarizer(exp_result)
+        pipeline.save_analysis(exp_result, error_analysis, exists_ok=False)
+
+        assert analysis_path.exists()
+    finally:
+        # remove the file even when the test fails so the test data stays clean
+        analysis_path.unlink(missing_ok=True)
+
+
+if __name__ == "__main__":
+    # the test needs the `pipeline` fixture, so it has to be run through pytest;
+    # calling it without arguments raised a TypeError
+    pytest.main([__file__, "-k", "test_yield_with_filter"])
diff --git a/tests/analyze/error_analysis/test_summarizer.py b/tests/analyze/error_analysis/test_summarizer.py
new file mode 100644
index 00000000..83418496
--- /dev/null
+++ b/tests/analyze/error_analysis/test_summarizer.py
@@ -0,0 +1,30 @@
+from pathlib import Path
+
+import pytest
+from bgym import ExpResult, StepInfo
+
+from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer
+from agentlab.analyze.inspect_results import yield_all_exp_results
+
+
+@pytest.fixture(scope="module")
+def exp_results() -> list[ExpResult]:
+    """Load every experiment result stored in the error-analysis test data."""
+    tests_root = Path(__file__).parent.parent.parent
+    return list(yield_all_exp_results(tests_root / "data/error_analysis"))
+
+
+@pytest.mark.pricy
+def test_change_summarizer(exp_results: list[ExpResult]):
+    """Summarizing the first transition of the first episode yields a dict."""
+    # the stand-in LLM echoes the prompt back as the response content
+    summarizer = ChangeSummarizer(llm=lambda x: {"content": x})
+    first_step, second_step = exp_results[0].steps_info[0], exp_results[0].steps_info[1]
+    summary = summarizer.summarize(first_step, second_step, [])
+    assert isinstance(summary, dict)
+
+
+if __name__ == "__main__":
+    # run the test directly on the stored test data, without going through pytest
+    data_dir = Path(__file__).parent.parent.parent / "data/error_analysis"
+    exp_res = list(yield_all_exp_results(data_dir))
+    test_change_summarizer(exp_res)
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl
new file mode 100644
index 00000000..b2856641
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz
new file mode 100644
index 00000000..482f9b3d
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz
new file mode 100644
index 00000000..00267af0
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz
new file mode 100644
index 00000000..52c5209d
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz
new file mode 100644
index 00000000..b00c7bcf
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz
new file mode 100644
index 00000000..d7f75bd6
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json
new file mode 100644
index 00000000..34e6f226
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json
@@ -0,0 +1,44 @@
+{
+ "n_steps": 3,
+ "cum_reward": 1.0,
+ "cum_raw_reward": 0,
+ "err_msg": null,
+ "stack_trace": null,
+ "stats.cum_steps": 4,
+ "stats.cum_n_token_goal": 27,
+ "stats.max_n_token_goal": 9,
+ "stats.cum_n_token_url": 72,
+ "stats.max_n_token_url": 24,
+ "stats.cum_n_token_focused_element_bid": 3,
+ "stats.max_n_token_focused_element_bid": 1,
+ "stats.cum_n_token_last_action": 8,
+ "stats.max_n_token_last_action": 4,
+ "stats.cum_n_token_last_action_error": 0,
+ "stats.max_n_token_last_action_error": 0,
+ "stats.cum_n_token_dom_txt": 2892,
+ "stats.max_n_token_dom_txt": 966,
+ "stats.cum_n_token_axtree_txt": 667,
+ "stats.max_n_token_axtree_txt": 223,
+ "stats.cum_n_token_pruned_html": 1014,
+ "stats.max_n_token_pruned_html": 340,
+ "stats.cum_n_retry_llm": 3,
+ "stats.max_n_retry_llm": 1,
+ "stats.cum_n_retry": 0.0,
+ "stats.max_n_retry": 0.0,
+ "stats.cum_busted_retry": 0,
+ "stats.max_busted_retry": 0,
+ "stats.cum_input_tokens": 4339,
+ "stats.max_input_tokens": 1464,
+ "stats.cum_output_tokens": 225,
+ "stats.max_output_tokens": 84,
+ "stats.cum_cost": 0.00078585,
+ "stats.max_cost": 0.0002646,
+ "stats.cum_n_token_agent_messages": 4512,
+ "stats.max_n_token_agent_messages": 1517,
+ "stats.cum_step_elapsed": 3.0203144550323486,
+ "stats.max_step_elapsed": 1.3659462928771973,
+ "stats.cum_agent_elapsed": 3.8209800720214844,
+ "stats.max_agent_elapsed": 1.8219048976898193,
+ "terminated": true,
+ "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl
new file mode 100644
index 00000000..6bdd8639
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz
new file mode 100644
index 00000000..45522b9e
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz
new file mode 100644
index 00000000..1d3d08b1
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz
new file mode 100644
index 00000000..18c107bd
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz
new file mode 100644
index 00000000..d55bd69a
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json
new file mode 100644
index 00000000..6f351629
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json
@@ -0,0 +1,44 @@
+{
+ "n_steps": 2,
+ "cum_reward": 1.0,
+ "cum_raw_reward": 0,
+ "err_msg": null,
+ "stack_trace": null,
+ "stats.cum_steps": 3,
+ "stats.cum_n_token_goal": 12,
+ "stats.max_n_token_goal": 6,
+ "stats.cum_n_token_url": 48,
+ "stats.max_n_token_url": 24,
+ "stats.cum_n_token_focused_element_bid": 2,
+ "stats.max_n_token_focused_element_bid": 1,
+ "stats.cum_n_token_last_action": 4,
+ "stats.max_n_token_last_action": 4,
+ "stats.cum_n_token_last_action_error": 0,
+ "stats.max_n_token_last_action_error": 0,
+ "stats.cum_n_token_dom_txt": 1902,
+ "stats.max_n_token_dom_txt": 952,
+ "stats.cum_n_token_axtree_txt": 400,
+ "stats.max_n_token_axtree_txt": 201,
+ "stats.cum_n_token_pruned_html": 650,
+ "stats.max_n_token_pruned_html": 326,
+ "stats.cum_n_retry_llm": 2,
+ "stats.max_n_retry_llm": 1,
+ "stats.cum_n_retry": 0.0,
+ "stats.max_n_retry": 0.0,
+ "stats.cum_busted_retry": 0,
+ "stats.max_busted_retry": 0,
+ "stats.cum_input_tokens": 2789,
+ "stats.max_input_tokens": 1404,
+ "stats.cum_output_tokens": 128,
+ "stats.max_output_tokens": 65,
+ "stats.cum_cost": 0.00049515,
+ "stats.max_cost": 0.00024839999999999997,
+ "stats.cum_n_token_agent_messages": 2902,
+ "stats.max_n_token_agent_messages": 1459,
+ "stats.cum_step_elapsed": 6.860883951187134,
+ "stats.max_step_elapsed": 5.8696064949035645,
+ "stats.cum_agent_elapsed": 3.769465684890747,
+ "stats.max_agent_elapsed": 2.946484327316284,
+ "terminated": true,
+ "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl
new file mode 100644
index 00000000..71da24d7
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz
new file mode 100644
index 00000000..6f8de674
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz
new file mode 100644
index 00000000..94b8701c
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz
new file mode 100644
index 00000000..636120ba
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json
new file mode 100644
index 00000000..351aa01c
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json
@@ -0,0 +1,44 @@
+{
+ "n_steps": 1,
+ "cum_reward": 1.0,
+ "cum_raw_reward": 0,
+ "err_msg": null,
+ "stack_trace": null,
+ "stats.cum_steps": 2,
+ "stats.cum_n_token_goal": 10,
+ "stats.max_n_token_goal": 10,
+ "stats.cum_n_token_url": 23,
+ "stats.max_n_token_url": 23,
+ "stats.cum_n_token_focused_element_bid": 1,
+ "stats.max_n_token_focused_element_bid": 1,
+ "stats.cum_n_token_last_action": 0,
+ "stats.max_n_token_last_action": 0,
+ "stats.cum_n_token_last_action_error": 0,
+ "stats.max_n_token_last_action_error": 0,
+ "stats.cum_n_token_dom_txt": 1257,
+ "stats.max_n_token_dom_txt": 1257,
+ "stats.cum_n_token_axtree_txt": 75,
+ "stats.max_n_token_axtree_txt": 75,
+ "stats.cum_n_token_pruned_html": 658,
+ "stats.max_n_token_pruned_html": 658,
+ "stats.cum_n_retry_llm": 1,
+ "stats.max_n_retry_llm": 1,
+ "stats.cum_n_retry": 0.0,
+ "stats.max_n_retry": 0.0,
+ "stats.cum_busted_retry": 0,
+ "stats.max_busted_retry": 0,
+ "stats.cum_input_tokens": 1594,
+ "stats.max_input_tokens": 1594,
+ "stats.cum_output_tokens": 64,
+ "stats.max_output_tokens": 64,
+ "stats.cum_cost": 0.00027749999999999997,
+ "stats.max_cost": 0.00027749999999999997,
+ "stats.cum_n_token_agent_messages": 1653,
+ "stats.max_n_token_agent_messages": 1653,
+ "stats.cum_step_elapsed": 5.879024505615234,
+ "stats.max_step_elapsed": 5.879024505615234,
+ "stats.cum_agent_elapsed": 3.029170036315918,
+ "stats.max_agent_elapsed": 3.029170036315918,
+ "terminated": true,
+ "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl
new file mode 100644
index 00000000..3399e40c
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz
new file mode 100644
index 00000000..ef19e47f
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt
new file mode 100644
index 00000000..512944ab
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt
@@ -0,0 +1,287 @@
+Faker==30.6.0
+Farama-Notifications==0.0.4
+Flask==3.0.3
+GitPython==3.1.43
+Jinja2==3.1.4
+MarkupSafe==2.1.5
+PyYAML==6.0.2
+Pygments==2.18.0
+SQLAlchemy==2.0.36
+Send2Trash==1.8.3
+Werkzeug==3.0.4
+agentlab==0.3.2
+agentlab==0.3.2
+aiofiles==23.2.1
+aiohappyeyeballs==2.4.3
+aiohttp-cors==0.7.0
+aiohttp==3.10.10
+aiolimiter==1.1.0
+aiosignal==1.3.1
+annotated-types==0.7.0
+anthropic==0.37.1
+anyio==4.6.2.post1
+argcomplete==3.5.1
+argon2-cffi-bindings==21.2.0
+argon2-cffi==23.1.0
+arrow==1.3.0
+asttokens==2.4.1
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beartype==0.12.0
+beautifulsoup4==4.12.3
+black==24.2.0
+blacken-docs==1.19.0
+bleach==6.1.0
+blinker==1.8.2
+browsergym-assistantbench==0.12.0
+browsergym-core==0.12.0
+browsergym-experiments==0.12.0
+browsergym-miniwob==0.12.0
+browsergym-visualwebarena==0.12.0
+browsergym-webarena==0.12.0
+browsergym-workarena==0.4.1
+browsergym==0.12.0
+cachetools==5.5.0
+certifi==2024.8.30
+cffi==1.17.1
+cfgv==3.4.0
+charset-normalizer==3.4.0
+click==8.1.7
+cloudpickle==3.1.0
+colorama==0.4.6
+colorama==0.4.6
+colorful==0.5.6
+comm==0.2.2
+contexttimer==0.3.3
+contourpy==1.3.0
+cycler==0.12.1
+dask==2024.10.0
+dataclasses-json==0.6.7
+datasets==3.0.1
+debugpy==1.8.7
+decorator==5.1.1
+defusedxml==0.7.1
+dill==0.3.8
+distlib==0.3.9
+distributed==2024.10.0
+distro==1.9.0
+english-words==2.0.1
+evaluate==0.4.3
+execnet==2.1.1
+executing==2.1.0
+fastapi==0.115.2
+fastjsonschema==2.20.0
+ffmpy==0.4.0
+filelock==3.16.1
+fonttools==4.54.1
+fqdn==1.5.1
+frozenlist==1.4.1
+fsspec==2024.6.1
+gitdb==4.0.11
+google-api-core==2.23.0
+google-auth==2.36.0
+googleapis-common-protos==1.66.0
+gradio==5.7.1
+gradio_client==1.5.0
+greenlet==3.0.0
+grpcio==1.68.0
+gymnasium==1.0.0
+h11==0.14.0
+httpcore==1.0.6
+httpx==0.27.2
+huggingface-hub==0.26.0
+identify==2.6.1
+idna==3.10
+imageio==2.36.0
+importlib_resources==6.4.5
+iniconfig==2.0.0
+ipykernel==6.29.5
+ipython==8.28.0
+isoduration==20.11.0
+itsdangerous==2.2.0
+jedi==0.19.1
+jiter==0.6.1
+joblib==1.4.2
+json5==0.9.25
+jsonpatch==1.33
+jsonpointer==3.0.0
+jsonschema-specifications==2024.10.1
+jsonschema==4.23.0
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.5
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+kiwisolver==1.4.7
+langchain-community==0.3.3
+langchain-core==0.3.12
+langchain-text-splitters==0.3.0
+langchain==0.3.4
+langsmith==0.1.136
+lazy_loader==0.4
+libvisualwebarena==0.0.14
+libwebarena==0.0.3
+linkify-it-py==2.0.3
+locket==1.0.0
+lxml==5.3.0
+markdown-it-py==3.0.0
+marshmallow==3.23.0
+matplotlib-inline==0.1.7
+matplotlib==3.9.2
+mdit-py-plugins==0.4.2
+mdurl==0.1.2
+memray==1.14.0
+mistune==3.0.2
+mpmath==1.3.0
+msgpack==1.1.0
+multidict==6.1.0
+multiprocess==0.70.16
+mypy-extensions==1.0.0
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+networkx==3.4.1
+nltk==3.9.1
+nodeenv==1.9.1
+notebook_shim==0.2.4
+numpy==1.26.4
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.52.0
+opencensus-context==0.1.3
+opencensus==0.11.4
+orjson==3.10.7
+overrides==7.7.0
+packaging==24.1
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+partd==1.4.2
+pathspec==0.12.1
+pexpect==4.9.0
+pillow==10.4.0
+pip==24.2
+platformdirs==4.3.6
+playwright==1.39.0
+pluggy==1.5.0
+portalocker==2.10.1
+pre_commit==4.0.1
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+propcache==0.2.0
+proto-plus==1.25.0
+protobuf==5.28.3
+psutil==6.1.0
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-spy==0.4.0
+pyarrow==17.0.0
+pyasn1==0.6.1
+pyasn1_modules==0.4.1
+pycparser==2.22
+pydantic-settings==2.6.0
+pydantic==2.9.2
+pydantic_core==2.23.4
+pydub==0.25.1
+pyee==11.0.1
+pyparsing==3.2.0
+pytest-base-url==2.1.0
+pytest-playwright==0.5.2
+pytest-xdist==3.6.1
+pytest==7.3.2
+python-dateutil==2.9.0.post0
+python-dotenv==1.0.1
+python-json-logger==2.0.7
+python-multipart==0.0.12
+python-slugify==8.0.4
+pytz==2024.2
+pyzmq==26.2.0
+ray==2.39.0
+referencing==0.35.1
+regex==2024.9.11
+requests-toolbelt==1.0.0
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.2
+rpds-py==0.20.0
+rsa==4.9
+ruff==0.7.0
+sacrebleu==2.4.3
+safehttpx==0.1.6
+safetensors==0.4.5
+scikit-image==0.24.0
+scipy==1.14.1
+semantic-version==2.10.0
+setproctitle==1.2.2
+setuptools==75.1.0
+shellingham==1.5.4
+six==1.16.0
+smart-open==7.0.5
+smmap==5.0.1
+sniffio==1.3.1
+sortedcontainers==2.4.0
+soupsieve==2.6
+stack-data==0.6.3
+starlette==0.40.0
+sympy==1.13.1
+tabulate==0.9.0
+tblib==3.0.0
+tenacity==9.0.0
+terminado==0.18.1
+text-generation==0.7.0
+text-unidecode==1.3
+textual==0.86.2
+tifffile==2024.9.20
+tiktoken==0.8.0
+tinycss2==1.3.0
+tokenize-rt==6.0.0
+tokenizers==0.20.1
+tomlkit==0.12.0
+toolz==1.0.0
+torch==2.5.1
+tornado==6.4.1
+tqdm==4.66.5
+traitlets==5.14.3
+transformers==4.45.2
+triton==3.1.0
+typer==0.12.5
+types-python-dateutil==2.9.0.20241003
+types-tqdm==4.66.0.20240417
+typing-inspect==0.9.0
+typing_extensions==4.12.2
+tzdata==2024.2
+uc-micro-py==1.0.3
+uri-template==1.3.0
+urllib3==2.2.3
+uvicorn==0.32.0
+virtualenv==20.27.0
+wcwidth==0.2.13
+webcolors==24.8.0
+webencodings==0.5.1
+weblinx-browsergym==0.0.1.dev10
+weblinx==0.3.2
+websocket-client==1.8.0
+websockets==12.0
+wheel==0.44.0
+wrapt==1.16.0
+xxhash==3.5.0
+yarl==1.15.5
+zict==3.0.0
\ No newline at end of file
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz
new file mode 100644
index 00000000..2aac84fd
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz
new file mode 100644
index 00000000..a426bd07
Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz differ
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv
new file mode 100644
index 00000000..85b34311
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv
@@ -0,0 +1,2 @@
+avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost
+1.0,0.0,1.0,1/1,0,0.0003
diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json
new file mode 100644
index 00000000..a17872af
--- /dev/null
+++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json
@@ -0,0 +1,44 @@
+{
+ "n_steps": 1,
+ "cum_reward": 1.0,
+ "cum_raw_reward": 0,
+ "err_msg": null,
+ "stack_trace": null,
+ "stats.cum_steps": 2,
+ "stats.cum_n_token_goal": 10,
+ "stats.max_n_token_goal": 10,
+ "stats.cum_n_token_url": 23,
+ "stats.max_n_token_url": 23,
+ "stats.cum_n_token_focused_element_bid": 1,
+ "stats.max_n_token_focused_element_bid": 1,
+ "stats.cum_n_token_last_action": 0,
+ "stats.max_n_token_last_action": 0,
+ "stats.cum_n_token_last_action_error": 0,
+ "stats.max_n_token_last_action_error": 0,
+ "stats.cum_n_token_dom_txt": 1250,
+ "stats.max_n_token_dom_txt": 1250,
+ "stats.cum_n_token_axtree_txt": 71,
+ "stats.max_n_token_axtree_txt": 71,
+ "stats.cum_n_token_pruned_html": 651,
+ "stats.max_n_token_pruned_html": 651,
+ "stats.cum_n_retry_llm": 1,
+ "stats.max_n_retry_llm": 1,
+ "stats.cum_n_retry": 0.0,
+ "stats.max_n_retry": 0.0,
+ "stats.cum_busted_retry": 0,
+ "stats.max_busted_retry": 0,
+ "stats.cum_input_tokens": 1589,
+ "stats.max_input_tokens": 1589,
+ "stats.cum_output_tokens": 63,
+ "stats.max_output_tokens": 63,
+ "stats.cum_cost": 0.00027614999999999996,
+ "stats.max_cost": 0.00027614999999999996,
+ "stats.cum_n_token_agent_messages": 1641,
+ "stats.max_n_token_agent_messages": 1641,
+ "stats.cum_step_elapsed": 5.891982078552246,
+ "stats.max_step_elapsed": 5.891982078552246,
+ "stats.cum_agent_elapsed": 3.4504799842834473,
+ "stats.max_agent_elapsed": 3.4504799842834473,
+ "terminated": true,
+ "truncated": false
+}
\ No newline at end of file
diff --git a/tests/data/error_analysis/error_report_trial_1_of_3.md b/tests/data/error_analysis/error_report_trial_1_of_3.md
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/data/error_analysis/result_df_trial_1_of_3.csv b/tests/data/error_analysis/result_df_trial_1_of_3.csv
new file mode 100644
index 00000000..4095252c
--- /dev/null
+++ b/tests/data/error_analysis/result_df_trial_1_of_3.csv
@@ -0,0 +1,5 @@
+env.task_name,agent.agent_name,env.benchmark,index,exp_dir,agent.chat_model.model_name,agent.chat_model.max_total_tokens,agent.chat_model.max_input_tokens,agent.chat_model.max_new_tokens,agent.chat_model.temperature,agent.chat_model.vision_support,agent.chat_model.deployment_name,agent.flags.obs.use_html,agent.flags.obs.use_ax_tree,agent.flags.obs.use_tabs,agent.flags.obs.use_focused_element,agent.flags.obs.use_error_logs,agent.flags.obs.use_history,agent.flags.obs.use_past_error_logs,agent.flags.obs.use_action_history,agent.flags.obs.use_think_history,agent.flags.obs.use_diff,agent.flags.obs.html_type,agent.flags.obs.use_screenshot,agent.flags.obs.use_som,agent.flags.obs.extract_visible_tag,agent.flags.obs.extract_clickable_tag,agent.flags.obs.extract_coords,agent.flags.obs.filter_visible_elements_only,agent.flags.obs.openai_vision_detail,agent.flags.obs.filter_with_bid_only,agent.flags.obs.filter_som_only,agent.flags.action.action_set.subsets,agent.flags.action.action_set.multiaction,agent.flags.action.action_set.strict,agent.flags.action.action_set.retry_with_force,agent.flags.action.action_set.demo_mode,agent.flags.action.long_description,agent.flags.action.individual_examples,agent.flags.action.multi_actions,agent.flags.action.is_strict,agent.flags.use_plan,agent.flags.use_criticise,agent.flags.use_thinking,agent.flags.use_memory,agent.flags.use_concrete_example,agent.flags.use_abstract_example,agent.flags.use_hints,agent.flags.enable_chat,agent.flags.max_prompt_tokens,agent.flags.be_cautious,agent.flags.extra_instructions,agent.flags.add_missparsed_messages,agent.flags.max_trunc_itr,agent.flags.flag_group,agent.max_retry,env.task_seed,env.max_steps,env.headless,env.record_video,env.wait_for_user_message,env.viewport,env.slow_mo,env.storage_state,env.task_kwargs,exp_name,enable_debug,err_msg,stack_trace,order,logging_level,logging_level_stdout,exp_id,depends_on,save_screenshot,save_som,n_steps,cum_reward,cum_raw_reward,stats.cum_steps,stats.cum_n_token_goal,st
ats.max_n_token_goal,stats.cum_n_token_url,stats.max_n_token_url,stats.cum_n_token_focused_element_bid,stats.max_n_token_focused_element_bid,stats.cum_n_token_last_action,stats.max_n_token_last_action,stats.cum_n_token_last_action_error,stats.max_n_token_last_action_error,stats.cum_n_token_dom_txt,stats.max_n_token_dom_txt,stats.cum_n_token_axtree_txt,stats.max_n_token_axtree_txt,stats.cum_n_token_pruned_html,stats.max_n_token_pruned_html,stats.cum_n_retry_llm,stats.max_n_retry_llm,stats.cum_n_retry,stats.max_n_retry,stats.cum_busted_retry,stats.max_busted_retry,stats.cum_input_tokens,stats.max_input_tokens,stats.cum_output_tokens,stats.max_output_tokens,stats.cum_cost,stats.max_cost,stats.cum_n_token_agent_messages,stats.max_n_token_agent_messages,stats.cum_step_elapsed,stats.max_step_elapsed,stats.cum_agent_elapsed,stats.max_agent_elapsed,terminated,truncated,err_key
+miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,1,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,7,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,True,,,2,10,30,dd9e91e0-75ef-4bb4-9db1-f91f06848dcb,(),True,False,2,1.0,0,3,12,6,48,24,2,1,4,4,0,0,1902,952,400,201,650,326,2,1,0.0,0.0,0,0,2789,1404,128,65,0.00049515,0.00024839999999999997,2902,1459,6.860883951187134,5.8696064949035645,3.769465684890747,2.946484327316284,True,False,
+miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,2,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,20,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,True,,,3,10,30,187f0f01-a240-419c-a65e-0058a14f639d,(),True,False,3,1.0,0,4,27,9,72,24,3,1,8,4,0,0,2892,966,667,223,1014,340,3,1,0.0,0.0,0,0,4339,1464,225,84,0.00078585,0.0002646,4512,1517,3.0203144550323486,1.3659462928771973,3.8209800720214844,1.8219048976898193,True,False,
+miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,0,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,28,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,True,,,0,10,30,b403cfca-4647-48fb-98f2-57e94306a38a,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1250,1250,71,71,651,651,1,1,0.0,0.0,0,0,1589,1589,63,63,0.00027614999999999996,0.00027614999999999996,1641,1641,5.891982078552246,5.891982078552246,3.4504799842834473,3.4504799842834473,True,False,
+miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,3,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,14,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,True,,,1,10,30,4c89cb70-0bf8-42c2-be39-a9c1a39ffe8d,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1257,1257,75,75,658,658,1,1,0.0,0.0,0,0,1594,1594,64,64,0.00027749999999999997,0.00027749999999999997,1653,1653,5.879024505615234,5.879024505615234,3.029170036315918,3.029170036315918,True,False,
diff --git a/tests/data/error_analysis/study.pkl.gz b/tests/data/error_analysis/study.pkl.gz
new file mode 100644
index 00000000..8611c7d3
Binary files /dev/null and b/tests/data/error_analysis/study.pkl.gz differ
diff --git a/tests/data/error_analysis/summary_df_trial_1_of_3.csv b/tests/data/error_analysis/summary_df_trial_1_of_3.csv
new file mode 100644
index 00000000..545cfc29
--- /dev/null
+++ b/tests/data/error_analysis/summary_df_trial_1_of_3.csv
@@ -0,0 +1,2 @@
+agent.agent_name,env.benchmark,avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost
+GenericAgent-gpt-4o-mini,miniwob,1.0,0.0,1.75,4/4,0,0.0018