diff --git a/pyproject.toml b/pyproject.toml index 2a1e06c3..782b1f26 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,3 +57,4 @@ exclude = ''' [project.scripts] agentlab-assistant = "agentlab.ui_assistant:main" agentlab-xray = "agentlab.analyze.agent_xray:main" +agentlab-analyze = "agentlab.analyze.error_analysis.pipeline:main" diff --git a/requirements.txt b/requirements.txt index c598b342..a59d4a4a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -black[jupyter]>=24.2.0 +black[jupyter]>=24.2.0,<25 blacken-docs pre-commit pytest==7.3.2 diff --git a/src/agentlab/agents/generic_agent/agent_configs.py b/src/agentlab/agents/generic_agent/agent_configs.py index e21ada58..f34d630c 100644 --- a/src/agentlab/agents/generic_agent/agent_configs.py +++ b/src/agentlab/agents/generic_agent/agent_configs.py @@ -257,7 +257,7 @@ ) AGENT_4o_MINI = GenericAgentArgs( - chat_model_args=CHAT_MODEL_ARGS_DICT["openai/gpt-4o-mini-2024-07-18"], + chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-4o-mini-2024-07-18"], flags=FLAGS_GPT_4o, ) AGENT_CLAUDE_SONNET_35 = GenericAgentArgs( diff --git a/src/agentlab/analyze/agent_xray.py b/src/agentlab/analyze/agent_xray.py index 9764898c..7466db87 100644 --- a/src/agentlab/analyze/agent_xray.py +++ b/src/agentlab/analyze/agent_xray.py @@ -1,4 +1,5 @@ import base64 +import json import os import traceback from copy import deepcopy @@ -30,6 +31,32 @@ TASK_SEED_KEY = "env.task_seed" +def dict_to_markdown(data, level=1): + """ + Convert a nested dictionary to a Markdown string with hierarchical headers. + + Parameters: + data (dict): The dictionary to convert. + level (int): The current header level (default is 1). + + Returns: + str: The formatted Markdown string. 
+ """ + markdown = "" + + for key, value in data.items(): + if isinstance(value, dict): + # Add a header for the key and recursively process the dictionary + markdown += f"{'#' * level} {key}\n" + markdown += dict_to_markdown(value, level + 1) + else: + # Add the key-value pair with indentation + markdown += f"{'#' * level} {key}\n" + markdown += f" {value}\n" + + return markdown + + def display_table(df: pd.DataFrame): df = df.copy() df.columns = clean_column_names(df.columns) @@ -358,6 +385,9 @@ def run_gradio(results_dir: Path): with gr.Tab("Task Error") as tab_error: task_error = gr.Markdown() + with gr.Tab("Error Analysis") as tab_error_analysis: + error_analysis = gr.Markdown() + with gr.Tab("Logs") as tab_logs: logs = gr.Code(language=None, **code_args) @@ -485,6 +515,7 @@ def run_gradio(results_dir: Path): tab_axtree.select(fn=update_axtree, outputs=axtree_code) tab_chat.select(fn=update_chat_messages, outputs=chat_messages) tab_error.select(fn=update_task_error, outputs=task_error) + tab_error_analysis.select(fn=update_error_analysis, outputs=error_analysis) tab_logs.select(fn=update_logs, outputs=logs) tab_stats.select(fn=update_stats, outputs=stats) tab_agent_info_html.select(fn=update_agent_info_html, outputs=agent_info_html) @@ -612,6 +643,20 @@ def update_task_error(): return "No Task Error" +def update_error_analysis(): + global info + try: + error_analysis = info.exp_result.exp_dir / "error_analysis.json" + if not error_analysis.exists(): + return "No Error Analysis Found" + with error_analysis.open("r") as f: + json_data = json.load(f) + res = dict_to_markdown(json_data) + return res + except FileNotFoundError: + return "No Error Analysis" + + def update_logs(): global info try: diff --git a/src/agentlab/analyze/error_analysis/__init__.py b/src/agentlab/analyze/error_analysis/__init__.py new file mode 100644 index 00000000..e69de29b diff --git 
a/src/agentlab/analyze/error_analysis/pipeline.py b/src/agentlab/analyze/error_analysis/pipeline.py new file mode 100644 index 00000000..32e5e9df --- /dev/null +++ b/src/agentlab/analyze/error_analysis/pipeline.py @@ -0,0 +1,110 @@ +import json +import re +from dataclasses import dataclass +from pathlib import Path +from typing import Generator + +from bgym import ExpResult + +from agentlab.analyze.error_analysis.summarizer import ( + ChangeSummarizer, + EpisodeErrorSummarizer, + EpisodeSummarizer, +) +from agentlab.analyze.inspect_results import yield_all_exp_results + + +@dataclass +class Analyzer: + prompt: str + llm = None + + def __call__(self, *args, **kwds): + return "analysis" + + +def analyze(exp_result, episode_summarizer, save_analysis_func): + error_analysis = episode_summarizer(exp_result) + save_analysis_func(exp_result, error_analysis) + + +@dataclass +class ErrorAnalysisPipeline: + exp_dir: Path + filter: str = None + episode_summarizer: EpisodeSummarizer = None + + def filter_exp_results(self) -> Generator[ExpResult, None, None]: + # TODO:(thibault) improve filtering + exp_results = yield_all_exp_results(self.exp_dir) + for exp_result in exp_results: + if self.filter is None or self.filter in str(exp_result.exp_dir): + yield exp_result + + def run_analysis(self, parallel=False, jobs=-1): + filtered_results = self.filter_exp_results() + + if parallel: + import joblib + + joblib.Parallel(n_jobs=jobs, backend="threading")( + joblib.delayed(analyze)(exp_result, self.episode_summarizer, self.save_analysis) + for exp_result in filtered_results + ) + + else: + for exp_result in filtered_results: + error_analysis = self.episode_summarizer(exp_result) + self.save_analysis(exp_result, error_analysis) + + def save_analysis(self, exp_result: ExpResult, error_analysis: dict, exists_ok=True): + """Save the analysis to json""" + analysis_path = exp_result.exp_dir / "error_analysis.json" + if not exists_ok and analysis_path.exists(): + raise 
FileExistsError(f"{analysis_path} already exists") + with analysis_path.open("w") as f: + json.dump(error_analysis, f, indent=4) + + +AXTREE_FORMATTER = lambda x: x.get("axtree_txt", "No AXTREE available") +HTML_FORMATTER = lambda x: x.get("pruned_html", "No HTML available") + + +def main(): + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("-e", "--exp_dir", type=str) + parser.add_argument("-f", "--filter", type=str, default=None) + parser.add_argument("-p", "--parallel", action="store_true") + parser.add_argument("-j", "--jobs", type=int, default=-1) + parser.add_argument("-g", "--guess_success", action="store_true") + + args = parser.parse_args() + + assert args.exp_dir is not None, "Please provide an exp_dir, e.g., -e /path/to/exp_dir" + + exp_dir = Path(args.exp_dir) + filter = args.filter + parallel = args.parallel + jobs = args.jobs + guess_success = args.guess_success + + from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT + + llm = CHAT_MODEL_ARGS_DICT["azure/gpt-4o-2024-08-06"].make_model() + + pipeline = ErrorAnalysisPipeline( + exp_dir=exp_dir, + filter=filter, + episode_summarizer=EpisodeErrorSummarizer( + ChangeSummarizer(llm, AXTREE_FORMATTER), llm, guess_success=guess_success + ), + ) + + pipeline.run_analysis(parallel=parallel, jobs=jobs) + + +if __name__ == "__main__": + + main() diff --git a/src/agentlab/analyze/error_analysis/summarizer.py b/src/agentlab/analyze/error_analysis/summarizer.py new file mode 100644 index 00000000..2919f052 --- /dev/null +++ b/src/agentlab/analyze/error_analysis/summarizer.py @@ -0,0 +1,178 @@ +from dataclasses import dataclass + +from bgym import ExpResult, StepInfo + +from agentlab.analyze.error_analysis.summarizer_prompts import ( + CHANGE_SUMMARIZER_PROMPT, + ERROR_CLASSIFICATION_PROMPT, + ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT, +) +from agentlab.llm.llm_utils import json_parser, parse_html_tags +from agentlab.llm.tracking import set_tracker + + +def _diff(past_obs, 
current_obs): + """TODO: Implement the diff function. + + Returns a diff version of current_obs compares to past_obs, unless there is too many changes. + + Args: + past_obs: The past observation. + current_obs: The current observation. + + Raises: + ValueError: Not implemented yet. + """ + raise ValueError("Not implemented yet.") + + +@dataclass +class ChangeSummarizer: + + llm: callable # language model + obs_formatter: callable = lambda x: x.get("dom_txt", "No AXTREE available") + use_diff: bool = False + + def summarize(self, obs: StepInfo, next_obs: StepInfo, past_summaries: list[str]) -> str: + """Produces, a summary of the effect of an action.""" + obs_message = self.obs_formatter(obs.obs) + next_obs_message = self.obs_formatter(next_obs.obs) + + action = obs.action + + goal = obs.obs["goal"] # Use goal object from agentlab + # TODO(thibault): switch to 'goal_object' + # Outsource everything to formatter + + if self.use_diff: + next_obs_message = _diff(obs_message, next_obs_message) + + return self.parse( + self.llm( + self.make_prompt( + obs_message, + action, + next_obs_message, + past_summaries, + goal, + obs.obs.get("plan", "No plan available"), + ) + )["content"] + ) + + def make_prompt( + self, past_obs_message, action, current_obs_message, past_summaries, goal, plan + ): + """TODO: Implement the prompt.""" + return CHANGE_SUMMARIZER_PROMPT.format( + goal=goal, + plan=plan, + past_observation=past_obs_message, + current_observation=current_obs_message, + past_summaries=past_summaries, + action=action, + ) + + def parse(self, raw_output: str) -> dict: + parsed_result = parse_html_tags( + raw_output, keys=["changeSummary", "actionAssessment", "explanation", "suggestion"] + )[0] + return parsed_result + + +@dataclass +class EpisodeAnalysis: + analysis: str # complete analysis of the episode + summary: str # short summary of the analysis + categories: dict[str, float] # score for each category e.g. 
type of error or difficulty levels + + +@dataclass +class EpisodeSummarizer: + + change_summarizer: ChangeSummarizer = None + llm: callable = None + parser: callable = lambda x: json_parser(x)[0] + guess_success: bool = False + + def make_prompt(self, exp_results: ExpResult, summaries: list[str]): ... + + def __call__(self, exp_results: ExpResult) -> EpisodeAnalysis: + """Run Change Summarizer for every step in the episode or extract a pre-computed one.""" + + if not self.guess_success: + if exp_results.steps_info[-1].reward == 1: + return {"analysis": "Success", "summaries": {}} + + with set_tracker("summary") as summaries_tracker: + summaries = self.make_change_summaries(exp_results) + prompt = self.make_prompt(exp_results, summaries) + + with set_tracker("analysis") as analysis_tracker: + raw_analysis = self.llm(prompt)["content"] + analysis = self.parse(raw_analysis) + res = { + "analysis": analysis, + "summaries": {i: a for i, a in enumerate(summaries)}, + } + res.update(analysis_tracker.stats) + res.update(summaries_tracker.stats) + return res + + def make_change_summaries(self, exp_result: ExpResult) -> list[str]: + summaries = [] # type: list[str] + # this assumes that there is always an extra step at the end of the episode + # it is generally the case, but exps can sometimes fail in a weird way and not save the last step_info + # TODO:(thibault) make some checks or w/e + for step, next_step in zip(exp_result.steps_info[:-1], exp_result.steps_info[1:]): + summaries.append(self.change_summarizer.summarize(step, next_step, summaries)) + return summaries + + def parse(self, raw_output: str) -> dict: + parsed_result = parse_html_tags(raw_output, keys=["explanation", "errorCategory"])[0] + return parsed_result + + +@dataclass +class EpisodeErrorSummarizer(EpisodeSummarizer): + + change_summarizer: ChangeSummarizer = None + + def make_prompt(self, exp_results: ExpResult, summaries: list[str]): + """TODO: Implement the prompt.""" + goal = 
exp_results.steps_info[0].obs["goal"] + + def format_summary(summary): + res = "" + for key, value in summary.items(): + res += f"{key}: {value}\n" + return res + + txt_summaries = "\n".join([format_summary(summary) for summary in summaries]) + + actions = [step.action for step in exp_results.steps_info[:-1]] + action_errors = [ + step.obs["last_action_error"] for step in exp_results.steps_info[1:] + ] + + txt_actions = "\n".join( + [ + f"Action: {action}\nAction Error: {action_error}" + for action, action_error in zip(actions, action_errors) + ] + ) + + extra_info = exp_results.steps_info[-1].task_info + + prompt = ( + ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT + if self.guess_success + else ERROR_CLASSIFICATION_PROMPT + ) + + return prompt.format( + goal=goal, + historical_summaries=txt_summaries, + action_history=txt_actions, + extra_info=extra_info, + ) diff --git a/src/agentlab/analyze/error_analysis/summarizer_prompts.py b/src/agentlab/analyze/error_analysis/summarizer_prompts.py new file mode 100644 index 00000000..a0df9fc9 --- /dev/null +++ b/src/agentlab/analyze/error_analysis/summarizer_prompts.py @@ -0,0 +1,268 @@ +CHANGE_SUMMARIZER_PROMPT = """ +You are a specialized 'change summarizer' model. At a given step in the agent's interaction with the website, +you will receive the following pieces of information: + +1. The user's MAIN GOAL (e.g., "Open a GitLab issue with label 'help wanted'"). +2. The AGENT'S PREVIOUS OBSERVATION (HTML or AX Tree snippet) or a 'DIFF' that shows what changed since the last step, and the corresponding change summaries. +3. The AGENT'S CURRENT OBSERVATION (HTML or AX Tree snippet). +4. The ACTION the agent just took (e.g., "Clicked the button labeled 'Show report'"). +5. (Optionally) The agent's CHAIN OF THOUGHT or short planning notes for this single step, if available. 
+ +YOUR TASK (each step): +A) SUMMARIZE THE CHANGE + - Describe what visibly changed between the previous observation (or diff) and the current observation. + For example, did a new panel open, did the form reset, did nothing happen, etc.? + +B) ASSESS THE ACTION + - Decide whether the agent's action seems helpful or correct given the user's main goal, + or if it appears incorrect/unhelpful. + - Briefly explain why. + +OUTPUT FORMAT (per step): +Return your analysis as a JSON-like structure, for example: + +<changeSummary>A new search results panel appeared on the right side.</changeSummary> +<actionAssessment>Correct</actionAssessment> +<explanation>Clicking 'Search' was appropriate to display the results.</explanation> + +Or for an incorrect action: + +<changeSummary>The page reloaded but the date fields were reset to defaults.</changeSummary> +<actionAssessment>Incorrect</actionAssessment> +<explanation>The agent should have fixed the date format first instead of re-clicking 'Show report'.</explanation> +<suggestion>Correct the date format or check for error messages.</suggestion> + + +Please use single quotes '' to quote elements from the page, so as not to create parsing issues. + +Please follow this structure at every step. Keep your responses concise and clear. Below are the details. + +Goal: {goal} + +LLM Plan: {plan} + +Current Observation: {past_observation} + +Next Observation: {current_observation} + +Past summaries: {past_summaries} + +Action: {action} +""" + +ERROR_CLASSIFICATION_PROMPT_SUCCESS_OR_NOT = """ +You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. +Below are the high-level definitions of each category, +followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.), +a set of labeled examples for reference (few-shot), and finally the classification task you must complete. + +-------------------------------------------------------------------------------- +TAXONOMY DEFINITIONS +-------------------------------------------------------------------------------- + +1. 
Navigation & Planning Errors + The agent cannot construct or execute a correct sequence of actions to reach its goal + (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). + +2. Interaction Execution Errors + The agent enters data in the wrong format, forgets to click "Submit" after typing, + repeats the same failing action without adaptation, or loses track of the changing webpage state. + +3. Information Processing Errors + The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), + misconstrues relationships between pieces of information, or fails to validate data against task requirements. + +4. Observation & Action Errors + The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) + or misaligns its actions (clicks the wrong element or stale link). + +5. Task Understanding Errors + The agent misreads or misunderstands the user's objective (goal interpretation), + loses crucial context (context loss), or performs actions beyond or short of the intended scope. + +6. Reasoning Failures + The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, + or fails to prioritize important subtasks when handling complex goals. + +-------------------------------------------------------------------------------- +INPUT DESCRIPTION +-------------------------------------------------------------------------------- + +You will receive the following for each scenario: +1. User Goal + - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). + +2. Historical change summaries + - A list of summaries of changes in the observation that the agent has seen during the course of actions. + +3. Action History + - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) + along with immediate outcomes or errors. 
+ +Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories. + +-------------------------------------------------------------------------------- +FEW-SHOT CLASSIFICATION EXAMPLES +-------------------------------------------------------------------------------- + +1) EXAMPLE A (Interaction Execution) + • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. + Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. + • Classification: ["Interaction Execution"] + • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action + without adaptation ("Action Repetition"). + +2) EXAMPLE B (Task Understanding) + • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' + that are older than 30 days and add a comment saying 'I can help fix this.'" + The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue + with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. + • Classification: ["Task Understanding"] + • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, + it focused on creating a new issue. This is a misinterpretation of the instructions, + not a mechanical error in clicking or input format. + +-------------------------------------------------------------------------------- +CLASSIFICATION TASK +-------------------------------------------------------------------------------- + +1. Read through: + - The planning and thought history + - The action history + - The current HTML or AX Tree observation + - The user goal + +2. In case you think the task was unsuccessful, decide the category, or a combination thereof, under which the reason for failure lies. 
+ If the task is successful, you can keep the error category as blank. + +3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. + +Output format example for an unsuccessful interaction: + +<explanation>The agent opened the wrong GitLab page and never recovered...</explanation> +<success>False</success> +<errorCategory>["Navigation & Planning"]</errorCategory> + +Output format example for a successful interaction: + +<explanation>The agent opened the correct GitLab page and ...</explanation> +<success>True</success> +<errorCategory>[]</errorCategory> + +Please follow this structure at every step. Keep your responses concise and clear. + +Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant. + +Overall goal: {goal} + +Historical change summaries: {historical_summaries} + +Action history: {action_history} + +Extra information: {extra_info} +""" + + +ERROR_CLASSIFICATION_PROMPT = """ +You are an expert evaluator that classifies web agent failures according to a predefined taxonomy. +Below are the high-level definitions of each category, +followed by an explanation of the inputs of the interaction you will receive (planning history, chain of thought, etc.), +a set of labeled examples for reference (few-shot), and finally the classification task you must complete. + +-------------------------------------------------------------------------------- +TAXONOMY DEFINITIONS +-------------------------------------------------------------------------------- + +1. Navigation & Planning Errors + The agent cannot construct or execute a correct sequence of actions to reach its goal + (e.g., getting lost on a website, failing to recover from missteps, or using incorrect search terms). + +2. Interaction Execution Errors + The agent enters data in the wrong format, forgets to click "Submit" after typing, + repeats the same failing action without adaptation, or loses track of the changing webpage state. + +3. 
Information Processing Errors + The agent misreads or misinterprets visible data (e.g., extracting the wrong field values), + misconstrues relationships between pieces of information, or fails to validate data against task requirements. + +4. Observation & Action Errors + The agent fails to observe important updates in the environment (e.g., not noticing the page reloaded) + or misaligns its actions (clicks the wrong element or stale link). + +5. Task Understanding Errors + The agent misreads or misunderstands the user's objective (goal interpretation), + loses crucial context (context loss), or performs actions beyond or short of the intended scope. + +6. Reasoning Failures + The agent's logic is flawed (logical inference errors), behaves inconsistently across multiple steps, + or fails to prioritize important subtasks when handling complex goals. + +-------------------------------------------------------------------------------- +INPUT DESCRIPTION +-------------------------------------------------------------------------------- + +You will receive the following for each scenario: +1. User Goal + - The original objective provided by the user (e.g., "Open a GitLab issue labeled 'help wanted'"). + +2. Historical change summaries + - A list of summaries of changes in the observation that the agent has seen during the course of actions. + +3. Action History + - A record of the agent's step-by-step actions in the web environment (clicks, form entries, navigations, etc.) + along with immediate outcomes or errors. + +Using these inputs, you must categorize the observed failure (or success) under the appropriate category or categories. + +-------------------------------------------------------------------------------- +FEW-SHOT CLASSIFICATION EXAMPLES +-------------------------------------------------------------------------------- + +1) EXAMPLE A (Interaction Execution) + • Context: The agent repeatedly clicks "Show report" after entering dates in the wrong format. 
+ Each time, the site resets to default dates. The agent never notices and keeps doing the same thing. + • Classification: ["Interaction Execution"] + • Justification: The agent used an invalid input format ("Format Errors"), then repeated the failing action + without adaptation ("Action Repetition"). + +2) EXAMPLE B (Task Understanding) + • Context: The user says, "In the repository myorg/myrepo, locate any issues labeled 'help wanted' + that are older than 30 days and add a comment saying 'I can help fix this.'" + The agent's planning notes mention searching for existing issues but quickly pivot to creating a brand-new issue + with label 'help wanted,' ignoring the user's actual request to find and comment on old issues. + • Classification: ["Task Understanding"] + • Justification: The agent misunderstood the user's goal. Instead of searching for and commenting on existing issues, + it focused on creating a new issue. This is a misinterpretation of the instructions, + not a mechanical error in clicking or input format. + +-------------------------------------------------------------------------------- +CLASSIFICATION TASK +-------------------------------------------------------------------------------- + +1. Read through: + - The planning and thought history + - The action history + - The current HTML or AX Tree observation + - The user goal + +2. Decide the error category, or a combination thereof, under which the reason for failure lies. + +3. Provide a brief explanation justifying your classification, referencing specific steps if helpful. + +Output format example for an interaction: + +<explanation>The agent opened the wrong GitLab page and never recovered...</explanation> +<errorCategory>["Navigation & Planning"]</errorCategory> + +Please follow this structure at every step. Keep your responses concise and clear. + +Below are the details for the interaction. Extra information yields additional information from the environment. It might not always be present or relevant. 
+ +Overall goal: {goal} + +Historical change summaries: {historical_summaries} + +Action history: {action_history} + +Extra information: {extra_info} +""" diff --git a/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py new file mode 100644 index 00000000..af5613bb --- /dev/null +++ b/src/agentlab/analyze/error_analysis/validate_analysis_predictions.py @@ -0,0 +1,31 @@ +from pathlib import Path +from agentlab.analyze.inspect_results import ( + load_result_df, +) +import json + + +def get_aggregate_statistics(exp_dir: Path): + """Get aggregate statistics for the experiment results.""" + results = load_result_df(exp_dir, filter=filter) + + +if __name__ == "__main__": + path = Path( + "/mnt/colab_public/data/ui_copilot/thibault/tmlr_exps/2024-10-23_14-17-47_5_agents_on_workarena_l1" + ) + results = load_result_df(path).reset_index() + results = results.loc[results["agent.chat_model.model_name"].str.contains("anthropic")] + success_predictions = [] + for dir in results["exp_dir"]: + error_analysis = Path(dir) / "error_analysis.json" + if error_analysis.exists(): + with open(error_analysis, "r") as f: + error_analysis = json.load(f) + task_success_prediction_str = error_analysis["analysis"]["success"] + task_success_prediction = True if task_success_prediction_str == "True" else False + success_predictions.append(task_success_prediction) + else: + success_predictions.append(None) + results["success_predictions"] = success_predictions + a = 1 diff --git a/src/agentlab/llm/chat_api.py b/src/agentlab/llm/chat_api.py index 096aae00..567e0798 100644 --- a/src/agentlab/llm/chat_api.py +++ b/src/agentlab/llm/chat_api.py @@ -4,7 +4,7 @@ import time from dataclasses import dataclass from functools import partial -from typing import Optional +from typing import Optional, Union import openai from huggingface_hub import InferenceClient @@ -13,7 +13,7 @@ import agentlab.llm.tracking as tracking 
from agentlab.llm.base_api import AbstractChatModel, BaseModelArgs from agentlab.llm.huggingface_utils import HFBaseChatModel -from agentlab.llm.llm_utils import AIMessage, Discussion +from agentlab.llm.llm_utils import AIMessage, Discussion, HumanMessage def make_system_message(content: str) -> dict: @@ -268,7 +268,13 @@ def __init__( **client_args, ) - def __call__(self, messages: list[dict], n_samples: int = 1, temperature: float = None) -> dict: + def __call__( + self, messages: Union[str, list[dict]], n_samples: int = 1, temperature: float = None + ) -> dict: + + if isinstance(messages, str): + messages = [HumanMessage(messages)] + # Initialize retry tracking attributes self.retries = 0 self.success = False diff --git a/tests/analyze/error_analysis/test_pipeline.py b/tests/analyze/error_analysis/test_pipeline.py new file mode 100644 index 00000000..a2f6295d --- /dev/null +++ b/tests/analyze/error_analysis/test_pipeline.py @@ -0,0 +1,63 @@ +from pathlib import Path + +import pytest +from bgym import ExpResult, StepInfo + +from agentlab.analyze.error_analysis.pipeline import ErrorAnalysisPipeline + +exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis" + + +class MockStepSummarizer: + def summarize( + self, step: StepInfo, action: str, next_step: StepInfo, step_summaries: list[str] + ) -> str: + return f"Agent took action {action} at step {len(step_summaries)}" + + +class MockEpisodeSummarizer: + def __call__(self, exp_result: ExpResult) -> str: + return f"Agent did actions {', '.join(step.action for step in exp_result.steps_info if step.action)}" + + +class MockAnalyzer: + def __call__( + self, exp_result: ExpResult, episode_analysis: str, step_analysis: list[str] + ) -> str: + return {"error": "analysis", "episode": episode_analysis} + + +@pytest.fixture(scope="module") +def pipeline() -> ErrorAnalysisPipeline: + return ErrorAnalysisPipeline( + exp_dir=exp_dir, + filter=None, + episode_summarizer=MockEpisodeSummarizer(), + ) + + +def 
test_yield_no_filter(pipeline: ErrorAnalysisPipeline): + assert len(list(pipeline.filter_exp_results())) == 4 + + +def test_yield_with_filter(pipeline: ErrorAnalysisPipeline): + pattern = "click-dialog" + pipeline.filter = pattern + assert len(list(pipeline.filter_exp_results())) == 2 + pipeline.filter = None + + +def test_save_analysis(pipeline: ErrorAnalysisPipeline): + exp_result = next(pipeline.filter_exp_results()) + + error_analysis = pipeline.episode_summarizer(exp_result) + pipeline.save_analysis(exp_result, error_analysis, exists_ok=False) + + assert (exp_result.exp_dir / "error_analysis.json").exists() + + # remove the file + (exp_result.exp_dir / "error_analysis.json").unlink() + + +if __name__ == "__main__": + test_yield_with_filter() diff --git a/tests/analyze/error_analysis/test_summarizer.py b/tests/analyze/error_analysis/test_summarizer.py new file mode 100644 index 00000000..83418496 --- /dev/null +++ b/tests/analyze/error_analysis/test_summarizer.py @@ -0,0 +1,30 @@ +from pathlib import Path + +import pytest +from bgym import ExpResult, StepInfo + +from agentlab.analyze.error_analysis.summarizer import ChangeSummarizer +from agentlab.analyze.inspect_results import yield_all_exp_results + + +@pytest.fixture(scope="module") +def exp_results() -> list[ExpResult]: + exp_dir = Path(__file__).parent.parent.parent / "data/error_analysis" + return list(yield_all_exp_results(exp_dir)) + + +@pytest.mark.pricy +def test_change_summarizer(exp_results: list[ExpResult]): + summarizer = ChangeSummarizer(llm=lambda x: {"content": x}) + step = exp_results[0].steps_info[0] + next_step = exp_results[0].steps_info[1] + past_summaries = [] + summary = summarizer.summarize(step, next_step, past_summaries) + assert isinstance(summary, dict) + + +if __name__ == "__main__": + exp_res = list( + yield_all_exp_results(Path(__file__).parent.parent.parent / "data/error_analysis") + ) + test_change_summarizer(exp_res) diff --git 
a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl new file mode 100644 index 00000000..b2856641 Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/exp_args.pkl differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz new file mode 100644 index 00000000..482f9b3d Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/goal_object.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt new file mode 100644 index 00000000..512944ab --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/package_versions.txt @@ -0,0 +1,287 @@ +Faker==30.6.0 +Farama-Notifications==0.0.4 +Flask==3.0.3 +GitPython==3.1.43 +Jinja2==3.1.4 +MarkupSafe==2.1.5 +PyYAML==6.0.2 +Pygments==2.18.0 +SQLAlchemy==2.0.36 +Send2Trash==1.8.3 +Werkzeug==3.0.4 +agentlab==0.3.2 +agentlab==0.3.2 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.3 +aiohttp-cors==0.7.0 +aiohttp==3.10.10 +aiolimiter==1.1.0 +aiosignal==1.3.1 +annotated-types==0.7.0 +anthropic==0.37.1 +anyio==4.6.2.post1 +argcomplete==3.5.1 +argon2-cffi-bindings==21.2.0 +argon2-cffi==23.1.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beartype==0.12.0 +beautifulsoup4==4.12.3 +black==24.2.0 
+blacken-docs==1.19.0 +bleach==6.1.0 +blinker==1.8.2 +browsergym-assistantbench==0.12.0 +browsergym-core==0.12.0 +browsergym-experiments==0.12.0 +browsergym-miniwob==0.12.0 +browsergym-visualwebarena==0.12.0 +browsergym-webarena==0.12.0 +browsergym-workarena==0.4.1 +browsergym==0.12.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.17.1 +cfgv==3.4.0 +charset-normalizer==3.4.0 +click==8.1.7 +cloudpickle==3.1.0 +colorama==0.4.6 +colorama==0.4.6 +colorful==0.5.6 +comm==0.2.2 +contexttimer==0.3.3 +contourpy==1.3.0 +cycler==0.12.1 +dask==2024.10.0 +dataclasses-json==0.6.7 +datasets==3.0.1 +debugpy==1.8.7 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.8 +distlib==0.3.9 +distributed==2024.10.0 +distro==1.9.0 +english-words==2.0.1 +evaluate==0.4.3 +execnet==2.1.1 +executing==2.1.0 +fastapi==0.115.2 +fastjsonschema==2.20.0 +ffmpy==0.4.0 +filelock==3.16.1 +fonttools==4.54.1 +fqdn==1.5.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +gitdb==4.0.11 +google-api-core==2.23.0 +google-auth==2.36.0 +googleapis-common-protos==1.66.0 +gradio==5.7.1 +gradio_client==1.5.0 +greenlet==3.0.0 +grpcio==1.68.0 +gymnasium==1.0.0 +h11==0.14.0 +httpcore==1.0.6 +httpx==0.27.2 +huggingface-hub==0.26.0 +identify==2.6.1 +idna==3.10 +imageio==2.36.0 +importlib_resources==6.4.5 +iniconfig==2.0.0 +ipykernel==6.29.5 +ipython==8.28.0 +isoduration==20.11.0 +itsdangerous==2.2.0 +jedi==0.19.1 +jiter==0.6.1 +joblib==1.4.2 +json5==0.9.25 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema-specifications==2024.10.1 +jsonschema==4.23.0 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +kiwisolver==1.4.7 +langchain-community==0.3.3 +langchain-core==0.3.12 +langchain-text-splitters==0.3.0 +langchain==0.3.4 +langsmith==0.1.136 +lazy_loader==0.4 +libvisualwebarena==0.0.14 +libwebarena==0.0.3 +linkify-it-py==2.0.3 +locket==1.0.0 +lxml==5.3.0 
+markdown-it-py==3.0.0 +marshmallow==3.23.0 +matplotlib-inline==0.1.7 +matplotlib==3.9.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +memray==1.14.0 +mistune==3.0.2 +mpmath==1.3.0 +msgpack==1.1.0 +multidict==6.1.0 +multiprocess==0.70.16 +mypy-extensions==1.0.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.4.1 +nltk==3.9.1 +nodeenv==1.9.1 +notebook_shim==0.2.4 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openai==1.52.0 +opencensus-context==0.1.3 +opencensus==0.11.4 +orjson==3.10.7 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +partd==1.4.2 +pathspec==0.12.1 +pexpect==4.9.0 +pillow==10.4.0 +pip==24.2 +platformdirs==4.3.6 +playwright==1.39.0 +pluggy==1.5.0 +portalocker==2.10.1 +pre_commit==4.0.1 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +propcache==0.2.0 +proto-plus==1.25.0 +protobuf==5.28.3 +psutil==6.1.0 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-spy==0.4.0 +pyarrow==17.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pycparser==2.22 +pydantic-settings==2.6.0 +pydantic==2.9.2 +pydantic_core==2.23.4 +pydub==0.25.1 +pyee==11.0.1 +pyparsing==3.2.0 +pytest-base-url==2.1.0 +pytest-playwright==0.5.2 +pytest-xdist==3.6.1 +pytest==7.3.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.12 +python-slugify==8.0.4 +pytz==2024.2 +pyzmq==26.2.0 +ray==2.39.0 +referencing==0.35.1 +regex==2024.9.11 +requests-toolbelt==1.0.0 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.2 +rpds-py==0.20.0 +rsa==4.9 +ruff==0.7.0 +sacrebleu==2.4.3 +safehttpx==0.1.6 
+safetensors==0.4.5 +scikit-image==0.24.0 +scipy==1.14.1 +semantic-version==2.10.0 +setproctitle==1.2.2 +setuptools==75.1.0 +shellingham==1.5.4 +six==1.16.0 +smart-open==7.0.5 +smmap==5.0.1 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +stack-data==0.6.3 +starlette==0.40.0 +sympy==1.13.1 +tabulate==0.9.0 +tblib==3.0.0 +tenacity==9.0.0 +terminado==0.18.1 +text-generation==0.7.0 +text-unidecode==1.3 +textual==0.86.2 +tifffile==2024.9.20 +tiktoken==0.8.0 +tinycss2==1.3.0 +tokenize-rt==6.0.0 +tokenizers==0.20.1 +tomlkit==0.12.0 +toolz==1.0.0 +torch==2.5.1 +tornado==6.4.1 +tqdm==4.66.5 +traitlets==5.14.3 +transformers==4.45.2 +triton==3.1.0 +typer==0.12.5 +types-python-dateutil==2.9.0.20241003 +types-tqdm==4.66.0.20240417 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.2 +uc-micro-py==1.0.3 +uri-template==1.3.0 +urllib3==2.2.3 +uvicorn==0.32.0 +virtualenv==20.27.0 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +weblinx-browsergym==0.0.1.dev10 +weblinx==0.3.2 +websocket-client==1.8.0 +websockets==12.0 +wheel==0.44.0 +wrapt==1.16.0 +xxhash==3.5.0 +yarl==1.15.5 +zict==3.0.0 \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz new file mode 100644 index 00000000..00267af0 Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_0.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz new file mode 100644 index 00000000..52c5209d Binary files /dev/null and 
b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_1.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz new file mode 100644 index 00000000..b00c7bcf Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_2.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz new file mode 100644 index 00000000..d7f75bd6 Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/step_3.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json new file mode 100644 index 00000000..34e6f226 --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20/summary_info.json @@ -0,0 +1,44 @@ +{ + "n_steps": 3, + "cum_reward": 1.0, + "cum_raw_reward": 0, + "err_msg": null, + "stack_trace": null, + "stats.cum_steps": 4, + "stats.cum_n_token_goal": 27, + "stats.max_n_token_goal": 9, + "stats.cum_n_token_url": 72, + "stats.max_n_token_url": 24, + "stats.cum_n_token_focused_element_bid": 3, + "stats.max_n_token_focused_element_bid": 1, + "stats.cum_n_token_last_action": 8, + "stats.max_n_token_last_action": 4, + "stats.cum_n_token_last_action_error": 0, + "stats.max_n_token_last_action_error": 0, + 
"stats.cum_n_token_dom_txt": 2892, + "stats.max_n_token_dom_txt": 966, + "stats.cum_n_token_axtree_txt": 667, + "stats.max_n_token_axtree_txt": 223, + "stats.cum_n_token_pruned_html": 1014, + "stats.max_n_token_pruned_html": 340, + "stats.cum_n_retry_llm": 3, + "stats.max_n_retry_llm": 1, + "stats.cum_n_retry": 0.0, + "stats.max_n_retry": 0.0, + "stats.cum_busted_retry": 0, + "stats.max_busted_retry": 0, + "stats.cum_input_tokens": 4339, + "stats.max_input_tokens": 1464, + "stats.cum_output_tokens": 225, + "stats.max_output_tokens": 84, + "stats.cum_cost": 0.00078585, + "stats.max_cost": 0.0002646, + "stats.cum_n_token_agent_messages": 4512, + "stats.max_n_token_agent_messages": 1517, + "stats.cum_step_elapsed": 3.0203144550323486, + "stats.max_step_elapsed": 1.3659462928771973, + "stats.cum_agent_elapsed": 3.8209800720214844, + "stats.max_agent_elapsed": 1.8219048976898193, + "terminated": true, + "truncated": false +} \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl new file mode 100644 index 00000000..6bdd8639 Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/exp_args.pkl differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz new file mode 100644 index 00000000..45522b9e Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/goal_object.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt 
b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt new file mode 100644 index 00000000..512944ab --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/package_versions.txt @@ -0,0 +1,287 @@ +Faker==30.6.0 +Farama-Notifications==0.0.4 +Flask==3.0.3 +GitPython==3.1.43 +Jinja2==3.1.4 +MarkupSafe==2.1.5 +PyYAML==6.0.2 +Pygments==2.18.0 +SQLAlchemy==2.0.36 +Send2Trash==1.8.3 +Werkzeug==3.0.4 +agentlab==0.3.2 +agentlab==0.3.2 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.3 +aiohttp-cors==0.7.0 +aiohttp==3.10.10 +aiolimiter==1.1.0 +aiosignal==1.3.1 +annotated-types==0.7.0 +anthropic==0.37.1 +anyio==4.6.2.post1 +argcomplete==3.5.1 +argon2-cffi-bindings==21.2.0 +argon2-cffi==23.1.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beartype==0.12.0 +beautifulsoup4==4.12.3 +black==24.2.0 +blacken-docs==1.19.0 +bleach==6.1.0 +blinker==1.8.2 +browsergym-assistantbench==0.12.0 +browsergym-core==0.12.0 +browsergym-experiments==0.12.0 +browsergym-miniwob==0.12.0 +browsergym-visualwebarena==0.12.0 +browsergym-webarena==0.12.0 +browsergym-workarena==0.4.1 +browsergym==0.12.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.17.1 +cfgv==3.4.0 +charset-normalizer==3.4.0 +click==8.1.7 +cloudpickle==3.1.0 +colorama==0.4.6 +colorama==0.4.6 +colorful==0.5.6 +comm==0.2.2 +contexttimer==0.3.3 +contourpy==1.3.0 +cycler==0.12.1 +dask==2024.10.0 +dataclasses-json==0.6.7 +datasets==3.0.1 +debugpy==1.8.7 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.8 +distlib==0.3.9 +distributed==2024.10.0 +distro==1.9.0 +english-words==2.0.1 +evaluate==0.4.3 +execnet==2.1.1 +executing==2.1.0 +fastapi==0.115.2 +fastjsonschema==2.20.0 +ffmpy==0.4.0 +filelock==3.16.1 +fonttools==4.54.1 +fqdn==1.5.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +gitdb==4.0.11 +google-api-core==2.23.0 +google-auth==2.36.0 +googleapis-common-protos==1.66.0 +gradio==5.7.1 
+gradio_client==1.5.0 +greenlet==3.0.0 +grpcio==1.68.0 +gymnasium==1.0.0 +h11==0.14.0 +httpcore==1.0.6 +httpx==0.27.2 +huggingface-hub==0.26.0 +identify==2.6.1 +idna==3.10 +imageio==2.36.0 +importlib_resources==6.4.5 +iniconfig==2.0.0 +ipykernel==6.29.5 +ipython==8.28.0 +isoduration==20.11.0 +itsdangerous==2.2.0 +jedi==0.19.1 +jiter==0.6.1 +joblib==1.4.2 +json5==0.9.25 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema-specifications==2024.10.1 +jsonschema==4.23.0 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +kiwisolver==1.4.7 +langchain-community==0.3.3 +langchain-core==0.3.12 +langchain-text-splitters==0.3.0 +langchain==0.3.4 +langsmith==0.1.136 +lazy_loader==0.4 +libvisualwebarena==0.0.14 +libwebarena==0.0.3 +linkify-it-py==2.0.3 +locket==1.0.0 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.23.0 +matplotlib-inline==0.1.7 +matplotlib==3.9.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +memray==1.14.0 +mistune==3.0.2 +mpmath==1.3.0 +msgpack==1.1.0 +multidict==6.1.0 +multiprocess==0.70.16 +mypy-extensions==1.0.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.4.1 +nltk==3.9.1 +nodeenv==1.9.1 +notebook_shim==0.2.4 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openai==1.52.0 +opencensus-context==0.1.3 +opencensus==0.11.4 +orjson==3.10.7 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +partd==1.4.2 +pathspec==0.12.1 +pexpect==4.9.0 +pillow==10.4.0 +pip==24.2 +platformdirs==4.3.6 
+playwright==1.39.0 +pluggy==1.5.0 +portalocker==2.10.1 +pre_commit==4.0.1 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +propcache==0.2.0 +proto-plus==1.25.0 +protobuf==5.28.3 +psutil==6.1.0 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-spy==0.4.0 +pyarrow==17.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pycparser==2.22 +pydantic-settings==2.6.0 +pydantic==2.9.2 +pydantic_core==2.23.4 +pydub==0.25.1 +pyee==11.0.1 +pyparsing==3.2.0 +pytest-base-url==2.1.0 +pytest-playwright==0.5.2 +pytest-xdist==3.6.1 +pytest==7.3.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.12 +python-slugify==8.0.4 +pytz==2024.2 +pyzmq==26.2.0 +ray==2.39.0 +referencing==0.35.1 +regex==2024.9.11 +requests-toolbelt==1.0.0 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.2 +rpds-py==0.20.0 +rsa==4.9 +ruff==0.7.0 +sacrebleu==2.4.3 +safehttpx==0.1.6 +safetensors==0.4.5 +scikit-image==0.24.0 +scipy==1.14.1 +semantic-version==2.10.0 +setproctitle==1.2.2 +setuptools==75.1.0 +shellingham==1.5.4 +six==1.16.0 +smart-open==7.0.5 +smmap==5.0.1 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +stack-data==0.6.3 +starlette==0.40.0 +sympy==1.13.1 +tabulate==0.9.0 +tblib==3.0.0 +tenacity==9.0.0 +terminado==0.18.1 +text-generation==0.7.0 +text-unidecode==1.3 +textual==0.86.2 +tifffile==2024.9.20 +tiktoken==0.8.0 +tinycss2==1.3.0 +tokenize-rt==6.0.0 +tokenizers==0.20.1 +tomlkit==0.12.0 +toolz==1.0.0 +torch==2.5.1 +tornado==6.4.1 +tqdm==4.66.5 +traitlets==5.14.3 +transformers==4.45.2 +triton==3.1.0 +typer==0.12.5 +types-python-dateutil==2.9.0.20241003 +types-tqdm==4.66.0.20240417 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.2 +uc-micro-py==1.0.3 +uri-template==1.3.0 +urllib3==2.2.3 +uvicorn==0.32.0 +virtualenv==20.27.0 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +weblinx-browsergym==0.0.1.dev10 +weblinx==0.3.2 +websocket-client==1.8.0 +websockets==12.0 +wheel==0.44.0 
+wrapt==1.16.0 +xxhash==3.5.0 +yarl==1.15.5 +zict==3.0.0 \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz new file mode 100644 index 00000000..1d3d08b1 Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_0.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz new file mode 100644 index 00000000..18c107bd Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_1.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz new file mode 100644 index 00000000..d55bd69a Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/step_2.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json new file mode 100644 index 00000000..6f351629 --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7/summary_info.json @@ -0,0 +1,44 @@ +{ + "n_steps": 2, + "cum_reward": 1.0, + "cum_raw_reward": 0, + "err_msg": null, + "stack_trace": null, + "stats.cum_steps": 3, + 
"stats.cum_n_token_goal": 12, + "stats.max_n_token_goal": 6, + "stats.cum_n_token_url": 48, + "stats.max_n_token_url": 24, + "stats.cum_n_token_focused_element_bid": 2, + "stats.max_n_token_focused_element_bid": 1, + "stats.cum_n_token_last_action": 4, + "stats.max_n_token_last_action": 4, + "stats.cum_n_token_last_action_error": 0, + "stats.max_n_token_last_action_error": 0, + "stats.cum_n_token_dom_txt": 1902, + "stats.max_n_token_dom_txt": 952, + "stats.cum_n_token_axtree_txt": 400, + "stats.max_n_token_axtree_txt": 201, + "stats.cum_n_token_pruned_html": 650, + "stats.max_n_token_pruned_html": 326, + "stats.cum_n_retry_llm": 2, + "stats.max_n_retry_llm": 1, + "stats.cum_n_retry": 0.0, + "stats.max_n_retry": 0.0, + "stats.cum_busted_retry": 0, + "stats.max_busted_retry": 0, + "stats.cum_input_tokens": 2789, + "stats.max_input_tokens": 1404, + "stats.cum_output_tokens": 128, + "stats.max_output_tokens": 65, + "stats.cum_cost": 0.00049515, + "stats.max_cost": 0.00024839999999999997, + "stats.cum_n_token_agent_messages": 2902, + "stats.max_n_token_agent_messages": 1459, + "stats.cum_step_elapsed": 6.860883951187134, + "stats.max_step_elapsed": 5.8696064949035645, + "stats.cum_agent_elapsed": 3.769465684890747, + "stats.max_agent_elapsed": 2.946484327316284, + "terminated": true, + "truncated": false +} \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl new file mode 100644 index 00000000..71da24d7 Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/exp_args.pkl differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz 
b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz new file mode 100644 index 00000000..6f8de674 Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/goal_object.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt new file mode 100644 index 00000000..512944ab --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/package_versions.txt @@ -0,0 +1,287 @@ +Faker==30.6.0 +Farama-Notifications==0.0.4 +Flask==3.0.3 +GitPython==3.1.43 +Jinja2==3.1.4 +MarkupSafe==2.1.5 +PyYAML==6.0.2 +Pygments==2.18.0 +SQLAlchemy==2.0.36 +Send2Trash==1.8.3 +Werkzeug==3.0.4 +agentlab==0.3.2 +agentlab==0.3.2 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.3 +aiohttp-cors==0.7.0 +aiohttp==3.10.10 +aiolimiter==1.1.0 +aiosignal==1.3.1 +annotated-types==0.7.0 +anthropic==0.37.1 +anyio==4.6.2.post1 +argcomplete==3.5.1 +argon2-cffi-bindings==21.2.0 +argon2-cffi==23.1.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beartype==0.12.0 +beautifulsoup4==4.12.3 +black==24.2.0 +blacken-docs==1.19.0 +bleach==6.1.0 +blinker==1.8.2 +browsergym-assistantbench==0.12.0 +browsergym-core==0.12.0 +browsergym-experiments==0.12.0 +browsergym-miniwob==0.12.0 +browsergym-visualwebarena==0.12.0 +browsergym-webarena==0.12.0 +browsergym-workarena==0.4.1 +browsergym==0.12.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.17.1 +cfgv==3.4.0 +charset-normalizer==3.4.0 +click==8.1.7 +cloudpickle==3.1.0 +colorama==0.4.6 +colorama==0.4.6 +colorful==0.5.6 +comm==0.2.2 +contexttimer==0.3.3 +contourpy==1.3.0 +cycler==0.12.1 +dask==2024.10.0 +dataclasses-json==0.6.7 +datasets==3.0.1 
+debugpy==1.8.7 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.8 +distlib==0.3.9 +distributed==2024.10.0 +distro==1.9.0 +english-words==2.0.1 +evaluate==0.4.3 +execnet==2.1.1 +executing==2.1.0 +fastapi==0.115.2 +fastjsonschema==2.20.0 +ffmpy==0.4.0 +filelock==3.16.1 +fonttools==4.54.1 +fqdn==1.5.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +gitdb==4.0.11 +google-api-core==2.23.0 +google-auth==2.36.0 +googleapis-common-protos==1.66.0 +gradio==5.7.1 +gradio_client==1.5.0 +greenlet==3.0.0 +grpcio==1.68.0 +gymnasium==1.0.0 +h11==0.14.0 +httpcore==1.0.6 +httpx==0.27.2 +huggingface-hub==0.26.0 +identify==2.6.1 +idna==3.10 +imageio==2.36.0 +importlib_resources==6.4.5 +iniconfig==2.0.0 +ipykernel==6.29.5 +ipython==8.28.0 +isoduration==20.11.0 +itsdangerous==2.2.0 +jedi==0.19.1 +jiter==0.6.1 +joblib==1.4.2 +json5==0.9.25 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema-specifications==2024.10.1 +jsonschema==4.23.0 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +kiwisolver==1.4.7 +langchain-community==0.3.3 +langchain-core==0.3.12 +langchain-text-splitters==0.3.0 +langchain==0.3.4 +langsmith==0.1.136 +lazy_loader==0.4 +libvisualwebarena==0.0.14 +libwebarena==0.0.3 +linkify-it-py==2.0.3 +locket==1.0.0 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.23.0 +matplotlib-inline==0.1.7 +matplotlib==3.9.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +memray==1.14.0 +mistune==3.0.2 +mpmath==1.3.0 +msgpack==1.1.0 +multidict==6.1.0 +multiprocess==0.70.16 +mypy-extensions==1.0.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.4.1 +nltk==3.9.1 +nodeenv==1.9.1 +notebook_shim==0.2.4 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 
+nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openai==1.52.0 +opencensus-context==0.1.3 +opencensus==0.11.4 +orjson==3.10.7 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +partd==1.4.2 +pathspec==0.12.1 +pexpect==4.9.0 +pillow==10.4.0 +pip==24.2 +platformdirs==4.3.6 +playwright==1.39.0 +pluggy==1.5.0 +portalocker==2.10.1 +pre_commit==4.0.1 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +propcache==0.2.0 +proto-plus==1.25.0 +protobuf==5.28.3 +psutil==6.1.0 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-spy==0.4.0 +pyarrow==17.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pycparser==2.22 +pydantic-settings==2.6.0 +pydantic==2.9.2 +pydantic_core==2.23.4 +pydub==0.25.1 +pyee==11.0.1 +pyparsing==3.2.0 +pytest-base-url==2.1.0 +pytest-playwright==0.5.2 +pytest-xdist==3.6.1 +pytest==7.3.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.12 +python-slugify==8.0.4 +pytz==2024.2 +pyzmq==26.2.0 +ray==2.39.0 +referencing==0.35.1 +regex==2024.9.11 +requests-toolbelt==1.0.0 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.2 +rpds-py==0.20.0 +rsa==4.9 +ruff==0.7.0 +sacrebleu==2.4.3 +safehttpx==0.1.6 +safetensors==0.4.5 +scikit-image==0.24.0 +scipy==1.14.1 +semantic-version==2.10.0 +setproctitle==1.2.2 +setuptools==75.1.0 +shellingham==1.5.4 +six==1.16.0 +smart-open==7.0.5 +smmap==5.0.1 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +stack-data==0.6.3 +starlette==0.40.0 +sympy==1.13.1 +tabulate==0.9.0 +tblib==3.0.0 +tenacity==9.0.0 +terminado==0.18.1 +text-generation==0.7.0 +text-unidecode==1.3 +textual==0.86.2 +tifffile==2024.9.20 +tiktoken==0.8.0 +tinycss2==1.3.0 +tokenize-rt==6.0.0 +tokenizers==0.20.1 +tomlkit==0.12.0 +toolz==1.0.0 +torch==2.5.1 +tornado==6.4.1 +tqdm==4.66.5 +traitlets==5.14.3 
+transformers==4.45.2 +triton==3.1.0 +typer==0.12.5 +types-python-dateutil==2.9.0.20241003 +types-tqdm==4.66.0.20240417 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.2 +uc-micro-py==1.0.3 +uri-template==1.3.0 +urllib3==2.2.3 +uvicorn==0.32.0 +virtualenv==20.27.0 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +weblinx-browsergym==0.0.1.dev10 +weblinx==0.3.2 +websocket-client==1.8.0 +websockets==12.0 +wheel==0.44.0 +wrapt==1.16.0 +xxhash==3.5.0 +yarl==1.15.5 +zict==3.0.0 \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz new file mode 100644 index 00000000..94b8701c Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_0.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz new file mode 100644 index 00000000..636120ba Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/step_1.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json new file mode 100644 index 00000000..351aa01c --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14/summary_info.json @@ -0,0 +1,44 @@ +{ + "n_steps": 1, + "cum_reward": 1.0, + "cum_raw_reward": 0, + "err_msg": null, + "stack_trace": null, + "stats.cum_steps": 2, + "stats.cum_n_token_goal": 
10, + "stats.max_n_token_goal": 10, + "stats.cum_n_token_url": 23, + "stats.max_n_token_url": 23, + "stats.cum_n_token_focused_element_bid": 1, + "stats.max_n_token_focused_element_bid": 1, + "stats.cum_n_token_last_action": 0, + "stats.max_n_token_last_action": 0, + "stats.cum_n_token_last_action_error": 0, + "stats.max_n_token_last_action_error": 0, + "stats.cum_n_token_dom_txt": 1257, + "stats.max_n_token_dom_txt": 1257, + "stats.cum_n_token_axtree_txt": 75, + "stats.max_n_token_axtree_txt": 75, + "stats.cum_n_token_pruned_html": 658, + "stats.max_n_token_pruned_html": 658, + "stats.cum_n_retry_llm": 1, + "stats.max_n_retry_llm": 1, + "stats.cum_n_retry": 0.0, + "stats.max_n_retry": 0.0, + "stats.cum_busted_retry": 0, + "stats.max_busted_retry": 0, + "stats.cum_input_tokens": 1594, + "stats.max_input_tokens": 1594, + "stats.cum_output_tokens": 64, + "stats.max_output_tokens": 64, + "stats.cum_cost": 0.00027749999999999997, + "stats.max_cost": 0.00027749999999999997, + "stats.cum_n_token_agent_messages": 1653, + "stats.max_n_token_agent_messages": 1653, + "stats.cum_step_elapsed": 5.879024505615234, + "stats.max_step_elapsed": 5.879024505615234, + "stats.cum_agent_elapsed": 3.029170036315918, + "stats.max_agent_elapsed": 3.029170036315918, + "terminated": true, + "truncated": false +} \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl new file mode 100644 index 00000000..3399e40c Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/exp_args.pkl differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz 
b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz new file mode 100644 index 00000000..ef19e47f Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/goal_object.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt new file mode 100644 index 00000000..512944ab --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/package_versions.txt @@ -0,0 +1,287 @@ +Faker==30.6.0 +Farama-Notifications==0.0.4 +Flask==3.0.3 +GitPython==3.1.43 +Jinja2==3.1.4 +MarkupSafe==2.1.5 +PyYAML==6.0.2 +Pygments==2.18.0 +SQLAlchemy==2.0.36 +Send2Trash==1.8.3 +Werkzeug==3.0.4 +agentlab==0.3.2 +agentlab==0.3.2 +aiofiles==23.2.1 +aiohappyeyeballs==2.4.3 +aiohttp-cors==0.7.0 +aiohttp==3.10.10 +aiolimiter==1.1.0 +aiosignal==1.3.1 +annotated-types==0.7.0 +anthropic==0.37.1 +anyio==4.6.2.post1 +argcomplete==3.5.1 +argon2-cffi-bindings==21.2.0 +argon2-cffi==23.1.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beartype==0.12.0 +beautifulsoup4==4.12.3 +black==24.2.0 +blacken-docs==1.19.0 +bleach==6.1.0 +blinker==1.8.2 +browsergym-assistantbench==0.12.0 +browsergym-core==0.12.0 +browsergym-experiments==0.12.0 +browsergym-miniwob==0.12.0 +browsergym-visualwebarena==0.12.0 +browsergym-webarena==0.12.0 +browsergym-workarena==0.4.1 +browsergym==0.12.0 +cachetools==5.5.0 +certifi==2024.8.30 +cffi==1.17.1 +cfgv==3.4.0 +charset-normalizer==3.4.0 +click==8.1.7 +cloudpickle==3.1.0 +colorama==0.4.6 +colorama==0.4.6 +colorful==0.5.6 +comm==0.2.2 +contexttimer==0.3.3 +contourpy==1.3.0 +cycler==0.12.1 +dask==2024.10.0 +dataclasses-json==0.6.7 +datasets==3.0.1 
+debugpy==1.8.7 +decorator==5.1.1 +defusedxml==0.7.1 +dill==0.3.8 +distlib==0.3.9 +distributed==2024.10.0 +distro==1.9.0 +english-words==2.0.1 +evaluate==0.4.3 +execnet==2.1.1 +executing==2.1.0 +fastapi==0.115.2 +fastjsonschema==2.20.0 +ffmpy==0.4.0 +filelock==3.16.1 +fonttools==4.54.1 +fqdn==1.5.1 +frozenlist==1.4.1 +fsspec==2024.6.1 +gitdb==4.0.11 +google-api-core==2.23.0 +google-auth==2.36.0 +googleapis-common-protos==1.66.0 +gradio==5.7.1 +gradio_client==1.5.0 +greenlet==3.0.0 +grpcio==1.68.0 +gymnasium==1.0.0 +h11==0.14.0 +httpcore==1.0.6 +httpx==0.27.2 +huggingface-hub==0.26.0 +identify==2.6.1 +idna==3.10 +imageio==2.36.0 +importlib_resources==6.4.5 +iniconfig==2.0.0 +ipykernel==6.29.5 +ipython==8.28.0 +isoduration==20.11.0 +itsdangerous==2.2.0 +jedi==0.19.1 +jiter==0.6.1 +joblib==1.4.2 +json5==0.9.25 +jsonpatch==1.33 +jsonpointer==3.0.0 +jsonschema-specifications==2024.10.1 +jsonschema==4.23.0 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +kiwisolver==1.4.7 +langchain-community==0.3.3 +langchain-core==0.3.12 +langchain-text-splitters==0.3.0 +langchain==0.3.4 +langsmith==0.1.136 +lazy_loader==0.4 +libvisualwebarena==0.0.14 +libwebarena==0.0.3 +linkify-it-py==2.0.3 +locket==1.0.0 +lxml==5.3.0 +markdown-it-py==3.0.0 +marshmallow==3.23.0 +matplotlib-inline==0.1.7 +matplotlib==3.9.2 +mdit-py-plugins==0.4.2 +mdurl==0.1.2 +memray==1.14.0 +mistune==3.0.2 +mpmath==1.3.0 +msgpack==1.1.0 +multidict==6.1.0 +multiprocess==0.70.16 +mypy-extensions==1.0.0 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +networkx==3.4.1 +nltk==3.9.1 +nodeenv==1.9.1 +notebook_shim==0.2.4 +numpy==1.26.4 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 
+nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openai==1.52.0 +opencensus-context==0.1.3 +opencensus==0.11.4 +orjson==3.10.7 +overrides==7.7.0 +packaging==24.1 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +partd==1.4.2 +pathspec==0.12.1 +pexpect==4.9.0 +pillow==10.4.0 +pip==24.2 +platformdirs==4.3.6 +playwright==1.39.0 +pluggy==1.5.0 +portalocker==2.10.1 +pre_commit==4.0.1 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +propcache==0.2.0 +proto-plus==1.25.0 +protobuf==5.28.3 +psutil==6.1.0 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +py-spy==0.4.0 +pyarrow==17.0.0 +pyasn1==0.6.1 +pyasn1_modules==0.4.1 +pycparser==2.22 +pydantic-settings==2.6.0 +pydantic==2.9.2 +pydantic_core==2.23.4 +pydub==0.25.1 +pyee==11.0.1 +pyparsing==3.2.0 +pytest-base-url==2.1.0 +pytest-playwright==0.5.2 +pytest-xdist==3.6.1 +pytest==7.3.2 +python-dateutil==2.9.0.post0 +python-dotenv==1.0.1 +python-json-logger==2.0.7 +python-multipart==0.0.12 +python-slugify==8.0.4 +pytz==2024.2 +pyzmq==26.2.0 +ray==2.39.0 +referencing==0.35.1 +regex==2024.9.11 +requests-toolbelt==1.0.0 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.2 +rpds-py==0.20.0 +rsa==4.9 +ruff==0.7.0 +sacrebleu==2.4.3 +safehttpx==0.1.6 +safetensors==0.4.5 +scikit-image==0.24.0 +scipy==1.14.1 +semantic-version==2.10.0 +setproctitle==1.2.2 +setuptools==75.1.0 +shellingham==1.5.4 +six==1.16.0 +smart-open==7.0.5 +smmap==5.0.1 +sniffio==1.3.1 +sortedcontainers==2.4.0 +soupsieve==2.6 +stack-data==0.6.3 +starlette==0.40.0 +sympy==1.13.1 +tabulate==0.9.0 +tblib==3.0.0 +tenacity==9.0.0 +terminado==0.18.1 +text-generation==0.7.0 +text-unidecode==1.3 +textual==0.86.2 +tifffile==2024.9.20 +tiktoken==0.8.0 +tinycss2==1.3.0 +tokenize-rt==6.0.0 +tokenizers==0.20.1 +tomlkit==0.12.0 +toolz==1.0.0 +torch==2.5.1 +tornado==6.4.1 +tqdm==4.66.5 +traitlets==5.14.3 
+transformers==4.45.2 +triton==3.1.0 +typer==0.12.5 +types-python-dateutil==2.9.0.20241003 +types-tqdm==4.66.0.20240417 +typing-inspect==0.9.0 +typing_extensions==4.12.2 +tzdata==2024.2 +uc-micro-py==1.0.3 +uri-template==1.3.0 +urllib3==2.2.3 +uvicorn==0.32.0 +virtualenv==20.27.0 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +weblinx-browsergym==0.0.1.dev10 +weblinx==0.3.2 +websocket-client==1.8.0 +websockets==12.0 +wheel==0.44.0 +wrapt==1.16.0 +xxhash==3.5.0 +yarl==1.15.5 +zict==3.0.0 \ No newline at end of file diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz new file mode 100644 index 00000000..2aac84fd Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_0.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz new file mode 100644 index 00000000..a426bd07 Binary files /dev/null and b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/step_1.pkl.gz differ diff --git a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv new file mode 100644 index 00000000..85b34311 --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_df.csv @@ -0,0 +1,2 @@ +avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost +1.0,0.0,1.0,1/1,0,0.0003 diff --git 
a/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json new file mode 100644 index 00000000..a17872af --- /dev/null +++ b/tests/data/error_analysis/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28/summary_info.json @@ -0,0 +1,44 @@ +{ + "n_steps": 1, + "cum_reward": 1.0, + "cum_raw_reward": 0, + "err_msg": null, + "stack_trace": null, + "stats.cum_steps": 2, + "stats.cum_n_token_goal": 10, + "stats.max_n_token_goal": 10, + "stats.cum_n_token_url": 23, + "stats.max_n_token_url": 23, + "stats.cum_n_token_focused_element_bid": 1, + "stats.max_n_token_focused_element_bid": 1, + "stats.cum_n_token_last_action": 0, + "stats.max_n_token_last_action": 0, + "stats.cum_n_token_last_action_error": 0, + "stats.max_n_token_last_action_error": 0, + "stats.cum_n_token_dom_txt": 1250, + "stats.max_n_token_dom_txt": 1250, + "stats.cum_n_token_axtree_txt": 71, + "stats.max_n_token_axtree_txt": 71, + "stats.cum_n_token_pruned_html": 651, + "stats.max_n_token_pruned_html": 651, + "stats.cum_n_retry_llm": 1, + "stats.max_n_retry_llm": 1, + "stats.cum_n_retry": 0.0, + "stats.max_n_retry": 0.0, + "stats.cum_busted_retry": 0, + "stats.max_busted_retry": 0, + "stats.cum_input_tokens": 1589, + "stats.max_input_tokens": 1589, + "stats.cum_output_tokens": 63, + "stats.max_output_tokens": 63, + "stats.cum_cost": 0.00027614999999999996, + "stats.max_cost": 0.00027614999999999996, + "stats.cum_n_token_agent_messages": 1641, + "stats.max_n_token_agent_messages": 1641, + "stats.cum_step_elapsed": 5.891982078552246, + "stats.max_step_elapsed": 5.891982078552246, + "stats.cum_agent_elapsed": 3.4504799842834473, + "stats.max_agent_elapsed": 3.4504799842834473, + "terminated": true, + "truncated": false +} \ No newline at end of file diff --git a/tests/data/error_analysis/error_report_trial_1_of_3.md 
b/tests/data/error_analysis/error_report_trial_1_of_3.md new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/error_analysis/result_df_trial_1_of_3.csv b/tests/data/error_analysis/result_df_trial_1_of_3.csv new file mode 100644 index 00000000..4095252c --- /dev/null +++ b/tests/data/error_analysis/result_df_trial_1_of_3.csv @@ -0,0 +1,5 @@ +env.task_name,agent.agent_name,env.benchmark,index,exp_dir,agent.chat_model.model_name,agent.chat_model.max_total_tokens,agent.chat_model.max_input_tokens,agent.chat_model.max_new_tokens,agent.chat_model.temperature,agent.chat_model.vision_support,agent.chat_model.deployment_name,agent.flags.obs.use_html,agent.flags.obs.use_ax_tree,agent.flags.obs.use_tabs,agent.flags.obs.use_focused_element,agent.flags.obs.use_error_logs,agent.flags.obs.use_history,agent.flags.obs.use_past_error_logs,agent.flags.obs.use_action_history,agent.flags.obs.use_think_history,agent.flags.obs.use_diff,agent.flags.obs.html_type,agent.flags.obs.use_screenshot,agent.flags.obs.use_som,agent.flags.obs.extract_visible_tag,agent.flags.obs.extract_clickable_tag,agent.flags.obs.extract_coords,agent.flags.obs.filter_visible_elements_only,agent.flags.obs.openai_vision_detail,agent.flags.obs.filter_with_bid_only,agent.flags.obs.filter_som_only,agent.flags.action.action_set.subsets,agent.flags.action.action_set.multiaction,agent.flags.action.action_set.strict,agent.flags.action.action_set.retry_with_force,agent.flags.action.action_set.demo_mode,agent.flags.action.long_description,agent.flags.action.individual_examples,agent.flags.action.multi_actions,agent.flags.action.is_strict,agent.flags.use_plan,agent.flags.use_criticise,agent.flags.use_thinking,agent.flags.use_memory,agent.flags.use_concrete_example,agent.flags.use_abstract_example,agent.flags.use_hints,agent.flags.enable_chat,agent.flags.max_prompt_tokens,agent.flags.be_cautious,agent.flags.extra_instructions,agent.flags.add_missparsed_messages,agent.flags.max_trunc_itr,agent.flags.flag_group,a
gent.max_retry,env.task_seed,env.max_steps,env.headless,env.record_video,env.wait_for_user_message,env.viewport,env.slow_mo,env.storage_state,env.task_kwargs,exp_name,enable_debug,err_msg,stack_trace,order,logging_level,logging_level_stdout,exp_id,depends_on,save_screenshot,save_som,n_steps,cum_reward,cum_raw_reward,stats.cum_steps,stats.cum_n_token_goal,stats.max_n_token_goal,stats.cum_n_token_url,stats.max_n_token_url,stats.cum_n_token_focused_element_bid,stats.max_n_token_focused_element_bid,stats.cum_n_token_last_action,stats.max_n_token_last_action,stats.cum_n_token_last_action_error,stats.max_n_token_last_action_error,stats.cum_n_token_dom_txt,stats.max_n_token_dom_txt,stats.cum_n_token_axtree_txt,stats.max_n_token_axtree_txt,stats.cum_n_token_pruned_html,stats.max_n_token_pruned_html,stats.cum_n_retry_llm,stats.max_n_retry_llm,stats.cum_n_retry,stats.max_n_retry,stats.cum_busted_retry,stats.max_busted_retry,stats.cum_input_tokens,stats.max_input_tokens,stats.cum_output_tokens,stats.max_output_tokens,stats.cum_cost,stats.max_cost,stats.cum_n_token_agent_messages,stats.max_n_token_agent_messages,stats.cum_step_elapsed,stats.max_step_elapsed,stats.cum_agent_elapsed,stats.max_agent_elapsed,terminated,truncated,err_key 
+miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,1,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,7,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_7,True,,,2,10,30,dd9e91e0-75ef-4bb4-9db1-f91f06848dcb,(),True,False,2,1.0,0,3,12,6,48,24,2,1,4,4,0,0,1902,952,400,201,650,326,2,1,0.0,0.0,0,0,2789,1404,128,65,0.00049515,0.00024839999999999997,2902,1459,6.860883951187134,5.8696064949035645,3.769465684890747,2.946484327316284,True,False, +miniwob.click-checkboxes,GenericAgent-gpt-4o-mini,miniwob,2,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,20,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-checkboxes_20,True,,,3,10,30,187f0f01-a240-419c-a65e-0058a14f639d,(),True,False,3,1.0,0,4,27,9,72,24,3,1,8,4,0,0,2892,966,667,223,1014,340,3,1,0.0,0.0,0,0,4339,1464,225,84,0.00078585,0.0002646,4512,1517,3.0203144550323486,1.3659462928771973,3.8209800720214844,1.8219048976898193,True,False, 
+miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,0,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,28,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_28,True,,,0,10,30,b403cfca-4647-48fb-98f2-57e94306a38a,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1250,1250,71,71,651,651,1,1,0.0,0.0,0,0,1589,1589,63,63,0.00027614999999999996,0.00027614999999999996,1641,1641,5.891982078552246,5.891982078552246,3.4504799842834473,3.4504799842834473,True,False, +miniwob.click-dialog,GenericAgent-gpt-4o-mini,miniwob,3,/home/t/agentlab_results/2025-01-22_11-03-29_genericagent-gpt-4o-mini-on-miniwob-tiny-test/2025-01-22_11-03-29_GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,gpt-4o-mini,128000,128000,16384,0.1,True,gpt-4o-mini-2024-07-18,True,True,False,True,True,True,False,True,False,False,pruned_html,False,False,True,True,False,False,auto,False,False,"('miniwob_all',)",False,False,True,off,False,False,,,False,False,True,False,True,True,True,False,40000,True,,True,20,,4,14,5,True,False,False,,,,,GenericAgent-gpt-4o-mini_on_miniwob.click-dialog_14,True,,,1,10,30,4c89cb70-0bf8-42c2-be39-a9c1a39ffe8d,(),True,False,1,1.0,0,2,10,10,23,23,1,1,0,0,0,0,1257,1257,75,75,658,658,1,1,0.0,0.0,0,0,1594,1594,64,64,0.00027749999999999997,0.00027749999999999997,1653,1653,5.879024505615234,5.879024505615234,3.029170036315918,3.029170036315918,True,False, diff --git a/tests/data/error_analysis/study.pkl.gz b/tests/data/error_analysis/study.pkl.gz new file mode 100644 index 00000000..8611c7d3 Binary files /dev/null and 
b/tests/data/error_analysis/study.pkl.gz differ diff --git a/tests/data/error_analysis/summary_df_trial_1_of_3.csv b/tests/data/error_analysis/summary_df_trial_1_of_3.csv new file mode 100644 index 00000000..545cfc29 --- /dev/null +++ b/tests/data/error_analysis/summary_df_trial_1_of_3.csv @@ -0,0 +1,2 @@ +agent.agent_name,env.benchmark,avg_reward,std_err,avg_steps,n_completed,n_err,cum_cost +GenericAgent-gpt-4o-mini,miniwob,1.0,0.0,1.75,4/4,0,0.0018