Commit 67b8fa1

Merge pull request #42: AFlow optimization for the MBPP evaluator
2 parents f829784 + b5d9654, commit 67b8fa1

22 files changed: 321 additions & 239 deletions

docs/aflow_optimize.md

Lines changed: 9 additions & 9 deletions
@@ -50,15 +50,15 @@ This command will:

 The following arguments in `main.py` control the optimization process.

-| Argument                | Type   | Default                                        | Description                                                    |
-|-------------------------|--------|------------------------------------------------|----------------------------------------------------------------|
-| `--run-optimizer`       | str    | `None`                                         | Specifies the optimizer to run. Use `aflow`.                   |
-| `--benchmark`           | str    | `math`                                         | Benchmark to optimize for.                                     |
-| `--graph_path`          | str    | `mas_arena/configs/aflow`                      | Path to the base AFlow graph configuration.                    |
-| `--optimized_path`      | str    | `example/aflow/humaneval/optimization`         | Path to save the optimized AFlow graph and intermediate files. |
-| `--validation_rounds`   | int    | 1                                              | Number of validation rounds per optimization cycle.            |
-| `--eval_rounds`         | int    | 1                                              | Number of evaluation rounds per optimization cycle.            |
-| `--max_rounds`          | int    | 3                                              | Maximum number of optimization rounds.                         |
+| Argument                | Type   | Default                                 | Description                                                    |
+|-------------------------|--------|-----------------------------------------|----------------------------------------------------------------|
+| `--run-optimizer`       | str    | `None`                                  | Specifies the optimizer to run. Use `aflow`.                   |
+| `--benchmark`           | str    | `humaneval`                             | Benchmark to optimize for.                                     |
+| `--graph_path`          | str    | `mas_arena/configs/aflow`               | Path to the base AFlow graph configuration.                    |
+| `--optimized_path`      | str    | `example/aflow/humaneval/optimization`  | Path to save the optimized AFlow graph and intermediate files. |
+| `--validation_rounds`   | int    | 1                                       | Number of validation rounds per optimization cycle.            |
+| `--eval_rounds`         | int    | 1                                       | Number of evaluation rounds per optimization cycle.            |
+| `--max_rounds`          | int    | 3                                       | Maximum number of optimization rounds.                         |

 ---

docs/quick_start/usage.md

Lines changed: 11 additions & 0 deletions
@@ -55,7 +55,16 @@ To run AFlow to optimize an agent for the `humaneval` benchmark, provide `aflow`
 ./run_benchmark.sh humaneval single_agent 10 "" "" aflow
 ```

+You can also specify the training and test set sizes for the optimizer. Note that when using the `aflow` optimizer, the number of problems is determined by `train_size` and `test_size`, and the `limit` argument is ignored for data selection.
+
+**Example with custom training and test sizes:**
+
+```bash
+# Run AFlow with a training set of 30 and a test set of 15.
+# The "" arguments are placeholders for mcp_config and concurrency.
+# The limit argument (10) is ignored in this case.
+./run_benchmark.sh humaneval single_agent 10 "" "" aflow 30 15
+```

 ## Command-Line Arguments

@@ -90,6 +99,8 @@ These arguments are used when running an optimizer like AFlow via `--run-optimizer`:
 | `--validation_rounds`| int | 1  | Number of validation rounds per optimization cycle. |
 | `--eval_rounds`      | int | 1  | Number of evaluation rounds per optimization cycle. |
 | `--max_rounds`       | int | 3  | Maximum number of optimization rounds.              |
+| `--train_size`       | int | 40 | Size of the training set for evaluation.            |
+| `--test_size`        | int | 20 | Size of the test set for evaluation.                |

 ## Example Output

example/aflow/run_aflow_optimize.py

Lines changed: 21 additions & 6 deletions
@@ -1,5 +1,7 @@
 import argparse
 import os
+import time
+
 from dotenv import load_dotenv
 from typing import Dict, Any

@@ -70,15 +72,19 @@ def run_aflow_optimization(args: argparse.Namespace) -> str:
         validation_rounds=args.validation_rounds,
         eval_rounds=args.eval_rounds,
         max_rounds=args.max_rounds,
+        train_size=args.train_size,
+        test_size=args.test_size,
         **EXPERIMENTAL_CONFIG.get(args.benchmark, {}),
     )

     optimizer.setup()
     optimizer.optimize(evaluator)
-
-    final_graph_path = os.path.join(args.optimized_path, "final_graph.json")
-    print(f"\n[AFlow] Optimization complete. Optimized graph saved to: {final_graph_path}")
-    return final_graph_path
+    optimizer.test(evaluator)
+
+    best_round = optimizer.find_best_performing_round()
+    best_graph_path = f"{args.optimized_path}/round_{best_round}/graph.py"
+    print(f"\n[AFlow] Optimization complete. Best workflow graph: {best_graph_path}")
+    return best_graph_path


 def main():

@@ -88,7 +94,7 @@ def main():
         "--benchmark",
         type=str,
         default="humaneval",
-        choices=list(["humaneval"]),
+        choices=list(["humaneval", "mbpp"]),
         help="Benchmark to run.",
     )
     parser.add_argument(

@@ -100,14 +106,23 @@ def main():
     parser.add_argument(
         "--optimized_path",
         type=str,
-        default="example/aflow/humaneval/optimization",
+        default=None,
         help="Path to save the optimized agent flow graph.",
     )
     parser.add_argument("--validation_rounds", type=int, default=1, help="Number of validation rounds.")
     parser.add_argument("--eval_rounds", type=int, default=1, help="Number of evaluation rounds.")
     parser.add_argument("--max_rounds", type=int, default=3, help="Maximum number of optimization rounds.")
+    parser.add_argument("--train_size", type=int, default=40, help="Size of the training set for evaluation.")
+    parser.add_argument("--test_size", type=int, default=20, help="Size of the test set for evaluation.")
     args = parser.parse_args()

+    if not args.optimized_path:
+        args.optimized_path = f"example/aflow/{args.benchmark}/optimization"
+
+    if os.path.exists(args.optimized_path):
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        args.optimized_path = f"{args.optimized_path}_{timestamp}"
+
     optimized_graph_path = run_aflow_optimization(args)

     print("\n" + "=" * 80)
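The new return path relies on `optimizer.find_best_performing_round()`, which this commit does not show. A minimal sketch of what such a method might do, assuming each round is saved under `round_<n>/` with a `results.json` holding a top-level `score` field (file and key names are assumptions, not part of this diff):

```python
import json
import os

def find_best_performing_round(optimized_path: str) -> int:
    """Return the round number with the highest recorded score.

    Sketch only: assumes round_<n>/results.json files with a "score"
    field; adapt the layout and key names to the real optimizer output.
    """
    best_round, best_score = 1, float("-inf")
    for name in os.listdir(optimized_path):
        if not name.startswith("round_"):
            continue
        results_file = os.path.join(optimized_path, name, "results.json")
        if not os.path.exists(results_file):
            continue
        with open(results_file) as f:
            score = json.load(f).get("score", float("-inf"))
        if score > best_score:
            best_round, best_score = int(name.split("_", 1)[1]), score
    return best_round
```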

example/aflow/run_aflow_optimize.sh

Lines changed: 7 additions & 1 deletion
@@ -10,6 +10,8 @@ OPTIMIZED_PATH=${3:-"example/aflow/humaneval/optimization"}
 VALIDATION_ROUNDS=${4:-1}
 EVAL_ROUNDS=${5:-1}
 MAX_ROUNDS=${6:-3}
+TRAIN_SIZE=${7:-40}
+TEST_SIZE=${8:-20}

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

@@ -41,6 +43,8 @@ echo "Optimized Path: $OPTIMIZED_PATH"
 echo "Validation Rounds: $VALIDATION_ROUNDS"
 echo "Evaluation Rounds: $EVAL_ROUNDS"
 echo "Max Optimization Rounds: $MAX_ROUNDS"
+echo "Size of the training set for evaluation: $TRAIN_SIZE"
+echo "Size of the test set for evaluation: $TEST_SIZE"
 echo "====================================================="
 echo ""

@@ -50,6 +54,8 @@ python example/aflow/run_aflow_optimize.py \
     --optimized_path "$OPTIMIZED_PATH" \
     --validation_rounds "$VALIDATION_ROUNDS" \
     --eval_rounds "$EVAL_ROUNDS" \
-    --max_rounds "$MAX_ROUNDS"
+    --max_rounds "$MAX_ROUNDS" \
+    --train_size "$TRAIN_SIZE" \
+    --test_size "$TEST_SIZE"

 exit $?

main.py

Lines changed: 15 additions & 2 deletions
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 import argparse
 import datetime
+import os
 import sys
+import time
 from pathlib import Path
 import asyncio

@@ -93,18 +95,27 @@ def main():
     optimizer_group.add_argument(
         "--optimized_path",
         type=str,
-        default="example/aflow/humaneval/optimization",
+        default=None,
         help="Path to save the optimized agent flow graph.",
     )
     optimizer_group.add_argument("--validation_rounds", type=int, default=1, help="Number of validation rounds.")
     optimizer_group.add_argument("--eval_rounds", type=int, default=1, help="Number of evaluation rounds.")
     optimizer_group.add_argument("--max_rounds", type=int, default=3, help="Maximum number of optimization rounds.")
+    optimizer_group.add_argument("--train_size", type=int, default=40, help="Size of the training set for evaluation.")
+    optimizer_group.add_argument("--test_size", type=int, default=20, help="Size of the test set for evaluation.")

     # Parse arguments
     args = parser.parse_args()

     if args.run_optimizer:
         if args.run_optimizer == "aflow":
+            if not args.optimized_path:
+                args.optimized_path = f"example/aflow/{args.benchmark}/optimization"
+
+            if os.path.exists(args.optimized_path):
+                timestamp = time.strftime("%Y%m%d_%H%M%S")
+                args.optimized_path = f"{args.optimized_path}_{timestamp}"
+
             from example.aflow.run_aflow_optimize import run_aflow_optimization
             print("\n" + "=" * 80)
             print(f"Running AFlow Optimizer ({datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')})")

@@ -193,6 +204,7 @@ def main():
             agent_system=args.agent_system,
             agent_config=agent_config if agent_config else None,
             verbose=args.verbose,
+            data_id=args.data_id,
             concurrency=args.concurrency,
         ))
     else:

@@ -202,7 +214,8 @@ def main():
             limit=args.limit,
             agent_system=args.agent_system,
             agent_config=agent_config if agent_config else None,
-            verbose=args.verbose
+            verbose=args.verbose,
+            data_id=args.data_id,
         )
     logger.info(f"Benchmark summary: {summary}")
     return 0

mas_arena/agent_flow/workflow_evaluator.py

Lines changed: 9 additions & 6 deletions
@@ -6,17 +6,18 @@ class EvaluationUtils:
     def __init__(self, root_path: str):
         self.root_path = root_path

-    async def evaluate_graph_async(self, optimizer, validation_n, data, initial=False):
+    async def evaluate_graph_async(self, optimizer, validation_n, data, initial=False, train_size=40, test_size=20):

         workflow_runner = WorkflowRunner(agent=optimizer.executor_agent)
         sum_score = 0

         for _ in range(validation_n):

-
             score, avg_cost, total_cost, all_failed = await workflow_runner.graph_evaluate_async(optimizer.evaluator,
-                                                                                                 optimizer.graph,
-                                                                                                 is_test=False)
+                                                                                                 optimizer.graph,
+                                                                                                 is_test=False,
+                                                                                                 train_size=train_size,
+                                                                                                 test_size=test_size)
             cur_round = optimizer.round + 1 if initial is False else optimizer.round
             new_data = optimizer.data_utils.create_result_data(cur_round, score, avg_cost, total_cost)
             data.append(new_data)

@@ -37,6 +38,8 @@ async def evaluate_graph_test_async(self, optimizer):
         evaluator = WorkflowRunner(agent=optimizer.executor_agent)

         score, avg_cost, total_cost, all_failed = await evaluator.graph_evaluate_async(optimizer.evaluator,
-                                                                                       optimizer.graph,
-                                                                                       is_test=True)
+                                                                                       optimizer.graph,
+                                                                                       is_test=True,
+                                                                                       train_size=optimizer.train_size,
+                                                                                       test_size=optimizer.test_size)
         return score, avg_cost, total_cost
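For context, a plausible call site inside the optimizer's validation loop, assuming the optimizer stores `train_size` and `test_size` as attributes (the test path above reads `optimizer.train_size`/`optimizer.test_size`, so this is consistent with the diff but not shown in it):

```python
# Sketch of how an optimizer might thread the new sizes through
# EvaluationUtils; names other than the keyword arguments are assumed.
avg_score = await self.evaluation_utils.evaluate_graph_async(
    self,                      # the optimizer itself
    validation_n=self.validation_rounds,
    data=results,              # accumulates per-round result records
    initial=False,
    train_size=self.train_size,
    test_size=self.test_size,
)
```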

mas_arena/agent_flow/workflow_runner.py

Lines changed: 2 additions & 11 deletions
@@ -21,19 +21,10 @@ def _configure_graph(self, graph, evaluator):
         return graph(name=evaluator.name, agent_name="single_agent", evaluator=evaluator)

     async def graph_evaluate_async(self, evaluator: BaseEvaluator, graph: Callable, is_test: bool = False,
-                                   max_concurrent_tasks: int = 20) -> Tuple[float, float, float]:
+                                   max_concurrent_tasks: int = 20, train_size: int = 40, test_size: int = 20) -> Tuple[float, float, float]:
         configured_graph = self._configure_graph(graph=graph, evaluator=evaluator)

-        # get data for evaluation
-        from mas_arena.evaluators import BENCHMARKS
-        benchmark_config = BENCHMARKS[evaluator.name]
-        data_path = benchmark_config.get("data_path", f"data/{evaluator.name}_test.jsonl")
-        data = []
-        try:
-            with open(data_path, "r") as f:
-                data = [json.loads(line) for line in f]
-        except FileNotFoundError:
-            raise FileNotFoundError(f"Data file not found: {data_path}")
+        data = evaluator.get_test_data(sample_size=test_size) if is_test else evaluator.get_dev_data(sample_size=train_size)
         if not data or len(data) == 0:
             print("No data to evaluate. Returning zeros.")
             return (0.0, 0.0, 0.0, True)
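The file-loading block is replaced by two evaluator hooks that this commit does not define here. A minimal sketch of the contract `get_dev_data`/`get_test_data` would need to satisfy, assuming a JSONL benchmark file and a simple deterministic dev/test split (both assumptions; the real evaluators may sample differently):

```python
import json
import random

class EvaluatorDataMixin:
    """Sketch of the data accessors used by graph_evaluate_async.

    Assumes self.data_path points at a JSONL benchmark file.
    """

    def _load_problems(self):
        with open(self.data_path) as f:
            return [json.loads(line) for line in f]

    def get_dev_data(self, sample_size: int = 40):
        problems = self._load_problems()
        dev = problems[: len(problems) // 2]   # first half as dev split
        return random.sample(dev, min(sample_size, len(dev)))

    def get_test_data(self, sample_size: int = 20):
        problems = self._load_problems()
        test = problems[len(problems) // 2 :]  # second half as test split
        return random.sample(test, min(sample_size, len(test)))
```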

mas_arena/agents/base.py

Lines changed: 3 additions & 2 deletions
@@ -5,6 +5,7 @@
 """

 import abc
+import logging
 from typing import Dict, Any, Optional, Type, Callable
 import uuid
 import os

@@ -17,7 +18,7 @@
 import aiofiles

 from mas_arena.utils.llm_parser import LLMOutputParser
-
+logger = logging.getLogger(__name__)

 class AgentSystem(abc.ABC):
     """Base class for all agent systems in the benchmark framework

@@ -45,7 +46,7 @@ def __init__(self, name: str = None, config: Dict[str, Any] = None):
         self.config = config or {}
         self.evaluator_name = self.config.get("evaluator", None)
         if self.evaluator_name is None:
-            print("Evaluator name is not set in the configuration.")
+            logger.info("Evaluator name is not set in the configuration. Defaulting to None.")

         self.metrics_registry = None
         self.evaluator = None
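One practical note: unlike `print`, the `logger.info(...)` call is invisible unless the application configures logging. A minimal, assumed setup (the repo may already configure logging elsewhere):

```python
import logging

# Enable INFO-level output so messages from mas_arena.agents.base appear.
logging.basicConfig(level=logging.INFO)
```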

mas_arena/agents/single_agent.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]:

         if "parser" in kwargs or "parse_mode" in kwargs:
             parser = kwargs.get("parser", None)
-            parse_mode = kwargs.get("parser_mode", "str")
+            parse_mode = kwargs.get("parse_mode", "str")
             response_format = self.parse_generated_text(response_content, parser=parser, parse_mode=parse_mode)
             return {
                 "messages": [ai_message],

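This one-character fix matters because `dict.get` with a misspelled key fails silently rather than raising. A self-contained illustration of the pre-fix behavior:

```python
def get_parse_mode(**kwargs):
    # Pre-fix lookup: callers pass parse_mode, but the code asked for
    # "parser_mode", so the default "str" was always used.
    return kwargs.get("parser_mode", "str")

assert get_parse_mode(parse_mode="json") == "str"  # caller's value is silently ignored
```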
mas_arena/benchmark_runner.py

Lines changed: 16 additions & 6 deletions
@@ -86,7 +86,7 @@ def _setup_metrics(self):
         registry = MetricsRegistry()
         return registry

-    def _prepare_benchmark(self, benchmark_name, data_path, limit, agent_system, agent_config, verbose):
+    def _prepare_benchmark(self, benchmark_name, data_path, limit, agent_system, agent_config, verbose, data_id=None):
         """
         Run a benchmark with the specified configuration.

@@ -131,6 +131,15 @@ def _prepare_benchmark(self, benchmark_name, data_path, limit, agent_system, agent_config, verbose, data_id=None):
         except FileNotFoundError:
             raise FileNotFoundError(f"Data file not found: {data_path}")

+        if data_id:
+            primary_id = benchmark_config.get("normalization_keys", {}).get("id", None)
+            if primary_id is not None:
+                for problem in problems:
+                    if str(problem[primary_id]) == data_id:
+                        problems = [problem]
+                        break
+
         if limit and limit < len(problems):
             problems = random.sample(problems, limit)

@@ -364,16 +373,16 @@ def _run_failure_attribution(self, all_results, agent_system, verbose):
             print(f"  --output_dir {failure_output_dir}")
             # print("-" * 80)
             rprint("\n[bold]Alternative analysis methods:[/bold]")
-            print(f"# For comprehensive analysis:")
+            print(f"\n# For comprehensive analysis:")
             print(f"python {failure_inference_script} --method all_at_once --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")
-            print(f"# For efficient error localization in long conversations:")
+            print(f"\n# For efficient error localization in long conversations:")
             print(f"python {failure_inference_script} --method binary_search --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")
             print(f"\n# For detailed incremental analysis:")
             print(f"python {failure_inference_script} --method step_by_step --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")

             print("=" * 80)

-    def run(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True):
+    def run(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, data_id=None):
         """
         Run a benchmark sequentially. This is a wrapper around arun.
         """

@@ -384,12 +393,13 @@ def run(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, data_id=None):
             agent_system=agent_system,
             agent_config=agent_config,
             verbose=verbose,
+            data_id=data_id,
             concurrency=1  # Run sequentially
         ))

-    async def arun(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, concurrency=10):
+    async def arun(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, data_id=None, concurrency=10):
         agent, problems, benchmark_config, output_file = self._prepare_benchmark(
-            benchmark_name, data_path, limit, agent_system, agent_config, verbose
+            benchmark_name, data_path, limit, agent_system, agent_config, verbose, data_id
        )

         if verbose: