Commit 67b8fa1

Merge pull request #42: AFlow optimization for the MBPP evaluator
2 parents f829784 + b5d9654, commit 67b8fa1

22 files changed: 321 additions & 239 deletions

docs/aflow_optimize.md

Lines changed: 9 additions & 9 deletions
@@ -50,15 +50,15 @@ This command will:

 The following arguments in `main.py` control the optimization process.

-| Argument                | Type   | Default                                        | Description                                                    |
-|-------------------------|--------|------------------------------------------------|----------------------------------------------------------------|
-| `--run-optimizer`       | str    | `None`                                         | Specifies the optimizer to run. Use `aflow`.                   |
-| `--benchmark`           | str    | `math`                                         | Benchmark to optimize for.                                     |
-| `--graph_path`          | str    | `mas_arena/configs/aflow`                      | Path to the base AFlow graph configuration.                    |
-| `--optimized_path`      | str    | `example/aflow/humaneval/optimization`         | Path to save the optimized AFlow graph and intermediate files. |
-| `--validation_rounds`   | int    | 1                                              | Number of validation rounds per optimization cycle.            |
-| `--eval_rounds`         | int    | 1                                              | Number of evaluation rounds per optimization cycle.            |
-| `--max_rounds`          | int    | 3                                              | Maximum number of optimization rounds.                         |
+| Argument                | Type   | Default                                 | Description                                                    |
+|-------------------------|--------|-----------------------------------------|----------------------------------------------------------------|
+| `--run-optimizer`       | str    | `None`                                  | Specifies the optimizer to run. Use `aflow`.                   |
+| `--benchmark`           | str    | `humaneval`                             | Benchmark to optimize for.                                     |
+| `--graph_path`          | str    | `mas_arena/configs/aflow`               | Path to the base AFlow graph configuration.                    |
+| `--optimized_path`      | str    | `example/aflow/humaneval/optimization`  | Path to save the optimized AFlow graph and intermediate files. |
+| `--validation_rounds`   | int    | 1                                       | Number of validation rounds per optimization cycle.            |
+| `--eval_rounds`         | int    | 1                                       | Number of evaluation rounds per optimization cycle.            |
+| `--max_rounds`          | int    | 3                                       | Maximum number of optimization rounds.                         |

 ---

docs/quick_start/usage.md

Lines changed: 11 additions & 0 deletions
@@ -55,7 +55,16 @@ To run AFlow to optimize an agent for the `humaneval` benchmark, provide `aflow`
 ./run_benchmark.sh humaneval single_agent 10 "" "" aflow
 ```

+You can also specify the training and test set sizes for the optimizer. Note that when using the `aflow` optimizer, the number of problems is determined by `train_size` and `test_size`, and the `limit` argument is ignored for data selection.
+
+**Example with custom training and test sizes:**
+
+```bash
+# Run AFlow with a training set of 30 and a test set of 15.
+# The "" arguments are placeholders for mcp_config and concurrency.
+# The limit argument (10) is ignored in this case.
+./run_benchmark.sh humaneval single_agent 10 "" "" aflow 30 15
+```

 ## Command-Line Arguments

@@ -90,6 +99,8 @@ These arguments are used when running an optimizer like AFlow via `--run-optimizer`:
 | `--validation_rounds`| int | 1  | Number of validation rounds per optimization cycle. |
 | `--eval_rounds`      | int | 1  | Number of evaluation rounds per optimization cycle. |
 | `--max_rounds`       | int | 3  | Maximum number of optimization rounds.              |
+| `--train_size`       | int | 40 | Size of the training set for evaluation.            |
+| `--test_size`        | int | 20 | Size of the test set for evaluation.                |

 ## Example Output

example/aflow/run_aflow_optimize.py

Lines changed: 21 additions & 6 deletions
@@ -1,5 +1,7 @@
 import argparse
 import os
+import time
+
 from dotenv import load_dotenv
 from typing import Dict, Any

@@ -70,15 +72,19 @@ def run_aflow_optimization(args: argparse.Namespace) -> str:
         validation_rounds=args.validation_rounds,
         eval_rounds=args.eval_rounds,
         max_rounds=args.max_rounds,
+        train_size=args.train_size,
+        test_size=args.test_size,
         **EXPERIMENTAL_CONFIG.get(args.benchmark, {}),
     )

     optimizer.setup()
     optimizer.optimize(evaluator)
-
-    final_graph_path = os.path.join(args.optimized_path, "final_graph.json")
-    print(f"\n[AFlow] Optimization complete. Optimized graph saved to: {final_graph_path}")
-    return final_graph_path
+    optimizer.test(evaluator)
+
+    best_round = optimizer.find_best_performing_round()
+    best_graph_path = f"{args.optimized_path}/round_{best_round}/graph.py"
+    print(f"\n[AFlow] Optimization complete. Best workflow graph: {best_graph_path}")
+    return best_graph_path


 def main():

@@ -88,7 +94,7 @@ def main():
         "--benchmark",
         type=str,
         default="humaneval",
-        choices=list(["humaneval"]),
+        choices=list(["humaneval", "mbpp"]),
         help="Benchmark to run.",
     )
     parser.add_argument(

@@ -100,14 +106,23 @@ def main():
     parser.add_argument(
         "--optimized_path",
         type=str,
-        default="example/aflow/humaneval/optimization",
+        default=None,
         help="Path to save the optimized agent flow graph.",
     )
     parser.add_argument("--validation_rounds", type=int, default=1, help="Number of validation rounds.")
     parser.add_argument("--eval_rounds", type=int, default=1, help="Number of evaluation rounds.")
     parser.add_argument("--max_rounds", type=int, default=3, help="Maximum number of optimization rounds.")
+    parser.add_argument("--train_size", type=int, default=40, help="Size of the training set for evaluation.")
+    parser.add_argument("--test_size", type=int, default=20, help="Size of the test set for evaluation.")
     args = parser.parse_args()

+    if not args.optimized_path:
+        args.optimized_path = f"example/aflow/{args.benchmark}/optimization"
+
+    if os.path.exists(args.optimized_path):
+        timestamp = time.strftime("%Y%m%d_%H%M%S")
+        args.optimized_path = f"{args.optimized_path}_{timestamp}"
+
     optimized_graph_path = run_aflow_optimization(args)

     print("\n" + "=" * 80)
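The new return path relies on `optimizer.find_best_performing_round()`, which this commit does not show. A minimal sketch of what such a method might do, assuming each round is saved under `round_<n>/` with a `results.json` holding a top-level `score` field (file and key names are assumptions, not part of this diff):

```python
import json
import os

def find_best_performing_round(optimized_path: str) -> int:
    """Return the round number with the highest recorded score.

    Sketch only: assumes round_<n>/results.json files with a "score"
    field; adapt the layout and key names to the real optimizer output.
    """
    best_round, best_score = 1, float("-inf")
    for name in os.listdir(optimized_path):
        if not name.startswith("round_"):
            continue
        results_file = os.path.join(optimized_path, name, "results.json")
        if not os.path.exists(results_file):
            continue
        with open(results_file) as f:
            score = json.load(f).get("score", float("-inf"))
        if score > best_score:
            best_round, best_score = int(name.split("_", 1)[1]), score
    return best_round
```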

example/aflow/run_aflow_optimize.sh

Lines changed: 7 additions & 1 deletion
@@ -10,6 +10,8 @@ OPTIMIZED_PATH=${3:-"example/aflow/humaneval/optimization"}
 VALIDATION_ROUNDS=${4:-1}
 EVAL_ROUNDS=${5:-1}
 MAX_ROUNDS=${6:-3}
+TRAIN_SIZE=${7:-40}
+TEST_SIZE=${8:-20}

 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

@@ -41,6 +43,8 @@ echo "Optimized Path: $OPTIMIZED_PATH"
 echo "Validation Rounds: $VALIDATION_ROUNDS"
 echo "Evaluation Rounds: $EVAL_ROUNDS"
 echo "Max Optimization Rounds: $MAX_ROUNDS"
+echo "Size of the training set for evaluation: $TRAIN_SIZE"
+echo "Size of the test set for evaluation: $TEST_SIZE"
 echo "====================================================="
 echo ""

@@ -50,6 +54,8 @@ python example/aflow/run_aflow_optimize.py \
     --optimized_path "$OPTIMIZED_PATH" \
     --validation_rounds "$VALIDATION_ROUNDS" \
     --eval_rounds "$EVAL_ROUNDS" \
-    --max_rounds "$MAX_ROUNDS"
+    --max_rounds "$MAX_ROUNDS" \
+    --train_size "$TRAIN_SIZE" \
+    --test_size "$TEST_SIZE"

 exit $?

main.py

Lines changed: 15 additions & 2 deletions
@@ -1,7 +1,9 @@
 #!/usr/bin/env python3
 import argparse
 import datetime
+import os
 import sys
+import time
 from pathlib import Path
 import asyncio

@@ -93,18 +95,27 @@ def main():
     optimizer_group.add_argument(
         "--optimized_path",
         type=str,
-        default="example/aflow/humaneval/optimization",
+        default=None,
         help="Path to save the optimized agent flow graph.",
     )
     optimizer_group.add_argument("--validation_rounds", type=int, default=1, help="Number of validation rounds.")
     optimizer_group.add_argument("--eval_rounds", type=int, default=1, help="Number of evaluation rounds.")
     optimizer_group.add_argument("--max_rounds", type=int, default=3, help="Maximum number of optimization rounds.")
+    optimizer_group.add_argument("--train_size", type=int, default=40, help="Size of the training set for evaluation.")
+    optimizer_group.add_argument("--test_size", type=int, default=20, help="Size of the test set for evaluation.")

     # Parse arguments
     args = parser.parse_args()

     if args.run_optimizer:
         if args.run_optimizer == "aflow":
+            if not args.optimized_path:
+                args.optimized_path = f"example/aflow/{args.benchmark}/optimization"
+
+            if os.path.exists(args.optimized_path):
+                timestamp = time.strftime("%Y%m%d_%H%M%S")
+                args.optimized_path = f"{args.optimized_path}_{timestamp}"
+
             from example.aflow.run_aflow_optimize import run_aflow_optimization
             print("\n" + "=" * 80)
             print(f"Running AFlow Optimizer ({datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')})")

@@ -193,6 +204,7 @@ def main():
             agent_system=args.agent_system,
             agent_config=agent_config if agent_config else None,
             verbose=args.verbose,
+            data_id=args.data_id,
             concurrency=args.concurrency,
         ))
     else:

@@ -202,7 +214,8 @@ def main():
             limit=args.limit,
             agent_system=args.agent_system,
             agent_config=agent_config if agent_config else None,
-            verbose=args.verbose
+            verbose=args.verbose,
+            data_id=args.data_id,
         )
     logger.info(f"Benchmark summary: {summary}")
     return 0

mas_arena/agent_flow/workflow_evaluator.py

Lines changed: 9 additions & 6 deletions
@@ -6,17 +6,18 @@ class EvaluationUtils:
     def __init__(self, root_path: str):
         self.root_path = root_path

-    async def evaluate_graph_async(self, optimizer, validation_n, data, initial=False):
+    async def evaluate_graph_async(self, optimizer, validation_n, data, initial=False, train_size=40, test_size=20):

         workflow_runner = WorkflowRunner(agent=optimizer.executor_agent)
         sum_score = 0

         for _ in range(validation_n):

-
             score, avg_cost, total_cost, all_failed = await workflow_runner.graph_evaluate_async(optimizer.evaluator,
-                                                                                                 optimizer.graph,
-                                                                                                 is_test=False)
+                                                                                                 optimizer.graph,
+                                                                                                 is_test=False,
+                                                                                                 train_size=train_size,
+                                                                                                 test_size=test_size)
             cur_round = optimizer.round + 1 if initial is False else optimizer.round
             new_data = optimizer.data_utils.create_result_data(cur_round, score, avg_cost, total_cost)
             data.append(new_data)

@@ -37,6 +38,8 @@ async def evaluate_graph_test_async(self, optimizer):
         evaluator = WorkflowRunner(agent=optimizer.executor_agent)

         score, avg_cost, total_cost, all_failed = await evaluator.graph_evaluate_async(optimizer.evaluator,
-                                                                                       optimizer.graph,
-                                                                                       is_test=True)
+                                                                                       optimizer.graph,
+                                                                                       is_test=True,
+                                                                                       train_size=optimizer.train_size,
+                                                                                       test_size=optimizer.test_size)
         return score, avg_cost, total_cost
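For context, a plausible call site inside the optimizer's validation loop, assuming the optimizer stores `train_size` and `test_size` as attributes (the test path above reads `optimizer.train_size`/`optimizer.test_size`, so this is consistent with the diff but not shown in it):

```python
# Sketch of how an optimizer might thread the new sizes through
# EvaluationUtils; names other than the keyword arguments are assumed.
avg_score = await self.evaluation_utils.evaluate_graph_async(
    self,                      # the optimizer itself
    validation_n=self.validation_rounds,
    data=results,              # accumulates per-round result records
    initial=False,
    train_size=self.train_size,
    test_size=self.test_size,
)
```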

mas_arena/agent_flow/workflow_runner.py

Lines changed: 2 additions & 11 deletions
@@ -21,19 +21,10 @@ def _configure_graph(self, graph, evaluator):
         return graph(name=evaluator.name, agent_name="single_agent", evaluator=evaluator)

     async def graph_evaluate_async(self, evaluator: BaseEvaluator, graph: Callable, is_test: bool = False,
-                                   max_concurrent_tasks: int = 20) -> Tuple[float, float, float]:
+                                   max_concurrent_tasks: int = 20, train_size: int = 40, test_size: int = 20) -> Tuple[float, float, float]:
         configured_graph = self._configure_graph(graph=graph, evaluator=evaluator)

-        # get data for evaluation
-        from mas_arena.evaluators import BENCHMARKS
-        benchmark_config = BENCHMARKS[evaluator.name]
-        data_path = benchmark_config.get("data_path", f"data/{evaluator.name}_test.jsonl")
-        data = []
-        try:
-            with open(data_path, "r") as f:
-                data = [json.loads(line) for line in f]
-        except FileNotFoundError:
-            raise FileNotFoundError(f"Data file not found: {data_path}")
+        data = evaluator.get_test_data(sample_size=test_size) if is_test else evaluator.get_dev_data(sample_size=train_size)
         if not data or len(data) == 0:
             print("No data to evaluate. Returning zeros.")
             return (0.0, 0.0, 0.0, True)
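The file-loading block is replaced by two evaluator hooks that this commit does not define here. A minimal sketch of the contract `get_dev_data`/`get_test_data` would need to satisfy, assuming a JSONL benchmark file and a simple deterministic dev/test split (both assumptions; the real evaluators may sample differently):

```python
import json
import random

class EvaluatorDataMixin:
    """Sketch of the data accessors used by graph_evaluate_async.

    Assumes self.data_path points at a JSONL benchmark file.
    """

    def _load_problems(self):
        with open(self.data_path) as f:
            return [json.loads(line) for line in f]

    def get_dev_data(self, sample_size: int = 40):
        problems = self._load_problems()
        dev = problems[: len(problems) // 2]   # first half as dev split
        return random.sample(dev, min(sample_size, len(dev)))

    def get_test_data(self, sample_size: int = 20):
        problems = self._load_problems()
        test = problems[len(problems) // 2 :]  # second half as test split
        return random.sample(test, min(sample_size, len(test)))
```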

mas_arena/agents/base.py

Lines changed: 3 additions & 2 deletions
@@ -5,6 +5,7 @@
 """

 import abc
+import logging
 from typing import Dict, Any, Optional, Type, Callable
 import uuid
 import os

@@ -17,7 +18,7 @@
 import aiofiles

 from mas_arena.utils.llm_parser import LLMOutputParser
-
+logger = logging.getLogger(__name__)

 class AgentSystem(abc.ABC):
     """Base class for all agent systems in the benchmark framework

@@ -45,7 +46,7 @@ def __init__(self, name: str = None, config: Dict[str, Any] = None):
         self.config = config or {}
         self.evaluator_name = self.config.get("evaluator", None)
         if self.evaluator_name is None:
-            print("Evaluator name is not set in the configuration.")
+            logger.info("Evaluator name is not set in the configuration. Defaulting to None.")

         self.metrics_registry = None
         self.evaluator = None
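One practical note: unlike `print`, the `logger.info(...)` call is invisible unless the application configures logging. A minimal, assumed setup (the repo may already configure logging elsewhere):

```python
import logging

# Enable INFO-level output so messages from mas_arena.agents.base appear.
logging.basicConfig(level=logging.INFO)
```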

mas_arena/agents/single_agent.py

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ async def run_agent(self, problem: Dict[str, Any], **kwargs) -> Dict[str, Any]:

         if "parser" in kwargs or "parse_mode" in kwargs:
             parser = kwargs.get("parser", None)
-            parse_mode = kwargs.get("parser_mode", "str")
+            parse_mode = kwargs.get("parse_mode", "str")
             response_format = self.parse_generated_text(response_content, parser=parser, parse_mode=parse_mode)
             return {
                 "messages": [ai_message],

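This one-character fix matters because `dict.get` with a misspelled key fails silently rather than raising. A self-contained illustration of the pre-fix behavior:

```python
def get_parse_mode(**kwargs):
    # Pre-fix lookup: callers pass parse_mode, but the code asked for
    # "parser_mode", so the default "str" was always used.
    return kwargs.get("parser_mode", "str")

assert get_parse_mode(parse_mode="json") == "str"  # caller's value is silently ignored
```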
mas_arena/benchmark_runner.py

Lines changed: 16 additions & 6 deletions
@@ -86,7 +86,7 @@ def _setup_metrics(self):
         registry = MetricsRegistry()
         return registry

-    def _prepare_benchmark(self, benchmark_name, data_path, limit, agent_system, agent_config, verbose):
+    def _prepare_benchmark(self, benchmark_name, data_path, limit, agent_system, agent_config, verbose, data_id=None):
         """
         Run a benchmark with the specified configuration.

@@ -131,6 +131,15 @@ def _prepare_benchmark(self, benchmark_name, data_path, limit, agent_system, agent_config, verbose, data_id=None):
         except FileNotFoundError:
             raise FileNotFoundError(f"Data file not found: {data_path}")

+        if data_id:
+            primary_id = benchmark_config.get("normalization_keys", {}).get("id", None)
+            if primary_id is not None:
+                for problem in problems:
+                    if str(problem[primary_id]) == data_id:
+                        problems = [problem]
+                        break
+
         if limit and limit < len(problems):
             problems = random.sample(problems, limit)

@@ -364,16 +373,16 @@ def _run_failure_attribution(self, all_results, agent_system, verbose):
             print(f"  --output_dir {failure_output_dir}")
             # print("-" * 80)
             rprint("\n[bold]Alternative analysis methods:[/bold]")
-            print(f"# For comprehensive analysis:")
+            print(f"\n# For comprehensive analysis:")
             print(f"python {failure_inference_script} --method all_at_once --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")
-            print(f"# For efficient error localization in long conversations:")
+            print(f"\n# For efficient error localization in long conversations:")
             print(f"python {failure_inference_script} --method binary_search --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")
             print(f"\n# For detailed incremental analysis:")
             print(f"python {failure_inference_script} --method step_by_step --model gpt-4.1 --directory_path {failed_responses_dir} --output_dir {failure_output_dir}")

             print("=" * 80)

-    def run(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True):
+    def run(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, data_id=None):
         """
         Run a benchmark sequentially. This is a wrapper around arun.
         """

@@ -384,12 +393,13 @@ def run(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, data_id=None):
             agent_system=agent_system,
             agent_config=agent_config,
             verbose=verbose,
+            data_id=data_id,
             concurrency=1  # Run sequentially
         ))

-    async def arun(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, concurrency=10):
+    async def arun(self, benchmark_name="math", data_path=None, limit=None, agent_system="single_agent", agent_config=None, verbose=True, data_id=None, concurrency=10):
         agent, problems, benchmark_config, output_file = self._prepare_benchmark(
-            benchmark_name, data_path, limit, agent_system, agent_config, verbose
+            benchmark_name, data_path, limit, agent_system, agent_config, verbose, data_id
        )

         if verbose: