forked from openai/SWELancer-Benchmark
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathrun_swelancer.py
More file actions
50 lines (45 loc) · 1.86 KB
/
run_swelancer.py
File metadata and controls
50 lines (45 loc) · 1.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from __future__ import annotations
# Load environment before importing anything else
from dotenv import load_dotenv
load_dotenv()
from swelancer import SWELancerEval
import argparse
import nanoeval
from nanoeval.evaluation import EvalSpec, RunnerArgs
from nanoeval.recorder import dummy_recorder
from nanoeval.setup import nanoeval_entrypoint
from morph_agent import SimpleAgentSolver as MorphAgentSolver
from swelancer_agent import SimpleAgentSolver as SwelancerAgentSolver
def parse_args():
parser = argparse.ArgumentParser(description='Run SWELancer evaluation')
parser.add_argument('--issue_ids', nargs='*', type=str, help='List of ISSUE_IDs to evaluate. If not specified, all issues will be evaluated.')
parser.add_argument('--use_morph', action='store_true', help='Use MorphAgentSolver if specified, otherwise use SwelancerAgentSolver')
parser.add_argument('--model', type=str, default="gpt-4o", help='Model to use for the solver')
return parser.parse_args()
async def main() -> None:
    """Run the SWELancer evaluation selected on the command line and print its report.

    Reads the CLI options via ``parse_args()``, instantiates either
    ``MorphAgentSolver`` or ``SwelancerAgentSolver`` with the requested
    model, wraps it in a ``SWELancerEval``, and awaits ``nanoeval.run`` on
    the resulting ``EvalSpec``. The returned report is written to stdout.
    """
    cli = parse_args()

    # A missing or empty --issue_ids collapses to None (no explicit subset).
    selected_issues = cli.issue_ids or None

    # --use_morph switches to the Morph solver; otherwise use the SWELancer one.
    solver_cls = MorphAgentSolver if cli.use_morph else SwelancerAgentSolver
    agent = solver_cls(model=cli.model)

    # taskset is a list of ISSUE_IDs you wish to evaluate (e.g., ["123", "456_789"])
    evaluation = SWELancerEval(solver=agent, taskset=selected_issues)
    runner_args = RunnerArgs(
        concurrency=1,
        experimental_use_multiprocessing=False,
        enable_slackbot=False,
        recorder=dummy_recorder(),
        max_retries=5,
    )
    spec = EvalSpec(eval=evaluation, runner=runner_args)

    result = await nanoeval.run(spec)
    print(result)
if __name__ == "__main__":
    # Only when executed as a script: create the main() coroutine and hand
    # it to nanoeval's entrypoint helper, which is responsible for running it.
    entry_coroutine = main()
    nanoeval_entrypoint(entry_coroutine)