We read every piece of feedback and take your input very seriously.
To see all available qualifiers, see our documentation.
1 parent 31ea45a, commit dec9a84 (Copy full SHA for dec9a84)
1 file changed
benchmarks/healthcare/eval_script_xlam2_70b.sh
@@ -6,8 +6,7 @@ MODEL="xlam_2_70b"
6
MAX_TURNS=30
7
REPORT_MODEL="gpt-4.1-2025-04-14"
8
JUDGE_MODEL="gpt-4o"
9
-SERVER="@openbnb/mcp-server-airbnb"
10
-SERVER_ARGS="--ignore-robots-txt"
+SERVER="mcp_servers/${DOMAIN}/server.py"
11
MODEL_CONFIG="benchmarks/${DOMAIN}/eval_models/${MODEL}.json"
12
TASKS_FILE="data/${DOMAIN}/evaluation_tasks_verified.jsonl"
13
OUTPUT="benchmarks/${DOMAIN}/results/${MODEL}_task_evaluation.json"
@@ -22,7 +21,6 @@ REPORT_DIR="benchmarks/${DOMAIN}/report"
22
21
23
mcp-eval evaluate \
24
--server $SERVER \
25
- --server-args="$SERVER_ARGS" \
26
--model-config $MODEL_CONFIG \
27
--tasks-file $TASKS_FILE \
28
--output $OUTPUT \
0 commit comments