@@ -154,6 +154,31 @@ jobs:
154154
155155 if [ "${{ inputs.eval-only }}" = "true" ]; then
156156 echo "Eval-only mode: skipping benchmark result file check"
157+ # Verify eval produced results
158+ if ! ls results*.json 1>/dev/null 2>&1; then
159+ echo "Eval-only run failed: no results*.json files found." >&2
160+ exit 1
161+ fi
162+ # Verify eval scores meet minimum threshold (85%)
163+ python3 << 'PYEOF'
164+ import json, glob, sys
165+ MIN_SCORE = 0.85
166+ failed = False
167+ for f in glob.glob("results*.json"):
168+ with open(f) as fh:
169+ data = json.load(fh)
170+ for task, metrics in data.get("results", {}).items():
171+ for name, val in metrics.items():
172+ if not name.startswith("exact_match,") or "stderr" in name:
173+ continue
174+ if isinstance(val, (int, float)) and val < MIN_SCORE:
175+ print(f"FAIL: {task} {name} = {val:.4f} (< {MIN_SCORE})", file=sys.stderr)
176+ failed = True
177+ elif isinstance(val, (int, float)):
178+ print(f"PASS: {task} {name} = {val:.4f}")
179+ if failed:
180+ sys.exit(1)
181+ PYEOF
157182 else
158183 FOUND_RESULT_FILE=
159184 for i in {1..10}; do
@@ -210,7 +235,7 @@ jobs:
210235 meta_env.json
211236 results*.json
212237 sample*.jsonl
213- if-no-files-found : ignore
238+ if-no-files-found : ${{ inputs.eval-only && 'error' || 'ignore' }}
214239
215240 - name : Cleanup eval outputs (post-upload)
216241 if : ${{ env.RUN_EVAL == 'true' || inputs.eval-only }}
0 commit comments