Skip to content

Commit 600582e

Browse files
committed
test evals
1 parent ba45203 commit 600582e

3 files changed

Lines changed: 38 additions & 5 deletions

File tree

.github/workflows/benchmark-tmpl.yml

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,31 @@ jobs:
154154
155155
if [ "${{ inputs.eval-only }}" = "true" ]; then
156156
echo "Eval-only mode: skipping benchmark result file check"
157+
# Verify eval produced results
158+
if ! ls results*.json 1>/dev/null 2>&1; then
159+
echo "Eval-only run failed: no results*.json files found." >&2
160+
exit 1
161+
fi
162+
# Verify eval scores meet minimum threshold (85%)
163+
python3 << 'PYEOF'
164+
import json, glob, sys
165+
MIN_SCORE = 0.85
166+
failed = False
167+
for f in glob.glob("results*.json"):
168+
with open(f) as fh:
169+
data = json.load(fh)
170+
for task, metrics in data.get("results", {}).items():
171+
for name, val in metrics.items():
172+
if not name.startswith("exact_match,") or "stderr" in name:
173+
continue
174+
if isinstance(val, (int, float)) and val < MIN_SCORE:
175+
print(f"FAIL: {task} {name} = {val:.4f} (< {MIN_SCORE})", file=sys.stderr)
176+
failed = True
177+
elif isinstance(val, (int, float)):
178+
print(f"PASS: {task} {name} = {val:.4f}")
179+
if failed:
180+
sys.exit(1)
181+
PYEOF
157182
else
158183
FOUND_RESULT_FILE=
159184
for i in {1..10}; do
@@ -210,7 +235,7 @@ jobs:
210235
meta_env.json
211236
results*.json
212237
sample*.jsonl
213-
if-no-files-found: ignore
238+
if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }}
214239

215240
- name: Cleanup eval outputs (post-upload)
216241
if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }}

benchmarks/benchmark_lib.sh

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -753,9 +753,18 @@ run_eval() {
753753
compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null
754754
fi
755755

756+
local eval_rc=0
756757
case "$framework" in
757-
lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;;
758-
*) echo "Unknown framework '${framework}'"; return 1 ;;
758+
lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;;
759+
*) echo "Unknown framework '${framework}'"; eval_rc=1 ;;
759760
esac
760-
return $?
761+
762+
if [ "$eval_rc" -ne 0 ]; then
763+
echo "ERROR: run_eval failed with exit code $eval_rc" >&2
764+
if [ "${EVAL_ONLY}" = "true" ]; then
765+
echo "Eval-only mode: exiting with failure" >&2
766+
exit "$eval_rc"
767+
fi
768+
fi
769+
return $eval_rc
761770
}

perf-changelog.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1017,4 +1017,3 @@
10171017
description:
10181018
- "Separate evals, change to 8k1k"
10191019
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911
1020-
evals-only: true

0 commit comments

Comments (0)