Skip to content

Commit 600582e

Browse files
committed
test evals
1 parent ba45203 commit 600582e

3 files changed

Lines changed: 38 additions & 5 deletions

File tree

.github/workflows/benchmark-tmpl.yml

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,31 @@ jobs:
154154
155155
if [ "${{ inputs.eval-only }}" = "true" ]; then
156156
echo "Eval-only mode: skipping benchmark result file check"
157+
# Verify eval produced results
158+
if ! ls results*.json 1>/dev/null 2>&1; then
159+
echo "Eval-only run failed: no results*.json files found." >&2
160+
exit 1
161+
fi
162+
# Verify eval scores meet minimum threshold (85%)
163+
python3 << 'PYEOF'
164+
import json, glob, sys
165+
MIN_SCORE = 0.85
166+
failed = False
167+
for f in glob.glob("results*.json"):
168+
with open(f) as fh:
169+
data = json.load(fh)
170+
for task, metrics in data.get("results", {}).items():
171+
for name, val in metrics.items():
172+
if not name.startswith("exact_match,") or "stderr" in name:
173+
continue
174+
if isinstance(val, (int, float)) and val < MIN_SCORE:
175+
print(f"FAIL: {task} {name} = {val:.4f} (< {MIN_SCORE})", file=sys.stderr)
176+
failed = True
177+
elif isinstance(val, (int, float)):
178+
print(f"PASS: {task} {name} = {val:.4f}")
179+
if failed:
180+
sys.exit(1)
181+
PYEOF
157182
else
158183
FOUND_RESULT_FILE=
159184
for i in {1..10}; do
@@ -210,7 +235,7 @@ jobs:
210235
meta_env.json
211236
results*.json
212237
sample*.jsonl
213-
if-no-files-found: ignore
238+
if-no-files-found: ${{ inputs.eval-only && 'error' || 'ignore' }}
214239

215240
- name: Cleanup eval outputs (post-upload)
216241
if: ${{ env.RUN_EVAL == 'true' || inputs.eval-only }}

benchmarks/benchmark_lib.sh

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -753,9 +753,18 @@ run_eval() {
753753
compute_eval_context_length "$MODEL" "${MAX_MODEL_LEN:-0}" > /dev/null
754754
fi
755755

756+
local eval_rc=0
756757
case "$framework" in
757-
lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" ;;
758-
*) echo "Unknown framework '${framework}'"; return 1 ;;
758+
lm-eval|lm_eval) run_lm_eval "${forwarded[@]}" || eval_rc=$? ;;
759+
*) echo "Unknown framework '${framework}'"; eval_rc=1 ;;
759760
esac
760-
return $?
761+
762+
if [ "$eval_rc" -ne 0 ]; then
763+
echo "ERROR: run_eval failed with exit code $eval_rc" >&2
764+
if [ "${EVAL_ONLY}" = "true" ]; then
765+
echo "Eval-only mode: exiting with failure" >&2
766+
exit "$eval_rc"
767+
fi
768+
fi
769+
return $eval_rc
761770
}

perf-changelog.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1017,4 +1017,3 @@
10171017
description:
10181018
- "Separate evals, change to 8k1k"
10191019
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/911
1020-
evals-only: true

0 commit comments

Comments (0)