From d0c3088628bb49945b42755c4ac2d56b18538786 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 15:37:40 -0400 Subject: [PATCH 1/2] Add RTFx tracking and validation to all benchmark workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: 1. Add RTFx metric extraction to qwen3-asr-benchmark.yml - Extract medianRTFx and overallRTFx from benchmark results - Display metrics in PR comments 2. Add RTFx validation to all 6 benchmark workflows - Fail with exit 1 if RTFx is 0 or N/A - Prevents silent benchmark failures from being reported as success 3. Fix PR comment posting with if: always() - Add "if: always()" to Comment PR steps - Ensures PR comments post even when validation fails - Allows users to see what went wrong Workflows updated: - qwen3-asr-benchmark.yml: Validate 2 RTFx metrics + add tracking - asr-benchmark.yml: Validate 6 RTFx metrics (v2/v3 × clean/other/streaming) - diarizer-benchmark.yml: Validate 1 RTFx metric - parakeet-eou-benchmark.yml: Validate 1 RTFx metric - sortformer-benchmark.yml: Validate 1 RTFx metric - vad-benchmark.yml: Validate 2 RTFx metrics (MUSAN + VOiCES) If RTFx is 0, it means: - Benchmark didn't run properly - Audio duration was 0 - Processing failed silently - Metric extraction failed Co-Authored-By: Claude Sonnet 4.5 --- .github/workflows/asr-benchmark.yml | 41 ++++++++++++++++++-- .github/workflows/diarizer-benchmark.yml | 7 ++++ .github/workflows/parakeet-eou-benchmark.yml | 9 ++++- .github/workflows/qwen3-asr-benchmark.yml | 35 ++++++++++++++++- .github/workflows/sortformer-benchmark.yml | 7 ++++ .github/workflows/vad-benchmark.yml | 26 +++++++++++++ 6 files changed, 120 insertions(+), 5 deletions(-) diff --git a/.github/workflows/asr-benchmark.yml b/.github/workflows/asr-benchmark.yml index 302577ffb..bf201c414 100644 --- a/.github/workflows/asr-benchmark.yml +++ b/.github/workflows/asr-benchmark.yml @@ -198,9 +198,38 @@ jobs: echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT echo "FILES_COUNT=$MAX_FILES" >> $GITHUB_OUTPUT + # Validate RTFx values - 0 indicates benchmark failure + if [ "$CLEAN_RTFx" = "0.00" ] || [ "$CLEAN_RTFx" = "N/A" ]; then + echo "⚠️ test-clean RTFx is 0 or N/A - benchmark may have failed" + CLEAN_RTFX_FAILED=1 + fi + if [ "$CLEAN_V2_RTFx" = "0.00" ] || [ "$CLEAN_V2_RTFx" = "N/A" ]; then + echo "⚠️ test-clean (v2) RTFx is 0 or N/A - benchmark may have failed" + CLEAN_V2_RTFX_FAILED=1 + fi + if [ "$OTHER_RTFx" = "0.00" ] || [ "$OTHER_RTFx" = "N/A" ]; then + echo "⚠️ test-other RTFx is 0 or N/A - benchmark may have failed" + OTHER_RTFX_FAILED=1 + fi + if [ "$OTHER_V2_RTFx" = "0.00" ] || [ "$OTHER_V2_RTFx" = "N/A" ]; then + echo "⚠️ test-other (v2) RTFx is 0 or N/A - benchmark may have failed" + OTHER_V2_RTFX_FAILED=1 + fi + if [ "$STREAMING_RTFx" = "0.00" ] || [ "$STREAMING_RTFx" = "N/A" ]; then + echo "⚠️ streaming RTFx is 0 or N/A - benchmark may have failed" + STREAMING_RTFX_FAILED=1 + fi + if [ "$STREAMING_V2_RTFx" = "0.00" ] || [ "$STREAMING_V2_RTFx" = "N/A" ]; then + echo "⚠️ streaming (v2) RTFx is 0 or N/A - benchmark may have failed" + STREAMING_V2_RTFX_FAILED=1 + fi + # Report failures summary if [ ! -z "$CLEAN_FAILED" ] || [ ! -z "$OTHER_FAILED" ] || [ ! -z "$STREAMING_FAILED" ] || \ - [ ! -z "$CLEAN_V2_FAILED" ] || [ ! -z "$OTHER_V2_FAILED" ] || [ ! -z "$STREAMING_V2_FAILED" ]; then + [ ! -z "$CLEAN_V2_FAILED" ] || [ ! -z "$OTHER_V2_FAILED" ] || [ ! -z "$STREAMING_V2_FAILED" ] || \ + [ ! -z "$CLEAN_RTFX_FAILED" ] || [ ! -z "$CLEAN_V2_RTFX_FAILED" ] || \ + [ ! -z "$OTHER_RTFX_FAILED" ] || [ ! -z "$OTHER_V2_RTFX_FAILED" ] || \ + [ ! -z "$STREAMING_RTFX_FAILED" ] || [ ! -z "$STREAMING_V2_RTFX_FAILED" ]; then echo "BENCHMARK_STATUS=PARTIAL_FAILURE" >> $GITHUB_OUTPUT echo "⚠️ Some benchmarks failed:" [ ! -z "$CLEAN_FAILED" ] && echo " - test-clean benchmark failed" @@ -209,14 +238,20 @@ jobs: [ ! -z "$CLEAN_V2_FAILED" ] && echo " - test-clean (v2) benchmark failed" [ ! -z "$OTHER_V2_FAILED" ] && echo " - test-other (v2) benchmark failed" [ ! -z "$STREAMING_V2_FAILED" ] && echo " - streaming (v2) benchmark failed" - # Don't exit with error to allow PR comment to be posted + [ ! -z "$CLEAN_RTFX_FAILED" ] && echo " - test-clean RTFx is 0" + [ ! -z "$CLEAN_V2_RTFX_FAILED" ] && echo " - test-clean (v2) RTFx is 0" + [ ! -z "$OTHER_RTFX_FAILED" ] && echo " - test-other RTFx is 0" + [ ! -z "$OTHER_V2_RTFX_FAILED" ] && echo " - test-other (v2) RTFx is 0" + [ ! -z "$STREAMING_RTFX_FAILED" ] && echo " - streaming RTFx is 0" + [ ! -z "$STREAMING_V2_RTFX_FAILED" ] && echo " - streaming (v2) RTFx is 0" + exit 1 else echo "BENCHMARK_STATUS=SUCCESS" >> $GITHUB_OUTPUT echo "✅ All benchmarks completed successfully" fi - name: Comment PR - if: github.event_name == 'pull_request' + if: always() && github.event_name == 'pull_request' continue-on-error: true uses: actions/github-script@v7 with: diff --git a/.github/workflows/diarizer-benchmark.yml b/.github/workflows/diarizer-benchmark.yml index 43251f671..0775ad844 100644 --- a/.github/workflows/diarizer-benchmark.yml +++ b/.github/workflows/diarizer-benchmark.yml @@ -115,6 +115,13 @@ jobs: echo "CLUSTERING_TIME=${CLUSTERING_TIME}" >> $GITHUB_OUTPUT echo "INFERENCE_TIME=${INFERENCE_TIME}" >> $GITHUB_OUTPUT + # Validate RTFx - 0 indicates benchmark failure + if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then + echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed" + echo "RTFx value: $RTF" + exit 1 + fi + - name: Comment PR with Benchmark Results if: always() uses: actions/github-script@v7 diff --git a/.github/workflows/parakeet-eou-benchmark.yml b/.github/workflows/parakeet-eou-benchmark.yml index bef59e306..0b3bb62ba 100644 --- a/.github/workflows/parakeet-eou-benchmark.yml +++ b/.github/workflows/parakeet-eou-benchmark.yml @@ -104,8 +104,15 @@ jobs: echo "MAX_FILES=$MAX_FILES" >> $GITHUB_OUTPUT echo "BENCHMARK_STATUS=$BENCHMARK_STATUS" >> $GITHUB_OUTPUT + # Validate RTFx - 0 or N/A indicates benchmark failure + if [ "$RTFx" = "0.00" ] || [ "$RTFx" = "N/A" ]; then + echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed" + echo "RTFx value: $RTFx" + exit 1 + fi + - name: Comment PR - if: github.event_name == 'pull_request' + if: always() && github.event_name == 'pull_request' continue-on-error: true uses: actions/github-script@v7 with: diff --git a/.github/workflows/qwen3-asr-benchmark.yml b/.github/workflows/qwen3-asr-benchmark.yml index 074bede82..ac1e15580 100644 --- a/.github/workflows/qwen3-asr-benchmark.yml +++ b/.github/workflows/qwen3-asr-benchmark.yml @@ -69,17 +69,44 @@ jobs: echo "SMOKE_STATUS=FAILED" >> $GITHUB_OUTPUT fi + # Extract RTFx metrics if results file exists + if [ -f qwen3_results_int8.json ]; then + MEDIAN_RTFx=$(jq -r '.summary.medianRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null) + OVERALL_RTFx=$(jq -r '.summary.overallRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null) + + [ "$MEDIAN_RTFx" != "null" ] && [ "$MEDIAN_RTFx" != "N/A" ] && MEDIAN_RTFx=$(printf "%.2f" "$MEDIAN_RTFx") || MEDIAN_RTFx="N/A" + [ "$OVERALL_RTFx" != "null" ] && [ "$OVERALL_RTFx" != "N/A" ] && OVERALL_RTFx=$(printf "%.2f" "$OVERALL_RTFx") || OVERALL_RTFx="N/A" + + echo "MEDIAN_RTFx=$MEDIAN_RTFx" >> $GITHUB_OUTPUT + echo "OVERALL_RTFx=$OVERALL_RTFx" >> $GITHUB_OUTPUT + + # Fail if RTFx is 0 or N/A - indicates benchmark failure + if [ "$MEDIAN_RTFx" = "N/A" ] || [ "$MEDIAN_RTFx" = "0.00" ] || [ "$OVERALL_RTFx" = "N/A" ] || [ "$OVERALL_RTFx" = "0.00" ]; then + echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed to produce valid results" + echo "Median RTFx: $MEDIAN_RTFx" + echo "Overall RTFx: $OVERALL_RTFx" + exit 1 + fi + else + echo "❌ CRITICAL: Results file not found - benchmark failed" + echo "MEDIAN_RTFx=N/A" >> $GITHUB_OUTPUT + echo "OVERALL_RTFx=N/A" >> $GITHUB_OUTPUT + exit 1 + fi + EXECUTION_TIME=$(( ($(date +%s) - BENCHMARK_START) / 60 ))m$(( ($(date +%s) - BENCHMARK_START) % 60 ))s echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT - name: Comment PR - if: github.event_name == 'pull_request' + if: always() && github.event_name == 'pull_request' continue-on-error: true uses: actions/github-script@v7 with: script: | const status = '${{ steps.smoketest.outputs.SMOKE_STATUS }}'; const emoji = status === 'PASSED' ? '✅' : '❌'; + const medianRTFx = '${{ steps.smoketest.outputs.MEDIAN_RTFx }}'; + const overallRTFx = '${{ steps.smoketest.outputs.OVERALL_RTFx }}'; const body = `## Qwen3-ASR int8 Smoke Test ${emoji} @@ -91,6 +118,12 @@ jobs: | Transcription pipeline | ${emoji} | | Decoder size | 571 MB (vs 1.1 GB f32) | + ### Performance Metrics + | Metric | CI Value | Expected on Apple Silicon | + |--------|----------|--------------------------| + | Median RTFx | ${medianRTFx}x | ~2.5x | + | Overall RTFx | ${overallRTFx}x | ~2.5x | + Runtime: ${{ steps.smoketest.outputs.EXECUTION_TIME }} **Note:** CI VM lacks physical GPU — CoreML MLState (macOS 15) KV cache produces degraded results on virtualized runners. On Apple Silicon: ~1.3% WER / 2.5x RTFx. diff --git a/.github/workflows/sortformer-benchmark.yml b/.github/workflows/sortformer-benchmark.yml index 2f9edd701..a3e04d662 100644 --- a/.github/workflows/sortformer-benchmark.yml +++ b/.github/workflows/sortformer-benchmark.yml @@ -115,6 +115,13 @@ jobs: echo "DETECTED=${DETECTED}" >> $GITHUB_OUTPUT echo "GROUND_TRUTH=${GROUND_TRUTH}" >> $GITHUB_OUTPUT + # Validate RTFx - 0 indicates benchmark failure + if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then + echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed" + echo "RTFx value: $RTF" + exit 1 + fi + - name: Comment PR with Benchmark Results if: always() uses: actions/github-script@v7 diff --git a/.github/workflows/vad-benchmark.yml b/.github/workflows/vad-benchmark.yml index 3a75a0e60..22806c1fc 100644 --- a/.github/workflows/vad-benchmark.yml +++ b/.github/workflows/vad-benchmark.yml @@ -74,6 +74,32 @@ jobs: --threshold 0.5 \ --output voices_vad_results.json + - name: Validate RTFx metrics + run: | + # Validate MUSAN RTFx + if [ -f musan_vad_results.json ]; then + MUSAN_RTFx=$(jq -r '.rtfx // 0' musan_vad_results.json) + if [ "$MUSAN_RTFx" = "0" ] || [ -z "$MUSAN_RTFx" ]; then + echo "❌ CRITICAL: MUSAN RTFx is 0 or empty - benchmark failed" + exit 1 + fi + else + echo "❌ CRITICAL: musan_vad_results.json not found" + exit 1 + fi + + # Validate VOiCES RTFx + if [ -f voices_vad_results.json ]; then + VOICES_RTFx=$(jq -r '.rtfx // 0' voices_vad_results.json) + if [ "$VOICES_RTFx" = "0" ] || [ -z "$VOICES_RTFx" ]; then + echo "❌ CRITICAL: VOiCES RTFx is 0 or empty - benchmark failed" + exit 1 + fi + else + echo "❌ CRITICAL: voices_vad_results.json not found" + exit 1 + fi + - name: Upload results if: always() uses: actions/upload-artifact@v4 From eea5376e9b660c2d9b5927742c9a55275cd17058 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 15:58:10 -0400 Subject: [PATCH 2/2] Fix EXECUTION_TIME calculation and improve validation consistency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes issues identified in review: 1. Move EXECUTION_TIME calculation before RTFx validation (qwen3) - Previously exit 1 prevented EXECUTION_TIME from being set - Now PR comments show proper runtime even when validation fails 2. Standardize error messages to "❌ CRITICAL:" across all workflows - Changed "⚠️" to "❌ CRITICAL:" for consistency - All validation failures now use the same format 3. Add more zero format checks (0, 0.0, 0.00) - Catches edge cases like "0" or "0.0" in addition to "0.00" - More robust string comparison for RTFx validation Workflows updated: - qwen3-asr-benchmark.yml: Move EXECUTION_TIME before validation - asr-benchmark.yml: Standardize error messages, add zero variants - parakeet-eou-benchmark.yml: Add zero variants and empty check Co-Authored-By: Claude Sonnet 4.5 --- .github/workflows/asr-benchmark.yml | 24 ++++++++++---------- .github/workflows/parakeet-eou-benchmark.yml | 2 +- .github/workflows/qwen3-asr-benchmark.yml | 12 ++++++---- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/.github/workflows/asr-benchmark.yml b/.github/workflows/asr-benchmark.yml index bf201c414..dcad9c70b 100644 --- a/.github/workflows/asr-benchmark.yml +++ b/.github/workflows/asr-benchmark.yml @@ -199,28 +199,28 @@ jobs: echo "FILES_COUNT=$MAX_FILES" >> $GITHUB_OUTPUT # Validate RTFx values - 0 indicates benchmark failure - if [ "$CLEAN_RTFx" = "0.00" ] || [ "$CLEAN_RTFx" = "N/A" ]; then - echo "⚠️ test-clean RTFx is 0 or N/A - benchmark may have failed" + if [ "$CLEAN_RTFx" = "0.00" ] || [ "$CLEAN_RTFx" = "0.0" ] || [ "$CLEAN_RTFx" = "0" ] || [ "$CLEAN_RTFx" = "N/A" ]; then + echo "❌ CRITICAL: test-clean RTFx is 0 or N/A - benchmark failed" CLEAN_RTFX_FAILED=1 fi - if [ "$CLEAN_V2_RTFx" = "0.00" ] || [ "$CLEAN_V2_RTFx" = "N/A" ]; then - echo "⚠️ test-clean (v2) RTFx is 0 or N/A - benchmark may have failed" + if [ "$CLEAN_V2_RTFx" = "0.00" ] || [ "$CLEAN_V2_RTFx" = "0.0" ] || [ "$CLEAN_V2_RTFx" = "0" ] || [ "$CLEAN_V2_RTFx" = "N/A" ]; then + echo "❌ CRITICAL: test-clean (v2) RTFx is 0 or N/A - benchmark failed" CLEAN_V2_RTFX_FAILED=1 fi - if [ "$OTHER_RTFx" = "0.00" ] || [ "$OTHER_RTFx" = "N/A" ]; then - echo "⚠️ test-other RTFx is 0 or N/A - benchmark may have failed" + if [ "$OTHER_RTFx" = "0.00" ] || [ "$OTHER_RTFx" = "0.0" ] || [ "$OTHER_RTFx" = "0" ] || [ "$OTHER_RTFx" = "N/A" ]; then + echo "❌ CRITICAL: test-other RTFx is 0 or N/A - benchmark failed" OTHER_RTFX_FAILED=1 fi - if [ "$OTHER_V2_RTFx" = "0.00" ] || [ "$OTHER_V2_RTFx" = "N/A" ]; then - echo "⚠️ test-other (v2) RTFx is 0 or N/A - benchmark may have failed" + if [ "$OTHER_V2_RTFx" = "0.00" ] || [ "$OTHER_V2_RTFx" = "0.0" ] || [ "$OTHER_V2_RTFx" = "0" ] || [ "$OTHER_V2_RTFx" = "N/A" ]; then + echo "❌ CRITICAL: test-other (v2) RTFx is 0 or N/A - benchmark failed" OTHER_V2_RTFX_FAILED=1 fi - if [ "$STREAMING_RTFx" = "0.00" ] || [ "$STREAMING_RTFx" = "N/A" ]; then - echo "⚠️ streaming RTFx is 0 or N/A - benchmark may have failed" + if [ "$STREAMING_RTFx" = "0.00" ] || [ "$STREAMING_RTFx" = "0.0" ] || [ "$STREAMING_RTFx" = "0" ] || [ "$STREAMING_RTFx" = "N/A" ]; then + echo "❌ CRITICAL: streaming RTFx is 0 or N/A - benchmark failed" STREAMING_RTFX_FAILED=1 fi - if [ "$STREAMING_V2_RTFx" = "0.00" ] || [ "$STREAMING_V2_RTFx" = "N/A" ]; then - echo "⚠️ streaming (v2) RTFx is 0 or N/A - benchmark may have failed" + if [ "$STREAMING_V2_RTFx" = "0.00" ] || [ "$STREAMING_V2_RTFx" = "0.0" ] || [ "$STREAMING_V2_RTFx" = "0" ] || [ "$STREAMING_V2_RTFx" = "N/A" ]; then + echo "❌ CRITICAL: streaming (v2) RTFx is 0 or N/A - benchmark failed" STREAMING_V2_RTFX_FAILED=1 fi diff --git a/.github/workflows/parakeet-eou-benchmark.yml b/.github/workflows/parakeet-eou-benchmark.yml index 0b3bb62ba..b5a23d4df 100644 --- a/.github/workflows/parakeet-eou-benchmark.yml +++ b/.github/workflows/parakeet-eou-benchmark.yml @@ -105,7 +105,7 @@ jobs: echo "BENCHMARK_STATUS=$BENCHMARK_STATUS" >> $GITHUB_OUTPUT # Validate RTFx - 0 or N/A indicates benchmark failure - if [ "$RTFx" = "0.00" ] || [ "$RTFx" = "N/A" ]; then + if [ "$RTFx" = "0.00" ] || [ "$RTFx" = "0.0" ] || [ "$RTFx" = "0" ] || [ "$RTFx" = "N/A" ] || [ -z "$RTFx" ]; then echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed" echo "RTFx value: $RTFx" exit 1 diff --git a/.github/workflows/qwen3-asr-benchmark.yml b/.github/workflows/qwen3-asr-benchmark.yml index ac1e15580..a23795369 100644 --- a/.github/workflows/qwen3-asr-benchmark.yml +++ b/.github/workflows/qwen3-asr-benchmark.yml @@ -69,6 +69,10 @@ jobs: echo "SMOKE_STATUS=FAILED" >> $GITHUB_OUTPUT fi + # Calculate execution time before validation (needed for PR comment) + EXECUTION_TIME=$(( ($(date +%s) - BENCHMARK_START) / 60 ))m$(( ($(date +%s) - BENCHMARK_START) % 60 ))s + echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT + # Extract RTFx metrics if results file exists if [ -f qwen3_results_int8.json ]; then MEDIAN_RTFx=$(jq -r '.summary.medianRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null) @@ -80,8 +84,9 @@ jobs: echo "MEDIAN_RTFx=$MEDIAN_RTFx" >> $GITHUB_OUTPUT echo "OVERALL_RTFx=$OVERALL_RTFx" >> $GITHUB_OUTPUT - # Fail if RTFx is 0 or N/A - indicates benchmark failure - if [ "$MEDIAN_RTFx" = "N/A" ] || [ "$MEDIAN_RTFx" = "0.00" ] || [ "$OVERALL_RTFx" = "N/A" ] || [ "$OVERALL_RTFx" = "0.00" ]; then + # Validate RTFx - fail if 0 or N/A (indicates benchmark failure) + if [ "$MEDIAN_RTFx" = "N/A" ] || [ "$MEDIAN_RTFx" = "0.00" ] || [ "$MEDIAN_RTFx" = "0" ] || [ "$MEDIAN_RTFx" = "0.0" ] || \ + [ "$OVERALL_RTFx" = "N/A" ] || [ "$OVERALL_RTFx" = "0.00" ] || [ "$OVERALL_RTFx" = "0" ] || [ "$OVERALL_RTFx" = "0.0" ]; then echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed to produce valid results" echo "Median RTFx: $MEDIAN_RTFx" echo "Overall RTFx: $OVERALL_RTFx" @@ -94,9 +99,6 @@ jobs: exit 1 fi - EXECUTION_TIME=$(( ($(date +%s) - BENCHMARK_START) / 60 ))m$(( ($(date +%s) - BENCHMARK_START) % 60 ))s - echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT - - name: Comment PR if: always() && github.event_name == 'pull_request' continue-on-error: true