Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 38 additions & 3 deletions .github/workflows/asr-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,9 +198,38 @@ jobs:
echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT
echo "FILES_COUNT=$MAX_FILES" >> $GITHUB_OUTPUT

# Validate RTFx values - 0 indicates benchmark failure
#
# rtfx_is_invalid VALUE
#   Succeeds (status 0) when VALUE cannot be a real RTFx reading:
#     - empty (metric was never extracted)
#     - the placeholders "N/A" or "null"
#     - any spelling of zero ("0", "0.0", "0.00", "0.000", ...)
#   Fails (status 1) for anything containing a character other than
#   '0' and '.', i.e. a value with a non-zero digit.
rtfx_is_invalid() {
  case "$1" in
    '' | 'N/A' | 'null') return 0 ;;
    *[!0.]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Each check sets its own *_RTFX_FAILED flag, which the failure summary reads.
if rtfx_is_invalid "$CLEAN_RTFx"; then
  echo "❌ CRITICAL: test-clean RTFx is 0 or N/A - benchmark failed"
  CLEAN_RTFX_FAILED=1
fi
if rtfx_is_invalid "$CLEAN_V2_RTFx"; then
  echo "❌ CRITICAL: test-clean (v2) RTFx is 0 or N/A - benchmark failed"
  CLEAN_V2_RTFX_FAILED=1
fi
if rtfx_is_invalid "$OTHER_RTFx"; then
  echo "❌ CRITICAL: test-other RTFx is 0 or N/A - benchmark failed"
  OTHER_RTFX_FAILED=1
fi
if rtfx_is_invalid "$OTHER_V2_RTFx"; then
  echo "❌ CRITICAL: test-other (v2) RTFx is 0 or N/A - benchmark failed"
  OTHER_V2_RTFX_FAILED=1
fi
if rtfx_is_invalid "$STREAMING_RTFx"; then
  echo "❌ CRITICAL: streaming RTFx is 0 or N/A - benchmark failed"
  STREAMING_RTFX_FAILED=1
fi
if rtfx_is_invalid "$STREAMING_V2_RTFx"; then
  echo "❌ CRITICAL: streaming (v2) RTFx is 0 or N/A - benchmark failed"
  STREAMING_V2_RTFX_FAILED=1
fi

# Report failures summary
if [ ! -z "$CLEAN_FAILED" ] || [ ! -z "$OTHER_FAILED" ] || [ ! -z "$STREAMING_FAILED" ] || \
[ ! -z "$CLEAN_V2_FAILED" ] || [ ! -z "$OTHER_V2_FAILED" ] || [ ! -z "$STREAMING_V2_FAILED" ]; then
[ ! -z "$CLEAN_V2_FAILED" ] || [ ! -z "$OTHER_V2_FAILED" ] || [ ! -z "$STREAMING_V2_FAILED" ] || \
[ ! -z "$CLEAN_RTFX_FAILED" ] || [ ! -z "$CLEAN_V2_RTFX_FAILED" ] || \
[ ! -z "$OTHER_RTFX_FAILED" ] || [ ! -z "$OTHER_V2_RTFX_FAILED" ] || \
[ ! -z "$STREAMING_RTFX_FAILED" ] || [ ! -z "$STREAMING_V2_RTFX_FAILED" ]; then
echo "BENCHMARK_STATUS=PARTIAL_FAILURE" >> $GITHUB_OUTPUT
echo "⚠️ Some benchmarks failed:"
[ ! -z "$CLEAN_FAILED" ] && echo " - test-clean benchmark failed"
Expand All @@ -209,14 +238,20 @@ jobs:
[ ! -z "$CLEAN_V2_FAILED" ] && echo " - test-clean (v2) benchmark failed"
[ ! -z "$OTHER_V2_FAILED" ] && echo " - test-other (v2) benchmark failed"
[ ! -z "$STREAMING_V2_FAILED" ] && echo " - streaming (v2) benchmark failed"
# Don't exit with error to allow PR comment to be posted
[ ! -z "$CLEAN_RTFX_FAILED" ] && echo " - test-clean RTFx is 0"
[ ! -z "$CLEAN_V2_RTFX_FAILED" ] && echo " - test-clean (v2) RTFx is 0"
[ ! -z "$OTHER_RTFX_FAILED" ] && echo " - test-other RTFx is 0"
[ ! -z "$OTHER_V2_RTFX_FAILED" ] && echo " - test-other (v2) RTFx is 0"
[ ! -z "$STREAMING_RTFX_FAILED" ] && echo " - streaming RTFx is 0"
[ ! -z "$STREAMING_V2_RTFX_FAILED" ] && echo " - streaming (v2) RTFx is 0"
exit 1
else
echo "BENCHMARK_STATUS=SUCCESS" >> $GITHUB_OUTPUT
echo "✅ All benchmarks completed successfully"
fi

- name: Comment PR
if: github.event_name == 'pull_request'
if: always() && github.event_name == 'pull_request'
continue-on-error: true
uses: actions/github-script@v7
with:
Expand Down
7 changes: 7 additions & 0 deletions .github/workflows/diarizer-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,13 @@ jobs:
echo "CLUSTERING_TIME=${CLUSTERING_TIME}" >> $GITHUB_OUTPUT
echo "INFERENCE_TIME=${INFERENCE_TIME}" >> $GITHUB_OUTPUT

# Validate RTFx - 0 indicates benchmark failure.
# Classify the text rather than comparing against the single literal "0",
# so "0.0", "0.00", "N/A" and "null" are rejected as well, in line with
# the RTFx checks in the other benchmark workflows.
case "$RTF" in
  '' | 'N/A' | 'null')
    RTF_INVALID=1 # never extracted, or a placeholder
    ;;
  *[!0.]*)
    RTF_INVALID= # contains a character other than '0' / '.', so not zero
    ;;
  *)
    RTF_INVALID=1 # only zeros and dots: some spelling of zero
    ;;
esac
if [ -n "$RTF_INVALID" ]; then
  echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
  echo "RTFx value: $RTF"
  exit 1
fi

- name: Comment PR with Benchmark Results
if: always()
uses: actions/github-script@v7
Expand Down
9 changes: 8 additions & 1 deletion .github/workflows/parakeet-eou-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,15 @@ jobs:
echo "MAX_FILES=$MAX_FILES" >> $GITHUB_OUTPUT
echo "BENCHMARK_STATUS=$BENCHMARK_STATUS" >> $GITHUB_OUTPUT

# Validate RTFx - 0 or N/A indicates benchmark failure
# Abort the step when the metric is empty, the N/A placeholder, or one of
# the zero spellings listed below.
case "$RTFx" in
  "0.00" | "0.0" | "0" | "N/A" | "")
    echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed"
    echo "RTFx value: $RTFx"
    exit 1
    ;;
esac

- name: Comment PR
if: github.event_name == 'pull_request'
if: always() && github.event_name == 'pull_request'
continue-on-error: true
uses: actions/github-script@v7
with:
Expand Down
37 changes: 36 additions & 1 deletion .github/workflows/qwen3-asr-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -69,17 +69,46 @@ jobs:
echo "SMOKE_STATUS=FAILED" >> $GITHUB_OUTPUT
fi

# Calculate execution time before validation (needed for PR comment).
# Read the clock once so the minutes and seconds parts come from the same
# instant; two separate `date` calls can straddle a second boundary.
ELAPSED_SECONDS=$(( $(date +%s) - BENCHMARK_START ))
EXECUTION_TIME="$(( ELAPSED_SECONDS / 60 ))m$(( ELAPSED_SECONDS % 60 ))s"
echo "EXECUTION_TIME=$EXECUTION_TIME" >> "$GITHUB_OUTPUT"

# Extract RTFx metrics if results file exists
if [ -f qwen3_results_int8.json ]; then
  # If jq fails (e.g. truncated JSON) fall back to N/A instead of letting an
  # errexit shell abort the step before the outputs below are written.
  MEDIAN_RTFx=$(jq -r '.summary.medianRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null) || MEDIAN_RTFx="N/A"
  OVERALL_RTFx=$(jq -r '.summary.overallRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null) || OVERALL_RTFx="N/A"

  # Normalize to two decimals; empty, null, or unparseable values become N/A.
  if [ -z "$MEDIAN_RTFx" ] || [ "$MEDIAN_RTFx" = "null" ] || [ "$MEDIAN_RTFx" = "N/A" ]; then
    MEDIAN_RTFx="N/A"
  else
    MEDIAN_RTFx=$(printf "%.2f" "$MEDIAN_RTFx" 2>/dev/null) || MEDIAN_RTFx="N/A"
  fi
  if [ -z "$OVERALL_RTFx" ] || [ "$OVERALL_RTFx" = "null" ] || [ "$OVERALL_RTFx" = "N/A" ]; then
    OVERALL_RTFx="N/A"
  else
    OVERALL_RTFx=$(printf "%.2f" "$OVERALL_RTFx" 2>/dev/null) || OVERALL_RTFx="N/A"
  fi

  echo "MEDIAN_RTFx=$MEDIAN_RTFx" >> "$GITHUB_OUTPUT"
  echo "OVERALL_RTFx=$OVERALL_RTFx" >> "$GITHUB_OUTPUT"

  # Validate RTFx - fail if 0 or N/A (indicates benchmark failure)
  for RTFX_VALUE in "$MEDIAN_RTFx" "$OVERALL_RTFx"; do
    case "$RTFX_VALUE" in
      "N/A" | "0.00" | "0" | "0.0")
        echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed to produce valid results"
        echo "Median RTFx: $MEDIAN_RTFx"
        echo "Overall RTFx: $OVERALL_RTFx"
        exit 1
        ;;
    esac
  done
else
  echo "❌ CRITICAL: Results file not found - benchmark failed"
  echo "MEDIAN_RTFx=N/A" >> "$GITHUB_OUTPUT"
  echo "OVERALL_RTFx=N/A" >> "$GITHUB_OUTPUT"
  exit 1
fi

- name: Comment PR
if: github.event_name == 'pull_request'
if: always() && github.event_name == 'pull_request'
continue-on-error: true
uses: actions/github-script@v7
with:
script: |
const status = '${{ steps.smoketest.outputs.SMOKE_STATUS }}';
const emoji = status === 'PASSED' ? '✅' : '❌';
const medianRTFx = '${{ steps.smoketest.outputs.MEDIAN_RTFx }}';
const overallRTFx = '${{ steps.smoketest.outputs.OVERALL_RTFx }}';

const body = `## Qwen3-ASR int8 Smoke Test ${emoji}

Expand All @@ -91,6 +120,12 @@ jobs:
| Transcription pipeline | ${emoji} |
| Decoder size | 571 MB (vs 1.1 GB f32) |

### Performance Metrics
| Metric | CI Value | Expected on Apple Silicon |
|--------|----------|--------------------------|
| Median RTFx | ${medianRTFx}x | ~2.5x |
| Overall RTFx | ${overallRTFx}x | ~2.5x |

<sub>Runtime: ${{ steps.smoketest.outputs.EXECUTION_TIME }}</sub>

<sub>**Note:** CI VM lacks physical GPU — CoreML MLState (macOS 15) KV cache produces degraded results on virtualized runners. On Apple Silicon: ~1.3% WER / 2.5x RTFx.</sub>
Expand Down
7 changes: 7 additions & 0 deletions .github/workflows/sortformer-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,13 @@ jobs:
echo "DETECTED=${DETECTED}" >> $GITHUB_OUTPUT
echo "GROUND_TRUTH=${GROUND_TRUTH}" >> $GITHUB_OUTPUT

# Validate RTFx - 0 indicates benchmark failure.
# Classify the text rather than comparing against the single literal "0",
# so "0.0", "0.00", "N/A" and "null" are rejected as well, in line with
# the RTFx checks in the other benchmark workflows.
case "$RTF" in
  '' | 'N/A' | 'null')
    RTF_INVALID=1 # never extracted, or a placeholder
    ;;
  *[!0.]*)
    RTF_INVALID= # contains a character other than '0' / '.', so not zero
    ;;
  *)
    RTF_INVALID=1 # only zeros and dots: some spelling of zero
    ;;
esac
if [ -n "$RTF_INVALID" ]; then
  echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed"
  echo "RTFx value: $RTF"
  exit 1
fi

- name: Comment PR with Benchmark Results
if: always()
uses: actions/github-script@v7
Expand Down
26 changes: 26 additions & 0 deletions .github/workflows/vad-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,32 @@ jobs:
--threshold 0.5 \
--output voices_vad_results.json

- name: Validate RTFx metrics
run: |
# rtfx_is_invalid VALUE
#   Succeeds (status 0) when VALUE is empty, a "N/A"/"null" placeholder, or
#   any spelling of zero ("0", "0.0", "0.00", ...); fails (status 1) otherwise.
#   jq -r may print a zero as "0" or "0.0", so a single literal compare is
#   not enough.
rtfx_is_invalid() {
  case "$1" in
    '' | 'N/A' | 'null') return 0 ;;
    *[!0.]*) return 1 ;;
    *) return 0 ;;
  esac
}

# Validate MUSAN RTFx
if [ -f musan_vad_results.json ]; then
  # A jq failure (unparseable JSON) is treated as an empty reading so the
  # step fails with the explicit message below.
  MUSAN_RTFx=$(jq -r '.rtfx // 0' musan_vad_results.json) || MUSAN_RTFx=""
  if rtfx_is_invalid "$MUSAN_RTFx"; then
    echo "❌ CRITICAL: MUSAN RTFx is 0 or empty - benchmark failed"
    exit 1
  fi
else
  echo "❌ CRITICAL: musan_vad_results.json not found"
  exit 1
fi

# Validate VOiCES RTFx
if [ -f voices_vad_results.json ]; then
  VOICES_RTFx=$(jq -r '.rtfx // 0' voices_vad_results.json) || VOICES_RTFx=""
  if rtfx_is_invalid "$VOICES_RTFx"; then
    echo "❌ CRITICAL: VOiCES RTFx is 0 or empty - benchmark failed"
    exit 1
  fi
else
  echo "❌ CRITICAL: voices_vad_results.json not found"
  exit 1
fi

- name: Upload results
if: always()
uses: actions/upload-artifact@v4
Expand Down
Loading