From 05ac22429a3c21ba94b523ff280d45069d2351e3 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 14:19:11 -0400 Subject: [PATCH 1/5] Simplify folderName logic by stripping -coreml suffix by default Remove redundant special cases in folderName property. Now only keeps special cases for nested directory structures (EOU and Nemotron variants) and uses a simple default rule: strip "-coreml" suffix from the name. This eliminates the inconsistency raised in #442 by applying a consistent pattern across all models. Before: - Had 10+ special cases explicitly returning shortened names - parakeetTdtCtc110m was inconsistent with other Parakeet models After: - Only 5 special cases for nested directories (parakeet-eou-streaming/*, nemotron-streaming/*) - Default strips -coreml suffix for all other models - All Parakeet models now follow the same pattern Fixes #442 --- Sources/FluidAudio/ModelNames.swift | 14 +------------- .../ASR/Parakeet/ModelNamesTests.swift | 2 +- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 1d4d7e9fc..d5efec205 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -115,8 +115,6 @@ public enum Repo: String, CaseIterable { /// Local folder name used for caching public var folderName: String { switch self { - case .kokoro: - return "kokoro" case .parakeetEou160: return "parakeet-eou-streaming/160ms" case .parakeetEou320: @@ -127,18 +125,8 @@ public enum Repo: String, CaseIterable { return "nemotron-streaming/1120ms" case .nemotronStreaming560: return "nemotron-streaming/560ms" - case .sortformer: - return "sortformer" - case .lseend: - return "ls-eend" - case .pocketTts: - return "pocket-tts" - case .multilingualG2p: - return "charsiu-g2p-byt5" - case .parakeetTdtCtc110m: - return "parakeet-tdt-ctc-110m" default: - return name + return name.replacingOccurrences(of: "-coreml", with: "") } } } diff --git a/Tests/FluidAudioTests/ASR/Parakeet/ModelNamesTests.swift b/Tests/FluidAudioTests/ASR/Parakeet/ModelNamesTests.swift index 3e3607394..6048ff95c 100644 --- a/Tests/FluidAudioTests/ASR/Parakeet/ModelNamesTests.swift +++ b/Tests/FluidAudioTests/ASR/Parakeet/ModelNamesTests.swift @@ -125,7 +125,7 @@ final class ModelNamesTests: XCTestCase { // Verify name (repo slug with -coreml suffix) XCTAssertEqual(repo.name, "parakeet-tdt-ctc-110m-coreml") - // Verify folder name (simplified local folder name) + // Verify folder name (simplified - strips -coreml suffix by default) XCTAssertEqual(repo.folderName, "parakeet-tdt-ctc-110m") // Should have no subpath (not a variant repo) From c3aba6a96c7fa63fbb1fee75e8b78de312525f99 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 14:24:11 -0400 Subject: [PATCH 2/5] Update kokoro folderName test expectation to match simplified logic --- Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift b/Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift index 8a059833c..1f8eb3274 100644 --- a/Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift +++ b/Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift @@ -111,7 +111,7 @@ final class MultilingualG2PTests: XCTestCase { func testRepoMultilingualG2P() { // Multilingual G2P models are bundled inside the kokoro repo - XCTAssertEqual(Repo.kokoro.folderName, "kokoro") + XCTAssertEqual(Repo.kokoro.folderName, "kokoro-82m") XCTAssertEqual(Repo.kokoro.remotePath, "FluidInference/kokoro-82m-coreml") } } From 8b79517cde879e4f9c23c910895461343f661971 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 14:29:29 -0400 Subject: [PATCH 3/5] Keep kokoro and sortformer special cases to avoid breaking changes Add back special cases for kokoro and sortformer to preserve existing folder names and avoid forcing users to re-download models. Still removes redundant special cases (lseend, pocketTts, multilingualG2p, parakeetTdtCtc110m) that can safely use the default -coreml stripping logic. Result: 7 special cases total (kokoro, sortformer, + 5 nested directories) vs 11 special cases before. Still achieves consistency for Parakeet models without breaking existing cached model locations. --- Sources/FluidAudio/ModelNames.swift | 4 ++++ Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index d5efec205..afc7d6af3 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -115,6 +115,8 @@ public enum Repo: String, CaseIterable { /// Local folder name used for caching public var folderName: String { switch self { + case .kokoro: + return "kokoro" case .parakeetEou160: return "parakeet-eou-streaming/160ms" case .parakeetEou320: @@ -125,6 +127,8 @@ public enum Repo: String, CaseIterable { return "nemotron-streaming/1120ms" case .nemotronStreaming560: return "nemotron-streaming/560ms" + case .sortformer: + return "sortformer" default: return name.replacingOccurrences(of: "-coreml", with: "") } diff --git a/Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift b/Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift index 1f8eb3274..8a059833c 100644 --- a/Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift +++ b/Tests/FluidAudioTests/TTS/MultilingualG2PTests.swift @@ -111,7 +111,7 @@ final class MultilingualG2PTests: XCTestCase { func testRepoMultilingualG2P() { // Multilingual G2P models are bundled inside the kokoro repo - XCTAssertEqual(Repo.kokoro.folderName, "kokoro-82m") + XCTAssertEqual(Repo.kokoro.folderName, "kokoro") XCTAssertEqual(Repo.kokoro.remotePath, "FluidInference/kokoro-82m-coreml") } } From 31066deeec3a310e5ed47678376e8274ea434a46 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 14:37:46 -0400 Subject: [PATCH 4/5] Add RTFx tracking to qwen3-asr-benchmark workflow Extract and display medianRTFx and overallRTFx metrics in the PR comment. Previously the workflow was running benchmarks but not displaying the performance metrics. Co-Authored-By: Claude Sonnet 4.5 --- .github/workflows/qwen3-asr-benchmark.yml | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.github/workflows/qwen3-asr-benchmark.yml b/.github/workflows/qwen3-asr-benchmark.yml index 074bede82..959f77ffa 100644 --- a/.github/workflows/qwen3-asr-benchmark.yml +++ b/.github/workflows/qwen3-asr-benchmark.yml @@ -69,6 +69,21 @@ jobs: echo "SMOKE_STATUS=FAILED" >> $GITHUB_OUTPUT fi + # Extract RTFx metrics if results file exists + if [ -f qwen3_results_int8.json ]; then + MEDIAN_RTFx=$(jq -r '.summary.medianRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null) + OVERALL_RTFx=$(jq -r '.summary.overallRTFx // "N/A"' qwen3_results_int8.json 2>/dev/null) + + [ "$MEDIAN_RTFx" != "null" ] && [ "$MEDIAN_RTFx" != "N/A" ] && MEDIAN_RTFx=$(printf "%.2f" "$MEDIAN_RTFx") || MEDIAN_RTFx="N/A" + [ "$OVERALL_RTFx" != "null" ] && [ "$OVERALL_RTFx" != "N/A" ] && OVERALL_RTFx=$(printf "%.2f" "$OVERALL_RTFx") || OVERALL_RTFx="N/A" + + echo "MEDIAN_RTFx=$MEDIAN_RTFx" >> $GITHUB_OUTPUT + echo "OVERALL_RTFx=$OVERALL_RTFx" >> $GITHUB_OUTPUT + else + echo "MEDIAN_RTFx=N/A" >> $GITHUB_OUTPUT + echo "OVERALL_RTFx=N/A" >> $GITHUB_OUTPUT + fi + EXECUTION_TIME=$(( ($(date +%s) - BENCHMARK_START) / 60 ))m$(( ($(date +%s) - BENCHMARK_START) % 60 ))s echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT @@ -81,6 +96,9 @@ jobs: const status = '${{ steps.smoketest.outputs.SMOKE_STATUS }}'; const emoji = status === 'PASSED' ? '✅' : '❌'; + const medianRTFx = '${{ steps.smoketest.outputs.MEDIAN_RTFx }}'; + const overallRTFx = '${{ steps.smoketest.outputs.OVERALL_RTFx }}'; + const body = `## Qwen3-ASR int8 Smoke Test ${emoji} | Check | Result | @@ -91,6 +109,12 @@ jobs: | Transcription pipeline | ${emoji} | | Decoder size | 571 MB (vs 1.1 GB f32) | + ### Performance Metrics + | Metric | CI Value | Expected on Apple Silicon | + |--------|----------|--------------------------| + | Median RTFx | ${medianRTFx}x | ~2.5x | + | Overall RTFx | ${overallRTFx}x | ~2.5x | + Runtime: ${{ steps.smoketest.outputs.EXECUTION_TIME }} **Note:** CI VM lacks physical GPU — CoreML MLState (macOS 15) KV cache produces degraded results on virtualized runners. On Apple Silicon: ~1.3% WER / 2.5x RTFx. From 8ea07ff5bb26a26c4e5fe56dd418cf5bdcb79b02 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 14:42:18 -0400 Subject: [PATCH 5/5] Fail benchmark workflows if RTFx is 0 Add validation to all benchmark workflows to fail with exit 1 if RTFx metrics are 0 or N/A, indicating a silent benchmark failure. Changes: - qwen3-asr-benchmark.yml: Validate medianRTFx and overallRTFx - asr-benchmark.yml: Validate all 6 RTFx metrics (v2/v3, clean/other, streaming) - diarizer-benchmark.yml: Validate RTFx - parakeet-eou-benchmark.yml: Validate RTFx - sortformer-benchmark.yml: Validate RTFx - vad-benchmark.yml: Validate MUSAN and VOiCES RTFx If RTFx is 0, it means either: 1. Benchmark didn't run properly 2. Audio duration was 0 3. Processing failed silently Better to fail fast than report misleading metrics. Co-Authored-By: Claude Sonnet 4.5 --- .github/workflows/asr-benchmark.yml | 39 +++++++++++++++++++- .github/workflows/diarizer-benchmark.yml | 7 ++++ .github/workflows/parakeet-eou-benchmark.yml | 7 ++++ .github/workflows/qwen3-asr-benchmark.yml | 10 +++++ .github/workflows/sortformer-benchmark.yml | 7 ++++ .github/workflows/vad-benchmark.yml | 26 +++++++++++++ 6 files changed, 94 insertions(+), 2 deletions(-) diff --git a/.github/workflows/asr-benchmark.yml b/.github/workflows/asr-benchmark.yml index 302577ffb..6702ee1b3 100644 --- a/.github/workflows/asr-benchmark.yml +++ b/.github/workflows/asr-benchmark.yml @@ -198,9 +198,38 @@ jobs: echo "EXECUTION_TIME=$EXECUTION_TIME" >> $GITHUB_OUTPUT echo "FILES_COUNT=$MAX_FILES" >> $GITHUB_OUTPUT + # Validate RTFx values - 0 indicates benchmark failure + if [ "$CLEAN_RTFx" = "0.00" ] || [ "$CLEAN_RTFx" = "N/A" ]; then + echo "⚠️ test-clean RTFx is 0 or N/A - benchmark may have failed" + CLEAN_RTFX_FAILED=1 + fi + if [ "$CLEAN_V2_RTFx" = "0.00" ] || [ "$CLEAN_V2_RTFx" = "N/A" ]; then + echo "⚠️ test-clean (v2) RTFx is 0 or N/A - benchmark may have failed" + CLEAN_V2_RTFX_FAILED=1 + fi + if [ "$OTHER_RTFx" = "0.00" ] || [ "$OTHER_RTFx" = "N/A" ]; then + echo "⚠️ test-other RTFx is 0 or N/A - benchmark may have failed" + OTHER_RTFX_FAILED=1 + fi + if [ "$OTHER_V2_RTFx" = "0.00" ] || [ "$OTHER_V2_RTFx" = "N/A" ]; then + echo "⚠️ test-other (v2) RTFx is 0 or N/A - benchmark may have failed" + OTHER_V2_RTFX_FAILED=1 + fi + if [ "$STREAMING_RTFx" = "0.00" ] || [ "$STREAMING_RTFx" = "N/A" ]; then + echo "⚠️ streaming RTFx is 0 or N/A - benchmark may have failed" + STREAMING_RTFX_FAILED=1 + fi + if [ "$STREAMING_V2_RTFx" = "0.00" ] || [ "$STREAMING_V2_RTFx" = "N/A" ]; then + echo "⚠️ streaming (v2) RTFx is 0 or N/A - benchmark may have failed" + STREAMING_V2_RTFX_FAILED=1 + fi + # Report failures summary if [ ! -z "$CLEAN_FAILED" ] || [ ! -z "$OTHER_FAILED" ] || [ ! -z "$STREAMING_FAILED" ] || \ - [ ! -z "$CLEAN_V2_FAILED" ] || [ ! -z "$OTHER_V2_FAILED" ] || [ ! -z "$STREAMING_V2_FAILED" ]; then + [ ! -z "$CLEAN_V2_FAILED" ] || [ ! -z "$OTHER_V2_FAILED" ] || [ ! -z "$STREAMING_V2_FAILED" ] || \ + [ ! -z "$CLEAN_RTFX_FAILED" ] || [ ! -z "$CLEAN_V2_RTFX_FAILED" ] || \ + [ ! -z "$OTHER_RTFX_FAILED" ] || [ ! -z "$OTHER_V2_RTFX_FAILED" ] || \ + [ ! -z "$STREAMING_RTFX_FAILED" ] || [ ! -z "$STREAMING_V2_RTFX_FAILED" ]; then echo "BENCHMARK_STATUS=PARTIAL_FAILURE" >> $GITHUB_OUTPUT echo "⚠️ Some benchmarks failed:" [ ! -z "$CLEAN_FAILED" ] && echo " - test-clean benchmark failed" @@ -209,7 +238,13 @@ jobs: [ ! -z "$CLEAN_V2_FAILED" ] && echo " - test-clean (v2) benchmark failed" [ ! -z "$OTHER_V2_FAILED" ] && echo " - test-other (v2) benchmark failed" [ ! -z "$STREAMING_V2_FAILED" ] && echo " - streaming (v2) benchmark failed" - # Don't exit with error to allow PR comment to be posted + [ ! -z "$CLEAN_RTFX_FAILED" ] && echo " - test-clean RTFx is 0" + [ ! -z "$CLEAN_V2_RTFX_FAILED" ] && echo " - test-clean (v2) RTFx is 0" + [ ! -z "$OTHER_RTFX_FAILED" ] && echo " - test-other RTFx is 0" + [ ! -z "$OTHER_V2_RTFX_FAILED" ] && echo " - test-other (v2) RTFx is 0" + [ ! -z "$STREAMING_RTFX_FAILED" ] && echo " - streaming RTFx is 0" + [ ! -z "$STREAMING_V2_RTFX_FAILED" ] && echo " - streaming (v2) RTFx is 0" + exit 1 else echo "BENCHMARK_STATUS=SUCCESS" >> $GITHUB_OUTPUT echo "✅ All benchmarks completed successfully" diff --git a/.github/workflows/diarizer-benchmark.yml b/.github/workflows/diarizer-benchmark.yml index 43251f671..0775ad844 100644 --- a/.github/workflows/diarizer-benchmark.yml +++ b/.github/workflows/diarizer-benchmark.yml @@ -115,6 +115,13 @@ jobs: echo "CLUSTERING_TIME=${CLUSTERING_TIME}" >> $GITHUB_OUTPUT echo "INFERENCE_TIME=${INFERENCE_TIME}" >> $GITHUB_OUTPUT + # Validate RTFx - 0 indicates benchmark failure + if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then + echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed" + echo "RTFx value: $RTF" + exit 1 + fi + - name: Comment PR with Benchmark Results if: always() uses: actions/github-script@v7 diff --git a/.github/workflows/parakeet-eou-benchmark.yml b/.github/workflows/parakeet-eou-benchmark.yml index bef59e306..62f45b556 100644 --- a/.github/workflows/parakeet-eou-benchmark.yml +++ b/.github/workflows/parakeet-eou-benchmark.yml @@ -104,6 +104,13 @@ jobs: echo "MAX_FILES=$MAX_FILES" >> $GITHUB_OUTPUT echo "BENCHMARK_STATUS=$BENCHMARK_STATUS" >> $GITHUB_OUTPUT + # Validate RTFx - 0 or N/A indicates benchmark failure + if [ "$RTFx" = "0.00" ] || [ "$RTFx" = "N/A" ]; then + echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed" + echo "RTFx value: $RTFx" + exit 1 + fi + - name: Comment PR if: github.event_name == 'pull_request' continue-on-error: true diff --git a/.github/workflows/qwen3-asr-benchmark.yml b/.github/workflows/qwen3-asr-benchmark.yml index 959f77ffa..ef62bd7a2 100644 --- a/.github/workflows/qwen3-asr-benchmark.yml +++ b/.github/workflows/qwen3-asr-benchmark.yml @@ -79,9 +79,19 @@ jobs: echo "MEDIAN_RTFx=$MEDIAN_RTFx" >> $GITHUB_OUTPUT echo "OVERALL_RTFx=$OVERALL_RTFx" >> $GITHUB_OUTPUT + + # Fail if RTFx is 0 or N/A - indicates benchmark failure + if [ "$MEDIAN_RTFx" = "N/A" ] || [ "$MEDIAN_RTFx" = "0.00" ] || [ "$OVERALL_RTFx" = "N/A" ] || [ "$OVERALL_RTFx" = "0.00" ]; then + echo "❌ CRITICAL: RTFx is 0 or N/A - benchmark failed to produce valid results" + echo "Median RTFx: $MEDIAN_RTFx" + echo "Overall RTFx: $OVERALL_RTFx" + exit 1 + fi else + echo "❌ CRITICAL: Results file not found - benchmark failed" echo "MEDIAN_RTFx=N/A" >> $GITHUB_OUTPUT echo "OVERALL_RTFx=N/A" >> $GITHUB_OUTPUT + exit 1 fi EXECUTION_TIME=$(( ($(date +%s) - BENCHMARK_START) / 60 ))m$(( ($(date +%s) - BENCHMARK_START) % 60 ))s diff --git a/.github/workflows/sortformer-benchmark.yml b/.github/workflows/sortformer-benchmark.yml index 2f9edd701..a3e04d662 100644 --- a/.github/workflows/sortformer-benchmark.yml +++ b/.github/workflows/sortformer-benchmark.yml @@ -115,6 +115,13 @@ jobs: echo "DETECTED=${DETECTED}" >> $GITHUB_OUTPUT echo "GROUND_TRUTH=${GROUND_TRUTH}" >> $GITHUB_OUTPUT + # Validate RTFx - 0 indicates benchmark failure + if [ "$RTF" = "0" ] || [ -z "$RTF" ]; then + echo "❌ CRITICAL: RTFx is 0 or empty - benchmark failed" + echo "RTFx value: $RTF" + exit 1 + fi + - name: Comment PR with Benchmark Results if: always() uses: actions/github-script@v7 diff --git a/.github/workflows/vad-benchmark.yml b/.github/workflows/vad-benchmark.yml index 3a75a0e60..22806c1fc 100644 --- a/.github/workflows/vad-benchmark.yml +++ b/.github/workflows/vad-benchmark.yml @@ -74,6 +74,32 @@ jobs: --threshold 0.5 \ --output voices_vad_results.json + - name: Validate RTFx metrics + run: | + # Validate MUSAN RTFx + if [ -f musan_vad_results.json ]; then + MUSAN_RTFx=$(jq -r '.rtfx // 0' musan_vad_results.json) + if [ "$MUSAN_RTFx" = "0" ] || [ -z "$MUSAN_RTFx" ]; then + echo "❌ CRITICAL: MUSAN RTFx is 0 or empty - benchmark failed" + exit 1 + fi + else + echo "❌ CRITICAL: musan_vad_results.json not found" + exit 1 + fi + + # Validate VOiCES RTFx + if [ -f voices_vad_results.json ]; then + VOICES_RTFx=$(jq -r '.rtfx // 0' voices_vad_results.json) + if [ "$VOICES_RTFx" = "0" ] || [ -z "$VOICES_RTFx" ]; then + echo "❌ CRITICAL: VOiCES RTFx is 0 or empty - benchmark failed" + exit 1 + fi + else + echo "❌ CRITICAL: voices_vad_results.json not found" + exit 1 + fi + - name: Upload results if: always() uses: actions/upload-artifact@v4