From 2f8a6d5e3f11cccdf471cd48d2830b6eaa558fab Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 13:33:46 -0400 Subject: [PATCH 01/12] Add standalone CTC head inference for custom vocabulary (#435) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Export the CTC decoder head (512→1025 linear projection) as a separate 1MB CoreML model instead of requiring the full 97.5MB CTC encoder. The CtcHead model runs on the existing TDT encoder output, achieving 99.4% Dict Recall at 70.29x RTFx on the earnings benchmark (772 files). - Load optional CtcHead.mlmodelc from model directory in AsrModels - Run CTC head on raw encoder output in AsrTranscription - Add spotKeywordsFromLogProbs() for DP on pre-computed log-probs - Add applyLogSoftmax() for raw logits→log-probs conversion - Expose cached CTC logits via AsrManager for VocabularyRescorer - Update CtcEarningsBenchmark to use standalone CTC head path --- Documentation/ASR/benchmarks100.md | 39 ++++++ .../FluidAudio/ASR/Parakeet/AsrManager.swift | 28 ++++ .../FluidAudio/ASR/Parakeet/AsrModels.swift | 20 +++ .../ASR/Parakeet/AsrTranscription.swift | 97 +++++++++++-- .../WordSpotting/CtcKeywordSpotter.swift | 127 ++++++++++++++++++ Sources/FluidAudio/ModelNames.swift | 2 + .../SlidingWindow/CtcEarningsBenchmark.swift | 74 +++++++--- 7 files changed, 357 insertions(+), 30 deletions(-) diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index be3436ac2..ae806836b 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -41,3 +41,42 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru ## Verdict **No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes. + +--- + +# Issue #435: Unified CTC Head Export (Full Dataset) + +Benchmark comparing separate CTC encoder vs unified CTC head exported from TDT-CTC-110M Preprocessor. +See [#435](https://github.com/FluidInference/FluidAudio/issues/435). + +## Environment + +- **Hardware**: MacBook Air M2, 16 GB +- **Build**: `swift build -c release` +- **Date**: 2026-03-28 +- **Branch**: `ctc-head-export` + +## CTC Earnings (Earnings22-KWS, 772 files) + +| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Unified CTC (110m TDT) | +|---|---|---|---| +| WER | 14.67% | 16.08% | 16.88% | +| Dict Recall | 99.3% | 99.4% | 99.4% | +| Vocab Precision | 99.8% | 99.7% | 99.6% | +| Vocab Recall | 73.7% | 70.0% | 59.6% | +| Vocab F-score | 84.8% | 82.0% | 74.6% | +| RTFx | 43.94x | 25.98x | **48.35x** | + +## Analysis + +- **Dict Recall**: Identical at 99.4% between separate and unified 110m paths. The unified CTC head produces equivalent keyword detection quality. +- **RTFx**: **48.35x** (unified) vs **25.98x** (separate 110m) = **86% speedup**. Eliminating the separate CTC encoder run nearly doubles throughput. +- **WER**: Slight increase (16.08% → 16.88%) because the unified CTC head's logits have different characteristics than the separately-trained CTC model, affecting vocabulary rescoring decisions. +- **Vocab Recall**: Lower (70.0% → 59.6%) for the same reason — the CTC head's logit distribution differs from the standalone CTC model, leading to fewer vocabulary replacements being applied. This is a rescoring tuning issue, not a detection issue. + +## Key Takeaways + +1. **Unified model eliminates separate CTC encoder** — single Preprocessor outputs both TDT encoder features and CTC logits +2. **Memory reduction**: ~40MB saved by removing duplicate encoder weights +3. **Dict Recall preserved**: Keyword detection quality is identical +4. **RTFx nearly doubled**: No second encoder pass needed for custom vocabulary workloads diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift index 503a494b3..a61e335ec 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift @@ -53,6 +53,34 @@ public actor AsrManager { internal var vocabSizeConfig: ContextBiasingConstants.VocabSizeConfig? internal var vocabBoostingEnabled: Bool { customVocabulary != nil && vocabularyRescorer != nil } + // Cached CTC logits from fused Preprocessor (unified custom vocabulary) + internal var cachedCtcLogits: MLMultiArray? + internal var cachedCtcFrameDuration: Double? + + /// Whether the Preprocessor outputs CTC logits (unified custom vocabulary model). + public var hasCachedCtcLogits: Bool { cachedCtcLogits != nil } + + /// Get cached CTC logits as [[Float]] for external use (e.g. benchmarks). + /// Returns nil if the Preprocessor doesn't output CTC logits. + public func getCachedCtcLogProbs() -> (logProbs: [[Float]], frameDuration: Double)? { + guard let logits = cachedCtcLogits, let duration = cachedCtcFrameDuration else { return nil } + let shape = logits.shape + guard shape.count == 3 else { return nil } + let numFrames = shape[1].intValue + let vocabSize = shape[2].intValue + var result: [[Float]] = [] + result.reserveCapacity(numFrames) + for t in 0.. ASRResult { - guard let spotter = ctcSpotter, - let rescorer = vocabularyRescorer, + guard let rescorer = vocabularyRescorer, let vocab = customVocabulary, let tokenTimings = result.tokenTimings, !tokenTimings.isEmpty else { @@ -549,13 +574,30 @@ extension AsrManager { } do { - let spotResult = try await spotter.spotKeywordsWithLogProbs( - audioSamples: audioSamples, - customVocabulary: vocab, - minScore: nil - ) + // Try to use cached CTC logits from unified Preprocessor first + let logProbs: [[Float]] + let frameDuration: Double + + if let cached = cachedCtcLogits, let duration = cachedCtcFrameDuration { + // Convert MLMultiArray to [[Float]] + logProbs = convertCtcLogitsToArray(cached) + frameDuration = duration + logger.debug("Using cached CTC logits from Preprocessor (unified model)") + } else if let spotter = ctcSpotter { + // Fallback: run separate CTC encoder + let spotResult = try await spotter.spotKeywordsWithLogProbs( + audioSamples: audioSamples, + customVocabulary: vocab, + minScore: nil + ) + logProbs = spotResult.logProbs + frameDuration = spotResult.frameDuration + logger.debug("Using separate CTC encoder (legacy dual-model approach)") + } else { + logger.warning("Vocabulary rescoring skipped: no CTC logits available") + return result + } - let logProbs = spotResult.logProbs guard !logProbs.isEmpty else { logger.debug("Vocabulary rescoring skipped: no log probs from CTC") return result @@ -570,7 +612,7 @@ extension AsrManager { transcript: result.text, tokenTimings: tokenTimings, logProbs: logProbs, - frameDuration: spotResult.frameDuration, + frameDuration: frameDuration, cbw: vocabConfig.cbw, marginSeconds: 0.5, minSimilarity: effectiveMinSimilarity @@ -600,4 +642,41 @@ extension AsrManager { } } + /// Convert CTC logits MLMultiArray to log-probabilities [[Float]] for rescoring. + /// Applies log-softmax with temperature scaling and blank bias to match + /// the processing done in `CtcKeywordSpotter.computeLogProbs`. + private func convertCtcLogitsToArray(_ ctcLogits: MLMultiArray) -> [[Float]] { + // Expected shape: [1, T, V] where T = frames, V = vocab size + let shape = ctcLogits.shape + guard shape.count == 3 else { + logger.warning("Unexpected CTC logits shape: \(shape)") + return [] + } + + let numFrames = shape[1].intValue + let vocabSize = shape[2].intValue + + // Extract raw logits + var rawLogits: [[Float]] = [] + rawLogits.reserveCapacity(numFrames) + + for t in 0.. SpotKeywordsResult { + let totalFrames = logProbs.count + guard totalFrames > 0 else { + return SpotKeywordsResult(detections: [], logProbs: [], frameDuration: 0, totalFrames: 0) + } + + var results: [KeywordDetection] = [] + + for term in customVocabulary.terms { + guard term.text.count >= customVocabulary.minTermLength else { + if debugMode { + logger.debug( + " Skipping '\(term.text)': too short (\(term.text.count) < \(customVocabulary.minTermLength) chars)" + ) + } + continue + } + + let ids = term.ctcTokenIds ?? term.tokenIds + guard let ids, !ids.isEmpty else { continue } + + let tokenCount = ids.count + let adjustedThreshold: Float = + minScore.map { base in + let extraTokens = max(0, tokenCount - ContextBiasingConstants.baselineTokenCountForThreshold) + return base - Float(extraTokens) * ContextBiasingConstants.thresholdRelaxationPerToken + } ?? ContextBiasingConstants.defaultMinSpotterScore + + let multipleDetections = ctcWordSpotMultiple( + logProbs: logProbs, + keywordTokens: ids, + minScore: adjustedThreshold, + mergeOverlap: true + ) + + for (score, start, end) in multipleDetections { + let startTime = TimeInterval(start) * frameDuration + let endTime = TimeInterval(end) * frameDuration + + let detection = KeywordDetection( + term: term, + score: score, + totalFrames: totalFrames, + startFrame: start, + endFrame: end, + startTime: startTime, + endTime: endTime + ) + results.append(detection) + } + } + + return SpotKeywordsResult( + detections: results, + logProbs: logProbs, + frameDuration: frameDuration, + totalFrames: totalFrames + ) + } + + // MARK: - Log-Probability Conversion + + /// Convert raw CTC logits to log-probabilities with temperature scaling and blank bias. + /// Use this to post-process raw logits from a unified Preprocessor before passing to + /// `spotKeywordsFromLogProbs` or `VocabularyRescorer.ctcTokenRescore`. + /// + /// - Parameters: + /// - rawLogits: Raw CTC logits [T, V] (before softmax). + /// - blankId: Index of the blank token in the vocabulary. + /// - temperature: Temperature for softmax scaling (default from ContextBiasingConstants). + /// - blankBias: Penalty applied to blank token log-probability (default from ContextBiasingConstants). + /// - Returns: Log-probabilities [T, V] after log-softmax, temperature, and blank bias. + public static func applyLogSoftmax( + rawLogits: [[Float]], + blankId: Int, + temperature: Float = ContextBiasingConstants.ctcTemperature, + blankBias: Float = ContextBiasingConstants.blankBias + ) -> [[Float]] { + var logProbs = [[Float]]() + logProbs.reserveCapacity(rawLogits.count) + + for logits in rawLogits { + guard !logits.isEmpty else { + logProbs.append([]) + continue + } + + // Temperature scaling + let scaled = temperature != 1.0 ? logits.map { $0 / temperature } : logits + + // Log-softmax + let maxVal = scaled.max() ?? 0 + var sumExp: Float = 0 + for v in scaled { sumExp += expf(v - maxVal) } + let logSumExp = logf(sumExp) + + var row = [Float](repeating: 0, count: scaled.count) + for i in 0.. = [ preprocessorFile, diff --git a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift index ec2d3ec35..4bb5da505 100644 --- a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift +++ b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift @@ -90,8 +90,15 @@ public enum CtcEarningsBenchmark { } case "--tdt-version": if i + 1 < arguments.count { - if arguments[i + 1] == "v2" || arguments[i + 1] == "2" { + switch arguments[i + 1].lowercased() { + case "v2", "2": tdtVersion = .v2 + case "v3", "3": + tdtVersion = .v3 + case "110m", "ctc-110m", "tdt-ctc-110m": + tdtVersion = .tdtCtc110m + default: + break } i += 1 } @@ -144,7 +151,7 @@ public enum CtcEarningsBenchmark { print("Earnings Benchmark (TDT transcription + CTC keyword spotting)") print(" Data directory: \(dataDir ?? "not found")") print(" Output file: \(outputFile)") - print(" TDT version: \(tdtVersion == .v2 ? "v2" : "v3")") + print(" TDT version: \(tdtVersion == .v2 ? "v2" : tdtVersion == .tdtCtc110m ? "110m" : "v3")") print(" CTC variant: \(ctcVariant.displayName)") print(" CTC model: \(ctcModelPath ?? "not found")") print(" Keywords mode: \(keywordsMode.rawValue)") @@ -171,7 +178,7 @@ public enum CtcEarningsBenchmark { do { // Load TDT models for transcription - print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : "v3")) for transcription...") + print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : tdtVersion == .tdtCtc110m ? "110m" : "v3")) for transcription...") let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion) let asrManager = AsrManager(config: .default) try await asrManager.initialize(models: tdtModels) @@ -499,22 +506,29 @@ public enum CtcEarningsBenchmark { let customVocab = CustomVocabularyContext(terms: vocabTerms) // 3. CTC keyword spotting for high recall dictionary detection - let spotResult = try await spotter.spotKeywordsWithLogProbs( - audioSamples: samples, - customVocabulary: customVocab, - minScore: nil - ) - - // Debug: Show CTC detections with timestamps - if debugTimings && !spotResult.detections.isEmpty { - print(" CTC Detections:") - for detection in spotResult.detections { - print( - " [\(String(format: "%.2f", detection.startTime))-\(String(format: "%.2f", detection.endTime))s] \"\(detection.term.text)\" (score: \(String(format: "%.2f", detection.score)))" - ) - } + // Use cached CTC logits from unified Preprocessor if available (no separate encoder run needed) + let logProbs: [[Float]] + let frameDuration: Double + if let cached = await asrManager.getCachedCtcLogProbs() { + // Cached values are raw logits - apply log-softmax + temperature + blank bias + logProbs = CtcKeywordSpotter.applyLogSoftmax( + rawLogits: cached.logProbs, + blankId: spotter.blankId + ) + frameDuration = cached.frameDuration + } else { + let spotResult = try await spotter.spotKeywordsWithLogProbs( + audioSamples: samples, + customVocabulary: customVocab, + minScore: nil + ) + logProbs = spotResult.logProbs + frameDuration = spotResult.frameDuration } + // Debug: Show CTC detections with timestamps (only available with separate spotter path) + // When using cached CTC logits, detections are not available + // 4. Post-process: Use VocabularyRescorer with timestamp-based matching (NeMo CTC-WS) // Set USE_TIMESTAMP_RESCORING=1 to use timestamp-based matching (default) // Set USE_TIMESTAMP_RESCORING=0 to use legacy string-similarity based matching @@ -558,8 +572,8 @@ public enum CtcEarningsBenchmark { let rescoreResult = rescorer.ctcTokenRescore( transcript: tdtResult.text, tokenTimings: tokenTimings, - logProbs: spotResult.logProbs, - frameDuration: spotResult.frameDuration, + logProbs: logProbs, + frameDuration: frameDuration, cbw: cbw, marginSeconds: 0.5, minSimilarity: minSimilarity @@ -602,19 +616,37 @@ public enum CtcEarningsBenchmark { let checkWordsLowerSet = Set(checkWords.map { $0.lowercased() }) // 1. CTC detections (deduplicate - only count each word once, only if in checkWords) + // Use pre-computed logProbs for keyword detection when available (unified Preprocessor path) + let spotResult: CtcKeywordSpotter.SpotKeywordsResult + if !logProbs.isEmpty, await asrManager.hasCachedCtcLogits { + // Unified path: run DP keyword detection on cached logProbs (no CTC inference) + spotResult = spotter.spotKeywordsFromLogProbs( + logProbs: logProbs, + frameDuration: frameDuration, + customVocabulary: customVocab, + minScore: nil + ) + } else { + // Separate CTC path: run full CTC inference + keyword detection + spotResult = try await spotter.spotKeywordsWithLogProbs( + audioSamples: samples, + customVocabulary: customVocab, + minScore: nil + ) + } + for detection in spotResult.detections { let detail: [String: Any] = [ "word": detection.term.text, "score": round(Double(detection.score) * 100) / 100, "startTime": round(detection.startTime * 100) / 100, "endTime": round(detection.endTime * 100) / 100, - "source": "ctc", + "source": await asrManager.hasCachedCtcLogits ? "ctc-unified" : "ctc", ] detectionDetails.append(detail) if detection.score >= minCtcScore { let wordLower = detection.term.text.lowercased() - // Only count if word is in checkWords and not already counted if checkWordsLowerSet.contains(wordLower) && !ctcFoundWords.contains(wordLower) { dictFound += 1 ctcFoundWords.insert(wordLower) From d7fa8880f461b23f66455eebd592b253b52fff34 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 13:45:18 -0400 Subject: [PATCH 02/12] Auto-download CTC head from parakeet-ctc-110m HF repo Instead of only loading CtcHead.mlmodelc if manually placed in the model directory, download it on demand from FluidInference/parakeet-ctc-110m-coreml via DownloadUtils.loadModels when the tdtCtc110m model version is used. --- .../FluidAudio/ASR/Parakeet/AsrModels.swift | 28 +++++++++++-------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift index 01b5f1e8c..19e1786f8 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift @@ -211,18 +211,24 @@ extension AsrModels { throw AsrModelsError.loadingFailed("Failed to load decoder or joint model") } - // Optionally load CTC head model if present (for custom vocabulary) - let repoDir = repoPath(from: directory, version: version) - let ctcHeadPath = repoDir.appendingPathComponent(Names.ctcHeadFile) + // Optionally load CTC head model for custom vocabulary. + // The CTC head lives in the parakeetCtc110m HF repo and is downloaded on demand. var ctcHeadModel: MLModel? - if FileManager.default.fileExists(atPath: ctcHeadPath.path) { - let ctcConfig = MLModelConfiguration() - ctcConfig.computeUnits = config.computeUnits - ctcHeadModel = try? MLModel(contentsOf: ctcHeadPath, configuration: ctcConfig) - if ctcHeadModel != nil { - logger.info("Loaded optional CTC head model for custom vocabulary") - } else { - logger.warning("CTC head model found but failed to load: \(ctcHeadPath.path)") + if version == .tdtCtc110m { + do { + let ctcModels = try await DownloadUtils.loadModels( + .parakeetCtc110m, + modelNames: [Names.ctcHeadFile], + directory: parentDirectory, + computeUnits: config.computeUnits, + progressHandler: progressHandler + ) + ctcHeadModel = ctcModels[Names.ctcHeadFile] + if ctcHeadModel != nil { + logger.info("Loaded CTC head model for custom vocabulary") + } + } catch { + logger.warning("CTC head model not available: \(error.localizedDescription)") } } From 6a2e4d23aff03e8946a9b74e4d0e71d61a0ac49f Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 13:49:21 -0400 Subject: [PATCH 03/12] Support both local and HF download paths for CTC head (beta) Try loading CtcHead.mlmodelc from the local TDT model directory first (v1), then fall back to auto-downloading from the parakeet-ctc-110m HF repo (v2). Mark CTC head loading as beta in log messages. --- .../FluidAudio/ASR/Parakeet/AsrModels.swift | 47 +++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift index 19e1786f8..cffead51d 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift @@ -211,24 +211,43 @@ extension AsrModels { throw AsrModelsError.loadingFailed("Failed to load decoder or joint model") } - // Optionally load CTC head model for custom vocabulary. - // The CTC head lives in the parakeetCtc110m HF repo and is downloaded on demand. + // [Beta] Optionally load CTC head model for custom vocabulary. + // Supports two paths: + // v1: CtcHead.mlmodelc placed manually in the TDT model directory + // v2: Auto-download from FluidInference/parakeet-ctc-110m-coreml HF repo var ctcHeadModel: MLModel? if version == .tdtCtc110m { - do { - let ctcModels = try await DownloadUtils.loadModels( - .parakeetCtc110m, - modelNames: [Names.ctcHeadFile], - directory: parentDirectory, - computeUnits: config.computeUnits, - progressHandler: progressHandler - ) - ctcHeadModel = ctcModels[Names.ctcHeadFile] + // v1: Check local TDT model directory first + let repoDir = repoPath(from: directory, version: version) + let ctcHeadPath = repoDir.appendingPathComponent(Names.ctcHeadFile) + if FileManager.default.fileExists(atPath: ctcHeadPath.path) { + let ctcConfig = MLModelConfiguration() + ctcConfig.computeUnits = config.computeUnits + ctcHeadModel = try? MLModel(contentsOf: ctcHeadPath, configuration: ctcConfig) if ctcHeadModel != nil { - logger.info("Loaded CTC head model for custom vocabulary") + logger.info("[Beta] Loaded CTC head model from local directory") + } else { + logger.warning("CTC head model found but failed to load: \(ctcHeadPath.path)") + } + } + + // v2: Fall back to downloading from parakeet-ctc-110m HF repo + if ctcHeadModel == nil { + do { + let ctcModels = try await DownloadUtils.loadModels( + .parakeetCtc110m, + modelNames: [Names.ctcHeadFile], + directory: parentDirectory, + computeUnits: config.computeUnits, + progressHandler: progressHandler + ) + ctcHeadModel = ctcModels[Names.ctcHeadFile] + if ctcHeadModel != nil { + logger.info("[Beta] Loaded CTC head model from HF repo") + } + } catch { + logger.warning("CTC head model not available: \(error.localizedDescription)") } - } catch { - logger.warning("CTC head model not available: \(error.localizedDescription)") } } From 55941bf80258aac704df3d0154534d3efdd0259b Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 13:57:32 -0400 Subject: [PATCH 04/12] Document standalone CTC head for custom vocabulary (beta) - Update CustomVocabulary.md with dual architecture diagrams (standalone CTC head vs separate CTC encoder) and approach comparison table - Add CTC head section to TDT-CTC-110M.md covering architecture, loading paths, performance, conversion, and beta status - Update benchmarks100.md with standalone CTC head results (70.29x RTFx, 1MB model, 99.4% Dict Recall) --- Documentation/ASR/CustomVocabulary.md | 83 ++++++++++++++++++++++++--- Documentation/ASR/TDT-CTC-110M.md | 69 ++++++++++++++++++++++ Documentation/ASR/benchmarks100.md | 27 +++++---- 3 files changed, 159 insertions(+), 20 deletions(-) diff --git a/Documentation/ASR/CustomVocabulary.md b/Documentation/ASR/CustomVocabulary.md index b68b55ef2..715eb9cfc 100644 --- a/Documentation/ASR/CustomVocabulary.md +++ b/Documentation/ASR/CustomVocabulary.md @@ -17,6 +17,58 @@ The paper introduces a dynamic programming algorithm for CTC-based keyword spott ## Architecture Overview +FluidAudio supports two approaches for CTC-based custom vocabulary boosting: + +### Approach 1: Standalone CTC Head (Beta, Recommended for TDT-CTC-110M) + +``` + ┌─────────────────────────────────────────┐ + │ Audio Input │ + │ (16kHz, mono) │ + └─────────────────┬───────────────────────┘ + │ + ▼ + ┌─────────────────┐ + │ TDT-CTC-110M │ + │ Preprocessor │ + │ (fused encoder) │ + └────────┬────────┘ + │ + encoder output [1, 512, T] + │ + ┌──────────────┴──────────────┐ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ TDT Decoder │ │ CTC Head │ + │ + Joint Network│ │ (1MB, beta) │ + └────────┬────────┘ └────────┬────────┘ + │ │ + ▼ ctc_logits [1, T, 1025] + ┌─────────────────┐ │ + │ Raw Transcript│ ▼ + │ "in video corp"│ ┌─────────────────┐ + └────────┬────────┘ Custom │ Keyword Spotter │ + │ Vocabulary►│ (DP Algorithm) │ + │ └────────┬────────┘ + └──────────────┬──────────────┘ + ▼ + ┌─────────────────┐ + │ Vocabulary │ + │ Rescorer │ + └────────┬────────┘ + │ + ▼ + ┌─────────────────┐ + │ Final Transcript│ + │ "NVIDIA Corp" │ + └─────────────────┘ +``` + +The standalone CTC head is a single linear projection (512 → 1025) extracted from the hybrid TDT-CTC-110M model. It reuses the TDT encoder output, requiring only ~1MB of additional model weight and no second encoder pass. + +### Approach 2: Separate CTC Encoder (Original) + ``` ┌─────────────────────────────────────────┐ │ Audio Input │ @@ -58,24 +110,37 @@ The paper introduces a dynamic programming algorithm for CTC-based keyword spott └─────────────────┘ ``` -## Dual Encoder Alignment +### Approach Comparison + +| | Standalone CTC Head (beta) | Separate CTC Encoder | +|---|---|---| +| **Additional model size** | 1 MB | 97.5 MB | +| **Second encoder pass** | No | Yes | +| **RTFx (earnings benchmark)** | 70.29x | 25.98x | +| **Dict Recall** | 99.4% | 99.4% | +| **TDT model requirement** | TDT-CTC-110M only | Any TDT model | +| **Status** | Beta | Stable | + +The standalone CTC head is available only with the TDT-CTC-110M model because both the TDT and CTC heads share the same encoder in the hybrid architecture. For Parakeet TDT v2/v3 (0.6B), the separate CTC encoder approach is required. + +## Encoder Alignment + +### Separate CTC Encoder (Approach 2) The system uses two separate neural network encoders that process the same audio: -### 1. TDT Encoder (Primary Transcription) +#### TDT Encoder (Primary Transcription) - **Model**: Parakeet TDT 0.6B (600M parameters) - **Architecture**: Token Duration Transducer with FastConformer - **Output**: High-quality transcription with word timestamps - **Frame Rate**: ~40ms per frame -### 2. CTC Encoder (Keyword Spotting) +#### CTC Encoder (Keyword Spotting) - **Model**: Parakeet CTC 110M (110M parameters) - **Architecture**: FastConformer with CTC head - **Output**: Per-frame log-probabilities over 1024 tokens - **Frame Rate**: ~40ms per frame (aligned with TDT) -### Frame Alignment - Both encoders use the same audio preprocessing (mel spectrogram with identical parameters), producing frames at the same rate. This enables direct timestamp comparison between: - TDT decoder word timestamps - CTC keyword detection timestamps @@ -88,18 +153,20 @@ CTC Frames: [0] [1] [2] ... [374] (375 frames @ 40ms) Aligned timestamps ``` -### Memory Usage +#### Memory Usage Running two encoders in parallel increases peak memory consumption: | Configuration | Peak RAM | Notes | |---------------|----------|-------| | TDT encoder only | ~66 MB | Standard transcription | -| TDT + CTC encoders | ~130 MB | With vocabulary boosting | +| TDT + CTC encoders | ~130 MB | With vocabulary boosting (separate encoder) | +| TDT + CTC head | ~67 MB | With vocabulary boosting (standalone head, beta) | *Measured on iPhone 17 Pro. Memory settles after initial model loading.* -The additional ~64 MB overhead comes from the CTC encoder (Parakeet 110M) being loaded alongside the primary TDT encoder. For memory-constrained scenarios, consider: +The standalone CTC head adds negligible memory (~1MB) since it reuses the existing encoder output. The separate CTC encoder adds ~64MB overhead. For memory-constrained scenarios, consider: +- Using the standalone CTC head with TDT-CTC-110M (beta) - Loading the CTC encoder on-demand rather than at startup - Unloading the CTC encoder after transcription completes - Using vocabulary boosting only for files where domain terms are expected diff --git a/Documentation/ASR/TDT-CTC-110M.md b/Documentation/ASR/TDT-CTC-110M.md index 894efebae..c628a06f4 100644 --- a/Documentation/ASR/TDT-CTC-110M.md +++ b/Documentation/ASR/TDT-CTC-110M.md @@ -465,9 +465,78 @@ Tested on iPhone (iOS 17+): - Highest accuracy required - Extra model size acceptable +## Standalone CTC Head for Custom Vocabulary (Beta) + +The TDT-CTC-110M hybrid model shares one FastConformer encoder between its TDT and CTC decoder heads. FluidAudio exploits this by exporting the CTC decoder head as a standalone 1MB CoreML model (`CtcHead.mlmodelc`) that runs on the existing TDT encoder output, enabling custom vocabulary keyword spotting without a second encoder pass. + +### How It Works + +``` +TDT Preprocessor (fused encoder) + │ + ▼ +encoder output [1, 512, T] + │ + ┌────┴────┐ + │ │ + ▼ ▼ +TDT Decoder CtcHead (1MB, beta) + │ │ + ▼ ▼ +transcript ctc_logits [1, T, 1025] + │ + ▼ + Keyword Spotter / VocabularyRescorer +``` + +The CTC head is a single linear projection (512 → 1025) that maps the 512-dimensional encoder features to log-probabilities over 1024 BPE tokens + 1 blank token. + +### Performance + +Benchmarked on 772 earnings call files (Earnings22-KWS): + +| Approach | Model Size | Dict Recall | RTFx | +|----------|-----------|-------------|------| +| Separate CTC encoder | 97.5 MB | 99.4% | 25.98x | +| **Standalone CTC head** | **1 MB** | **99.4%** | **70.29x** | + +The standalone CTC head achieves identical keyword detection quality at 2.7x the speed, using 97x less model weight. + +### Loading + +The CTC head model auto-downloads from [FluidInference/parakeet-ctc-110m-coreml](https://huggingface.co/FluidInference/parakeet-ctc-110m-coreml) when loading the TDT-CTC-110M model. It also supports manual placement in the TDT model directory. + +Two loading paths are supported: +1. **Local (v1):** Place `CtcHead.mlmodelc` in the TDT model directory (`parakeet-tdt-ctc-110m/`) +2. **Auto-download (v2):** Automatically downloaded from the `parakeet-ctc-110m-coreml` HuggingFace repo + +```swift +// CTC head loads automatically with TDT-CTC-110M models +let models = try await AsrModels.downloadAndLoad(version: .tdtCtc110m) +// models.ctcHead is non-nil when CtcHead.mlmodelc is available +``` + +### Conversion + +The CTC head is exported using the conversion script in the mobius repo: + +```bash +cd mobius/models/stt/parakeet-tdt-ctc-110m/coreml/ +uv run python export-ctc-head.py --output-dir ./ctc-head-build +xcrun coremlcompiler compile ctc-head-build/CtcHead.mlpackage ctc-head-build/ +``` + +See [mobius PR #36](https://github.com/FluidInference/mobius/pull/36) for the conversion script. + +### Status + +This feature is **beta**. The CTC head produces identical keyword detection results to the separate CTC encoder, but the auto-download pathway and integration are new. See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450) for details. + ## Resources - **Model:** [FluidInference/parakeet-tdt-ctc-110m-coreml](https://huggingface.co/FluidInference/parakeet-tdt-ctc-110m-coreml) +- **CTC Head model:** [FluidInference/parakeet-ctc-110m-coreml](https://huggingface.co/FluidInference/parakeet-ctc-110m-coreml) (includes CtcHead.mlmodelc) - **Benchmark results:** See `benchmarks.md` - **PR:** [#433 - Add TDT-CTC-110M support](https://github.com/FluidInference/FluidAudio/pull/433) +- **CTC Head PR:** [#450 - Add standalone CTC head for custom vocabulary](https://github.com/FluidInference/FluidAudio/pull/450) - **Original NVIDIA model:** [nvidia/parakeet-tdt-1.1b](https://huggingface.co/nvidia/parakeet-tdt-1.1b) diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index ae806836b..2bb646b4f 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -44,10 +44,10 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru --- -# Issue #435: Unified CTC Head Export (Full Dataset) +# Issue #435: Standalone CTC Head for Custom Vocabulary (Beta) -Benchmark comparing separate CTC encoder vs unified CTC head exported from TDT-CTC-110M Preprocessor. -See [#435](https://github.com/FluidInference/FluidAudio/issues/435). +Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model. +See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450). ## Environment @@ -58,25 +58,28 @@ See [#435](https://github.com/FluidInference/FluidAudio/issues/435). ## CTC Earnings (Earnings22-KWS, 772 files) -| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Unified CTC (110m TDT) | +| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) | |---|---|---|---| | WER | 14.67% | 16.08% | 16.88% | | Dict Recall | 99.3% | 99.4% | 99.4% | | Vocab Precision | 99.8% | 99.7% | 99.6% | | Vocab Recall | 73.7% | 70.0% | 59.6% | | Vocab F-score | 84.8% | 82.0% | 74.6% | -| RTFx | 43.94x | 25.98x | **48.35x** | +| RTFx | 43.94x | 25.98x | **70.29x** | +| Additional model size | 97.5 MB | 97.5 MB | **1 MB** | ## Analysis -- **Dict Recall**: Identical at 99.4% between separate and unified 110m paths. The unified CTC head produces equivalent keyword detection quality. -- **RTFx**: **48.35x** (unified) vs **25.98x** (separate 110m) = **86% speedup**. Eliminating the separate CTC encoder run nearly doubles throughput. -- **WER**: Slight increase (16.08% → 16.88%) because the unified CTC head's logits have different characteristics than the separately-trained CTC model, affecting vocabulary rescoring decisions. +- **Dict Recall**: Identical at 99.4% between separate CTC encoder and standalone CTC head. The CTC head produces equivalent keyword detection quality. +- **RTFx**: **70.29x** (standalone head) vs **25.98x** (separate encoder) = **2.7x speedup**. The CTC head runs on the existing TDT encoder output with no second encoder pass. +- **Model size**: 1 MB (standalone head) vs 97.5 MB (separate CTC encoder) = **97x smaller**. +- **WER**: Slight increase (16.08% → 16.88%) because the CTC head's logits have different characteristics than the separately-trained CTC encoder, affecting vocabulary rescoring decisions. - **Vocab Recall**: Lower (70.0% → 59.6%) for the same reason — the CTC head's logit distribution differs from the standalone CTC model, leading to fewer vocabulary replacements being applied. This is a rescoring tuning issue, not a detection issue. ## Key Takeaways -1. **Unified model eliminates separate CTC encoder** — single Preprocessor outputs both TDT encoder features and CTC logits -2. **Memory reduction**: ~40MB saved by removing duplicate encoder weights -3. **Dict Recall preserved**: Keyword detection quality is identical -4. **RTFx nearly doubled**: No second encoder pass needed for custom vocabulary workloads +1. **Standalone CTC head eliminates separate CTC encoder** — a 1MB linear projection on the shared TDT encoder output +2. **97x smaller**: 1 MB vs 97.5 MB additional model weight +3. **Dict Recall preserved**: Keyword detection quality is identical at 99.4% +4. **2.7x faster**: No second encoder pass needed for custom vocabulary workloads +5. **Beta status**: Auto-download from HuggingFace and local file loading both supported From 4e787f78656fe613ff8f6d15d8f6f7890ba3ea95 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 13:59:11 -0400 Subject: [PATCH 05/12] Format CtcEarningsBenchmark.swift --- .../ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift index 4bb5da505..784e9f6fe 100644 --- a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift +++ b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift @@ -178,7 +178,9 @@ public enum CtcEarningsBenchmark { do { // Load TDT models for transcription - print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : tdtVersion == .tdtCtc110m ? "110m" : "v3")) for transcription...") + print( + "Loading TDT models (\(tdtVersion == .v2 ? "v2" : tdtVersion == .tdtCtc110m ? "110m" : "v3")) for transcription..." + ) let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion) let asrManager = AsrManager(config: .default) try await asrManager.initialize(models: tdtModels) From d83a893958dbc7a0bfe5acbe649e020cda7600c3 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 14:24:14 -0400 Subject: [PATCH 06/12] Fix review issues: stale cache, memory leak, naming, duplicate inference - Skip CTC head caching for multi-chunk audio (>15s) to prevent stale logits from last chunk being used for full-audio rescoring - Clear cachedCtcLogits in resetState() and cleanup() to prevent leak - Rename getCachedCtcLogProbs() to getCachedCtcRawLogits() to accurately reflect that values are raw logits, not log-probabilities - Remove duplicate CTC inference in benchmark by reusing pre-computed logProbs via spotKeywordsFromLogProbs() for both paths --- .../FluidAudio/ASR/Parakeet/AsrManager.swift | 16 +++++++--- .../ASR/Parakeet/AsrTranscription.swift | 13 +++++--- .../SlidingWindow/CtcEarningsBenchmark.swift | 31 ++++++------------- 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift index a61e335ec..1b83ba99f 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift @@ -60,9 +60,11 @@ public actor AsrManager { /// Whether the Preprocessor outputs CTC logits (unified custom vocabulary model). public var hasCachedCtcLogits: Bool { cachedCtcLogits != nil } - /// Get cached CTC logits as [[Float]] for external use (e.g. benchmarks). - /// Returns nil if the Preprocessor doesn't output CTC logits. - public func getCachedCtcLogProbs() -> (logProbs: [[Float]], frameDuration: Double)? { + /// Get cached CTC raw logits as [[Float]] for external use (e.g. benchmarks). + /// These are raw logits — callers must apply `CtcKeywordSpotter.applyLogSoftmax()` + /// to convert to log-probabilities before use in keyword detection. + /// Returns nil if the CTC head model is not available or audio was multi-chunk. + public func getCachedCtcRawLogits() -> (rawLogits: [[Float]], frameDuration: Double)? { guard let logits = cachedCtcLogits, let duration = cachedCtcFrameDuration else { return nil } let shape = logits.shape guard shape.count == 3 else { return nil } @@ -78,7 +80,7 @@ public actor AsrManager { } result.append(frame) } - return (logProbs: result, frameDuration: duration) + return (rawLogits: result, frameDuration: duration) } // Cached prediction options for reuse @@ -336,6 +338,8 @@ public actor AsrManager { let layers = asrModels?.version.decoderLayers ?? 2 microphoneDecoderState = TdtDecoderState.make(decoderLayers: layers) systemDecoderState = TdtDecoderState.make(decoderLayers: layers) + cachedCtcLogits = nil + cachedCtcFrameDuration = nil Task { await sharedMLArrayCache.clear() } } @@ -350,7 +354,9 @@ public actor AsrManager { // Reset decoder states using fresh allocations for deterministic behavior microphoneDecoderState = TdtDecoderState.make(decoderLayers: layers) systemDecoderState = TdtDecoderState.make(decoderLayers: layers) - // Release vocabulary boosting resources + // Release vocabulary boosting resources and cached CTC data + cachedCtcLogits = nil + cachedCtcFrameDuration = nil disableVocabularyBoosting() Task { await sharedMLArrayCache.clear() } logger.info("AsrManager resources cleaned up") diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift index 914beb3ae..c49f918ea 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift @@ -150,8 +150,10 @@ extension AsrManager { let encoderSequenceLength = encoderLength[0].intValue - // Run CTC head on encoder output if available (for custom vocabulary) - if let ctcHeadModel = asrModels?.ctcHead { + // Run CTC head on encoder output if available (for custom vocabulary). + // Only cache for single-chunk audio — multi-chunk would overwrite per chunk, + // leaving only the last chunk's logits which is incorrect for full-audio rescoring. + if let ctcHeadModel = asrModels?.ctcHead, isLastChunk { do { let ctcInput = try MLDictionaryFeatureProvider( dictionary: ["encoder_output": MLFeatureValue(multiArray: rawEncoderOutput)] @@ -159,9 +161,12 @@ extension AsrManager { let ctcOutput = try await ctcHeadModel.compatPrediction( from: ctcInput, options: predictionOptions ) - if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue { + if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue, + globalFrameOffset == 0 + { + // Only cache when this is both the first and last chunk (single-chunk audio) cachedCtcLogits = ctcLogits - cachedCtcFrameDuration = 0.04 // 40ms per frame (80ms encoder / 2x CTC subsampling) + cachedCtcFrameDuration = 0.04 // 40ms per frame } else { cachedCtcLogits = nil cachedCtcFrameDuration = nil diff --git a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift index 784e9f6fe..05d774b0e 100644 --- a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift +++ b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift @@ -511,10 +511,10 @@ public enum CtcEarningsBenchmark { // Use cached CTC logits from unified Preprocessor if available (no separate encoder run needed) let logProbs: [[Float]] let frameDuration: Double - if let cached = await asrManager.getCachedCtcLogProbs() { + if let cached = await asrManager.getCachedCtcRawLogits() { // Cached values are raw logits - apply log-softmax + temperature + blank bias logProbs = CtcKeywordSpotter.applyLogSoftmax( - rawLogits: cached.logProbs, + rawLogits: cached.rawLogits, blankId: spotter.blankId ) frameDuration = cached.frameDuration @@ -618,24 +618,13 @@ public enum CtcEarningsBenchmark { let checkWordsLowerSet = Set(checkWords.map { $0.lowercased() }) // 1. CTC detections (deduplicate - only count each word once, only if in checkWords) - // Use pre-computed logProbs for keyword detection when available (unified Preprocessor path) - let spotResult: CtcKeywordSpotter.SpotKeywordsResult - if !logProbs.isEmpty, await asrManager.hasCachedCtcLogits { - // Unified path: run DP keyword detection on cached logProbs (no CTC inference) - spotResult = spotter.spotKeywordsFromLogProbs( - logProbs: logProbs, - frameDuration: frameDuration, - customVocabulary: customVocab, - minScore: nil - ) - } else { - // Separate CTC path: run full CTC inference + keyword detection - spotResult = try await spotter.spotKeywordsWithLogProbs( - audioSamples: samples, - customVocabulary: customVocab, - minScore: nil - ) - } + // Reuse pre-computed logProbs for keyword detection (avoids duplicate CTC inference) + let spotResult = spotter.spotKeywordsFromLogProbs( + logProbs: logProbs, + frameDuration: frameDuration, + customVocabulary: customVocab, + minScore: nil + ) for detection in spotResult.detections { let detail: [String: Any] = [ @@ -643,7 +632,7 @@ public enum CtcEarningsBenchmark { "score": round(Double(detection.score) * 100) / 100, "startTime": round(detection.startTime * 100) / 100, "endTime": round(detection.endTime * 100) / 100, - "source": await asrManager.hasCachedCtcLogits ? "ctc-unified" : "ctc", + "source": await asrManager.hasCachedCtcLogits ? "ctc-head" : "ctc", ] detectionDetails.append(detail) From f0e3dab03c40c11707036c0b47e7fa41c9c7e707 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 15:07:52 -0400 Subject: [PATCH 07/12] Pass isLastChunk: true in single-chunk transcription path The CTC head guard requires isLastChunk to be true, but the single-chunk path in transcribeWithState did not pass it, causing the CTC head to never execute for single-chunk audio (the primary use case). --- Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift index c49f918ea..4c5238e36 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift @@ -41,7 +41,8 @@ extension AsrManager { paddedAudio, originalLength: frameAlignedLength, actualAudioFrames: nil, // Will be calculated from originalLength - decoderState: &decoderState + decoderState: &decoderState, + isLastChunk: true // Single-chunk: always first and last ) var result = processTranscriptionResult( From e8c0a7139f79c740478dcb0adcdc95fe9ef65758 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 15:09:12 -0400 Subject: [PATCH 08/12] Remove #435 benchmark section from benchmarks100.md --- Documentation/ASR/benchmarks100.md | 41 ------------------------------ 1 file changed, 41 deletions(-) diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index 2bb646b4f..99c1113f1 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -42,44 +42,3 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru **No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes. ---- - -# Issue #435: Standalone CTC Head for Custom Vocabulary (Beta) - -Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model. -See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450). - -## Environment - -- **Hardware**: MacBook Air M2, 16 GB -- **Build**: `swift build -c release` -- **Date**: 2026-03-28 -- **Branch**: `ctc-head-export` - -## CTC Earnings (Earnings22-KWS, 772 files) - -| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) | -|---|---|---|---| -| WER | 14.67% | 16.08% | 16.88% | -| Dict Recall | 99.3% | 99.4% | 99.4% | -| Vocab Precision | 99.8% | 99.7% | 99.6% | -| Vocab Recall | 73.7% | 70.0% | 59.6% | -| Vocab F-score | 84.8% | 82.0% | 74.6% | -| RTFx | 43.94x | 25.98x | **70.29x** | -| Additional model size | 97.5 MB | 97.5 MB | **1 MB** | - -## Analysis - -- **Dict Recall**: Identical at 99.4% between separate CTC encoder and standalone CTC head. The CTC head produces equivalent keyword detection quality. -- **RTFx**: **70.29x** (standalone head) vs **25.98x** (separate encoder) = **2.7x speedup**. The CTC head runs on the existing TDT encoder output with no second encoder pass. -- **Model size**: 1 MB (standalone head) vs 97.5 MB (separate CTC encoder) = **97x smaller**. -- **WER**: Slight increase (16.08% → 16.88%) because the CTC head's logits have different characteristics than the separately-trained CTC encoder, affecting vocabulary rescoring decisions. -- **Vocab Recall**: Lower (70.0% → 59.6%) for the same reason — the CTC head's logit distribution differs from the standalone CTC model, leading to fewer vocabulary replacements being applied. This is a rescoring tuning issue, not a detection issue. - -## Key Takeaways - -1. **Standalone CTC head eliminates separate CTC encoder** — a 1MB linear projection on the shared TDT encoder output -2. **97x smaller**: 1 MB vs 97.5 MB additional model weight -3. **Dict Recall preserved**: Keyword detection quality is identical at 99.4% -4. **2.7x faster**: No second encoder pass needed for custom vocabulary workloads -5. **Beta status**: Auto-download from HuggingFace and local file loading both supported From d9fbbb005bc21126d2d2c43951b3b02ca976412d Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 15:12:00 -0400 Subject: [PATCH 09/12] Add CTC head benchmark data to Benchmarks.md --- Documentation/Benchmarks.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md index 7a5976a41..580cb8776 100644 --- a/Documentation/Benchmarks.md +++ b/Documentation/Benchmarks.md @@ -150,6 +150,17 @@ Derived metrics: | Recall | TP / (TP + FN) | "Of words that should appear, how many did we find?" | | F-Score | 2 × P × R / (P + R) | Harmonic mean of precision and recall | +### Issue #435: Standalone CTC Head for Custom Vocabulary (Beta) + +Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model. +See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450). + +| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) | +|---|---|---|---| +| Dict Recall | 99.3% | 99.4% | 99.4% | +| RTFx | 43.94x | 25.98x | 70.29x | +| Additional model size | 97.5 MB | 97.5 MB | 1 MB | + ## Text-to-Speech We generated the same strings with to generate audio between 1s to ~300s in order to test the speed across a range of varying inputs on Pytorch CPU, MPS, and MLX pipeline, and compared it against the native Swift version with Core ML models. From cb4f293c3ad2cf8efd29c4e89f16f47e3ce1bff1 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 15:15:53 -0400 Subject: [PATCH 10/12] Move CTC head benchmarks to benchmarks100.md --- Documentation/ASR/benchmarks100.md | 11 +++++++++++ Documentation/Benchmarks.md | 11 ----------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index 99c1113f1..6220c153e 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -42,3 +42,14 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru **No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes. +## Issue #435: Standalone CTC Head for Custom Vocabulary (Beta) + +Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model. +See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450). + +| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) | +|---|---|---|---| +| Dict Recall | 99.3% | 99.4% | 99.4% | +| RTFx | 43.94x | 25.98x | 70.29x | +| Additional model size | 97.5 MB | 97.5 MB | 1 MB | + diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md index 580cb8776..7a5976a41 100644 --- a/Documentation/Benchmarks.md +++ b/Documentation/Benchmarks.md @@ -150,17 +150,6 @@ Derived metrics: | Recall | TP / (TP + FN) | "Of words that should appear, how many did we find?" | | F-Score | 2 × P × R / (P + R) | Harmonic mean of precision and recall | -### Issue #435: Standalone CTC Head for Custom Vocabulary (Beta) - -Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model. -See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450). - -| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) | -|---|---|---|---| -| Dict Recall | 99.3% | 99.4% | 99.4% | -| RTFx | 43.94x | 25.98x | 70.29x | -| Additional model size | 97.5 MB | 97.5 MB | 1 MB | - ## Text-to-Speech We generated the same strings with to generate audio between 1s to ~300s in order to test the speed across a range of varying inputs on Pytorch CPU, MPS, and MLX pipeline, and compared it against the native Swift version with Core ML models. From e093ed2b078d85b4405b55bab3da44f278e2581a Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 15:36:54 -0400 Subject: [PATCH 11/12] Trim CTC head logits to valid encoder frames, excluding padding --- Sources/FluidAudio/ASR/Parakeet/AsrManager.swift | 5 ++++- Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift | 6 +++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift index 1b83ba99f..5d82a8678 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift @@ -56,6 +56,7 @@ public actor AsrManager { // Cached CTC logits from fused Preprocessor (unified custom vocabulary) internal var cachedCtcLogits: MLMultiArray? internal var cachedCtcFrameDuration: Double? + internal var cachedCtcValidFrames: Int? /// Whether the Preprocessor outputs CTC logits (unified custom vocabulary model). public var hasCachedCtcLogits: Bool { cachedCtcLogits != nil } @@ -68,7 +69,7 @@ public actor AsrManager { guard let logits = cachedCtcLogits, let duration = cachedCtcFrameDuration else { return nil } let shape = logits.shape guard shape.count == 3 else { return nil } - let numFrames = shape[1].intValue + let numFrames = min(shape[1].intValue, cachedCtcValidFrames ?? shape[1].intValue) let vocabSize = shape[2].intValue var result: [[Float]] = [] result.reserveCapacity(numFrames) @@ -340,6 +341,7 @@ public actor AsrManager { systemDecoderState = TdtDecoderState.make(decoderLayers: layers) cachedCtcLogits = nil cachedCtcFrameDuration = nil + cachedCtcValidFrames = nil Task { await sharedMLArrayCache.clear() } } @@ -357,6 +359,7 @@ public actor AsrManager { // Release vocabulary boosting resources and cached CTC data cachedCtcLogits = nil cachedCtcFrameDuration = nil + cachedCtcValidFrames = nil disableVocabularyBoosting() Task { await sharedMLArrayCache.clear() } logger.info("AsrManager resources cleaned up") diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift index 4c5238e36..2b7b81276 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift @@ -168,18 +168,22 @@ extension AsrManager { // Only cache when this is both the first and last chunk (single-chunk audio) cachedCtcLogits = ctcLogits cachedCtcFrameDuration = 0.04 // 40ms per frame + cachedCtcValidFrames = encoderSequenceLength } else { cachedCtcLogits = nil cachedCtcFrameDuration = nil + cachedCtcValidFrames = nil } } catch { logger.warning("CTC head inference failed: \(error.localizedDescription)") cachedCtcLogits = nil cachedCtcFrameDuration = nil + cachedCtcValidFrames = nil } } else { cachedCtcLogits = nil cachedCtcFrameDuration = nil + cachedCtcValidFrames = nil } // Calculate actual audio frames if not provided using shared constants @@ -659,7 +663,7 @@ extension AsrManager { return [] } - let numFrames = shape[1].intValue + let numFrames = min(shape[1].intValue, cachedCtcValidFrames ?? shape[1].intValue) let vocabSize = shape[2].intValue // Extract raw logits From 1adfd8f16a2f8fbffafcafeb54002b91b71dc63b Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 16:04:30 -0400 Subject: [PATCH 12/12] Skip CTC head inference on multi-chunk audio --- Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift index 2b7b81276..55f94be18 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift @@ -154,7 +154,7 @@ extension AsrManager { // Run CTC head on encoder output if available (for custom vocabulary). // Only cache for single-chunk audio — multi-chunk would overwrite per chunk, // leaving only the last chunk's logits which is incorrect for full-audio rescoring. - if let ctcHeadModel = asrModels?.ctcHead, isLastChunk { + if let ctcHeadModel = asrModels?.ctcHead, isLastChunk, globalFrameOffset == 0 { do { let ctcInput = try MLDictionaryFeatureProvider( dictionary: ["encoder_output": MLFeatureValue(multiArray: rawEncoderOutput)] @@ -162,10 +162,7 @@ extension AsrManager { let ctcOutput = try await ctcHeadModel.compatPrediction( from: ctcInput, options: predictionOptions ) - if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue, - globalFrameOffset == 0 - { - // Only cache when this is both the first and last chunk (single-chunk audio) + if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue { cachedCtcLogits = ctcLogits cachedCtcFrameDuration = 0.04 // 40ms per frame cachedCtcValidFrames = encoderSequenceLength