From 2f8a6d5e3f11cccdf471cd48d2830b6eaa558fab Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 13:33:46 -0400
Subject: [PATCH 01/12] Add standalone CTC head inference for custom vocabulary
 (#435)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Export the CTC decoder head (512→1025 linear projection) as a separate
1MB CoreML model instead of requiring the full 97.5MB CTC encoder. The
CtcHead model runs on the existing TDT encoder output, achieving 99.4%
Dict Recall at 70.29x RTFx on the earnings benchmark (772 files).

- Load optional CtcHead.mlmodelc from model directory in AsrModels
- Run CTC head on raw encoder output in AsrTranscription
- Add spotKeywordsFromLogProbs() for DP on pre-computed log-probs
- Add applyLogSoftmax() for raw logits→log-probs conversion
- Expose cached CTC logits via AsrManager for VocabularyRescorer
- Update CtcEarningsBenchmark to use standalone CTC head path
---
 Documentation/ASR/benchmarks100.md            |  39 ++++++
 .../FluidAudio/ASR/Parakeet/AsrManager.swift  |  28 ++++
 .../FluidAudio/ASR/Parakeet/AsrModels.swift   |  20 +++
 .../ASR/Parakeet/AsrTranscription.swift       |  97 +++++++++++--
 .../WordSpotting/CtcKeywordSpotter.swift      | 127 ++++++++++++++++++
 Sources/FluidAudio/ModelNames.swift           |   2 +
 .../SlidingWindow/CtcEarningsBenchmark.swift  |  74 +++++++---
 7 files changed, 357 insertions(+), 30 deletions(-)

diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md
index be3436ac2..ae806836b 100644
--- a/Documentation/ASR/benchmarks100.md
+++ b/Documentation/ASR/benchmarks100.md
@@ -41,3 +41,42 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru
 ## Verdict
 
 **No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes.
+
+---
+
+# Issue #435: Unified CTC Head Export (Full Dataset)
+
+Benchmark comparing separate CTC encoder vs unified CTC head exported from TDT-CTC-110M Preprocessor.
+See [#435](https://github.com/FluidInference/FluidAudio/issues/435).
+
+## Environment
+
+- **Hardware**: MacBook Air M2, 16 GB
+- **Build**: `swift build -c release`
+- **Date**: 2026-03-28
+- **Branch**: `ctc-head-export`
+
+## CTC Earnings (Earnings22-KWS, 772 files)
+
+| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Unified CTC (110m TDT) |
+|---|---|---|---|
+| WER | 14.67% | 16.08% | 16.88% |
+| Dict Recall | 99.3% | 99.4% | 99.4% |
+| Vocab Precision | 99.8% | 99.7% | 99.6% |
+| Vocab Recall | 73.7% | 70.0% | 59.6% |
+| Vocab F-score | 84.8% | 82.0% | 74.6% |
+| RTFx | 43.94x | 25.98x | **48.35x** |
+
+## Analysis
+
+- **Dict Recall**: Identical at 99.4% between separate and unified 110m paths. The unified CTC head produces equivalent keyword detection quality.
+- **RTFx**: **48.35x** (unified) vs **25.98x** (separate 110m) = **86% speedup**. Eliminating the separate CTC encoder run nearly doubles throughput.
+- **WER**: Slight increase (16.08% → 16.88%) because the unified CTC head's logits have different characteristics than the separately-trained CTC model, affecting vocabulary rescoring decisions.
+- **Vocab Recall**: Lower (70.0% → 59.6%) for the same reason — the CTC head's logit distribution differs from the standalone CTC model, leading to fewer vocabulary replacements being applied. This is a rescoring tuning issue, not a detection issue.
+
+## Key Takeaways
+
+1. **Unified model eliminates separate CTC encoder** — single Preprocessor outputs both TDT encoder features and CTC logits
+2. **Memory reduction**: ~40MB saved by removing duplicate encoder weights
+3. **Dict Recall preserved**: Keyword detection quality is identical
+4. **RTFx nearly doubled**: No second encoder pass needed for custom vocabulary workloads
diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
index 503a494b3..a61e335ec 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
@@ -53,6 +53,34 @@ public actor AsrManager {
     internal var vocabSizeConfig: ContextBiasingConstants.VocabSizeConfig?
     internal var vocabBoostingEnabled: Bool { customVocabulary != nil && vocabularyRescorer != nil }
 
+    // Raw CTC logits cached from the optional CTC head run on the encoder output, plus their frame duration
+    internal var cachedCtcLogits: MLMultiArray?
+    internal var cachedCtcFrameDuration: Double?
+
+    /// Whether raw CTC logits are currently cached (i.e. `cachedCtcLogits` is non-nil).
+    public var hasCachedCtcLogits: Bool { cachedCtcLogits != nil }
+
+    /// Get the cached CTC logits as [[Float]] for external use (e.g. benchmarks); no softmax is applied here.
+    /// Returns nil when nothing is cached or the cached array is not 3-dimensional.
+    public func getCachedCtcLogProbs() -> (logProbs: [[Float]], frameDuration: Double)? {
+        guard let logits = cachedCtcLogits, let duration = cachedCtcFrameDuration else { return nil }
+        let shape = logits.shape
+        guard shape.count == 3 else { return nil }
+        let numFrames = shape[1].intValue
+        let vocabSize = shape[2].intValue
+        var result: [[Float]] = []
+        result.reserveCapacity(numFrames)
+        for t in 0..<numFrames {
+            var frame: [Float] = []
+            frame.reserveCapacity(vocabSize)
+            for v in 0..<vocabSize {
+                frame.append(logits[[0, t, v] as [NSNumber]].floatValue)
+            }
+            result.append(frame)
+        }
+        return (logProbs: result, frameDuration: duration)
+    }
+
     // Cached prediction options for reuse
     internal lazy var predictionOptions: MLPredictionOptions = {
         AsrModels.optimizedPredictionOptions()
diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
index 67129c6bd..01b5f1e8c 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
@@ -60,6 +60,8 @@ public struct AsrModels: Sendable {
     public let preprocessor: MLModel
     public let decoder: MLModel
     public let joint: MLModel
+    /// Optional CTC decoder head for custom vocabulary (encoder features → CTC logits)
+    public let ctcHead: MLModel?
     public let configuration: MLModelConfiguration
     public let vocabulary: [Int: String]
     public let version: AsrModelVersion
@@ -71,6 +73,7 @@ public struct AsrModels: Sendable {
         preprocessor: MLModel,
         decoder: MLModel,
         joint: MLModel,
+        ctcHead: MLModel? = nil,
         configuration: MLModelConfiguration,
         vocabulary: [Int: String],
         version: AsrModelVersion
@@ -79,6 +82,7 @@ public struct AsrModels: Sendable {
         self.preprocessor = preprocessor
         self.decoder = decoder
         self.joint = joint
+        self.ctcHead = ctcHead
         self.configuration = configuration
         self.vocabulary = vocabulary
         self.version = version
@@ -207,11 +211,27 @@ extension AsrModels {
             throw AsrModelsError.loadingFailed("Failed to load decoder or joint model")
         }
 
+        // Optionally load CTC head model if present (for custom vocabulary)
+        let repoDir = repoPath(from: directory, version: version)
+        let ctcHeadPath = repoDir.appendingPathComponent(Names.ctcHeadFile)
+        var ctcHeadModel: MLModel?
+        if FileManager.default.fileExists(atPath: ctcHeadPath.path) {
+            let ctcConfig = MLModelConfiguration()
+            ctcConfig.computeUnits = config.computeUnits
+            ctcHeadModel = try? MLModel(contentsOf: ctcHeadPath, configuration: ctcConfig)
+            if ctcHeadModel != nil {
+                logger.info("Loaded optional CTC head model for custom vocabulary")
+            } else {
+                logger.warning("CTC head model found but failed to load: \(ctcHeadPath.path)")
+            }
+        }
+
         let asrModels = AsrModels(
             encoder: encoderModel,
             preprocessor: preprocessorModel,
             decoder: decoderModel,
             joint: jointModel,
+            ctcHead: ctcHeadModel,
             configuration: config,
             vocabulary: try loadVocabulary(from: directory, version: version),
             version: version
diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
index 5574abfc8..914beb3ae 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
@@ -150,6 +150,32 @@ extension AsrManager {
 
             let encoderSequenceLength = encoderLength[0].intValue
 
+            // Run CTC head on encoder output if available (for custom vocabulary)
+            if let ctcHeadModel = asrModels?.ctcHead {
+                do {
+                    let ctcInput = try MLDictionaryFeatureProvider(
+                        dictionary: ["encoder_output": MLFeatureValue(multiArray: rawEncoderOutput)]
+                    )
+                    let ctcOutput = try await ctcHeadModel.compatPrediction(
+                        from: ctcInput, options: predictionOptions
+                    )
+                    if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue {
+                        cachedCtcLogits = ctcLogits
+                        cachedCtcFrameDuration = 0.04  // 40ms per frame (80ms encoder / 2x CTC subsampling)
+                    } else {
+                        cachedCtcLogits = nil
+                        cachedCtcFrameDuration = nil
+                    }
+                } catch {
+                    logger.warning("CTC head inference failed: \(error.localizedDescription)")
+                    cachedCtcLogits = nil
+                    cachedCtcFrameDuration = nil
+                }
+            } else {
+                cachedCtcLogits = nil
+                cachedCtcFrameDuration = nil
+            }
+
             // Calculate actual audio frames if not provided using shared constants
             let actualFrames =
                 actualAudioFrames ?? ASRConstants.calculateEncoderFrames(from: originalLength ?? paddedAudio.count)
@@ -540,8 +566,7 @@ extension AsrManager {
     internal func applyVocabularyRescoring(
         result: ASRResult, audioSamples: [Float]
     ) async -> ASRResult {
-        guard let spotter = ctcSpotter,
-            let rescorer = vocabularyRescorer,
+        guard let rescorer = vocabularyRescorer,
             let vocab = customVocabulary,
             let tokenTimings = result.tokenTimings, !tokenTimings.isEmpty
         else {
@@ -549,13 +574,30 @@ extension AsrManager {
         }
 
         do {
-            let spotResult = try await spotter.spotKeywordsWithLogProbs(
-                audioSamples: audioSamples,
-                customVocabulary: vocab,
-                minScore: nil
-            )
+            // Try to use cached CTC logits from unified Preprocessor first
+            let logProbs: [[Float]]
+            let frameDuration: Double
+
+            if let cached = cachedCtcLogits, let duration = cachedCtcFrameDuration {
+                // Convert MLMultiArray to [[Float]]
+                logProbs = convertCtcLogitsToArray(cached)
+                frameDuration = duration
+                logger.debug("Using cached CTC logits from Preprocessor (unified model)")
+            } else if let spotter = ctcSpotter {
+                // Fallback: run separate CTC encoder
+                let spotResult = try await spotter.spotKeywordsWithLogProbs(
+                    audioSamples: audioSamples,
+                    customVocabulary: vocab,
+                    minScore: nil
+                )
+                logProbs = spotResult.logProbs
+                frameDuration = spotResult.frameDuration
+                logger.debug("Using separate CTC encoder (legacy dual-model approach)")
+            } else {
+                logger.warning("Vocabulary rescoring skipped: no CTC logits available")
+                return result
+            }
 
-            let logProbs = spotResult.logProbs
             guard !logProbs.isEmpty else {
                 logger.debug("Vocabulary rescoring skipped: no log probs from CTC")
                 return result
@@ -570,7 +612,7 @@ extension AsrManager {
                 transcript: result.text,
                 tokenTimings: tokenTimings,
                 logProbs: logProbs,
-                frameDuration: spotResult.frameDuration,
+                frameDuration: frameDuration,
                 cbw: vocabConfig.cbw,
                 marginSeconds: 0.5,
                 minSimilarity: effectiveMinSimilarity
@@ -600,4 +642,41 @@ extension AsrManager {
         }
     }
 
+    /// Convert CTC logits MLMultiArray to log-probabilities [[Float]] for rescoring.
+    /// Copies the [1, T, V] logits into a [T][V] array, then delegates log-softmax, temperature
+    /// scaling and blank bias to `CtcKeywordSpotter.applyLogSoftmax`; returns [] if the shape is not rank 3.
+    private func convertCtcLogitsToArray(_ ctcLogits: MLMultiArray) -> [[Float]] {
+        // Expected shape: [1, T, V] where T = frames, V = vocab size
+        let shape = ctcLogits.shape
+        guard shape.count == 3 else {
+            logger.warning("Unexpected CTC logits shape: \(shape)")
+            return []
+        }
+
+        let numFrames = shape[1].intValue
+        let vocabSize = shape[2].intValue
+
+        // Copy the raw logits element by element; only index 0 of the first dimension is read
+        var rawLogits: [[Float]] = []
+        rawLogits.reserveCapacity(numFrames)
+
+        for t in 0..<numFrames {
+            var frameLogits: [Float] = []
+            frameLogits.reserveCapacity(vocabSize)
+
+            for v in 0..<vocabSize {
+                let index = [0, t, v] as [NSNumber]
+                frameLogits.append(ctcLogits[index].floatValue)
+            }
+
+            rawLogits.append(frameLogits)
+        }
+
+        // Apply log-softmax + temperature + blank bias; temperature and blank bias use the callee's defaults
+        return CtcKeywordSpotter.applyLogSoftmax(
+            rawLogits: rawLogits,
+            blankId: ContextBiasingConstants.defaultBlankId
+        )
+    }
+
 }
diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/CustomVocabulary/WordSpotting/CtcKeywordSpotter.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/CustomVocabulary/WordSpotting/CtcKeywordSpotter.swift
index f1ca4f25a..d55d29065 100644
--- a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/CustomVocabulary/WordSpotting/CtcKeywordSpotter.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/CustomVocabulary/WordSpotting/CtcKeywordSpotter.swift
@@ -178,6 +178,133 @@ public struct CtcKeywordSpotter: Sendable {
         )
     }
 
+    /// Spot keywords using pre-computed log-probabilities (no CTC inference).
+    /// Use this when CTC logProbs are already available (e.g. from a unified Preprocessor
+    /// that exports CTC logits alongside encoder features).
+    ///
+    /// - Parameters:
+    ///   - logProbs: Pre-computed CTC log-probabilities [T, V].
+    ///   - frameDuration: Duration of each CTC frame in seconds.
+    ///   - customVocabulary: Vocabulary context with pre-tokenized terms.
+    ///   - minScore: Base threshold, relaxed for longer terms; nil uses `ContextBiasingConstants.defaultMinSpotterScore`.
+    /// - Returns: Detections plus the log-probs passed in; an empty result with frameDuration 0 if `logProbs` is empty.
+    public func spotKeywordsFromLogProbs(
+        logProbs: [[Float]],
+        frameDuration: Double,
+        customVocabulary: CustomVocabularyContext,
+        minScore: Float? = nil
+    ) -> SpotKeywordsResult {
+        let totalFrames = logProbs.count
+        guard totalFrames > 0 else {
+            return SpotKeywordsResult(detections: [], logProbs: [], frameDuration: 0, totalFrames: 0)
+        }
+
+        var results: [KeywordDetection] = []
+
+        for term in customVocabulary.terms {
+            guard term.text.count >= customVocabulary.minTermLength else {
+                if debugMode {
+                    logger.debug(
+                        "  Skipping '\(term.text)': too short (\(term.text.count) < \(customVocabulary.minTermLength) chars)"
+                    )
+                }
+                continue
+            }
+
+            let ids = term.ctcTokenIds ?? term.tokenIds
+            guard let ids, !ids.isEmpty else { continue }
+
+            let tokenCount = ids.count
+            let adjustedThreshold: Float =
+                minScore.map { base in
+                    let extraTokens = max(0, tokenCount - ContextBiasingConstants.baselineTokenCountForThreshold)
+                    return base - Float(extraTokens) * ContextBiasingConstants.thresholdRelaxationPerToken
+                } ?? ContextBiasingConstants.defaultMinSpotterScore
+
+            let multipleDetections = ctcWordSpotMultiple(
+                logProbs: logProbs,
+                keywordTokens: ids,
+                minScore: adjustedThreshold,
+                mergeOverlap: true
+            )
+
+            for (score, start, end) in multipleDetections {
+                let startTime = TimeInterval(start) * frameDuration
+                let endTime = TimeInterval(end) * frameDuration
+
+                let detection = KeywordDetection(
+                    term: term,
+                    score: score,
+                    totalFrames: totalFrames,
+                    startFrame: start,
+                    endFrame: end,
+                    startTime: startTime,
+                    endTime: endTime
+                )
+                results.append(detection)
+            }
+        }
+
+        return SpotKeywordsResult(
+            detections: results,
+            logProbs: logProbs,
+            frameDuration: frameDuration,
+            totalFrames: totalFrames
+        )
+    }
+
+    // MARK: - Log-Probability Conversion
+
+    /// Convert raw CTC logits to log-probabilities with temperature scaling and blank bias.
+    /// Use this to post-process raw logits from a unified Preprocessor before passing to
+    /// `spotKeywordsFromLogProbs` or `VocabularyRescorer.ctcTokenRescore`.
+    ///
+    /// - Parameters:
+    ///   - rawLogits: Raw CTC logits [T, V] (before softmax).
+    ///   - blankId: Index of the blank token in the vocabulary.
+    ///   - temperature: Divisor applied to the logits before softmax (default from ContextBiasingConstants).
+    ///   - blankBias: Penalty subtracted from the blank token log-probability (default from ContextBiasingConstants).
+    /// - Returns: Log-probabilities [T, V]: temperature-scaled log-softmax, then blank bias; empty rows stay empty.
+    public static func applyLogSoftmax(
+        rawLogits: [[Float]],
+        blankId: Int,
+        temperature: Float = ContextBiasingConstants.ctcTemperature,
+        blankBias: Float = ContextBiasingConstants.blankBias
+    ) -> [[Float]] {
+        var logProbs = [[Float]]()
+        logProbs.reserveCapacity(rawLogits.count)
+
+        for logits in rawLogits {
+            guard !logits.isEmpty else {
+                logProbs.append([])
+                continue
+            }
+
+            // Temperature scaling (skipped when temperature is exactly 1.0; a zero temperature is not guarded)
+            let scaled = temperature != 1.0 ? logits.map { $0 / temperature } : logits
+
+            // Log-softmax, subtracting the row maximum before exponentiating
+            let maxVal = scaled.max() ?? 0
+            var sumExp: Float = 0
+            for v in scaled { sumExp += expf(v - maxVal) }
+            let logSumExp = logf(sumExp)
+
+            var row = [Float](repeating: 0, count: scaled.count)
+            for i in 0..<scaled.count {
+                row[i] = (scaled[i] - maxVal) - logSumExp
+            }
+
+            // Blank bias: skipped when zero or when blankId >= row.count (a negative blankId is not guarded)
+            if blankBias != 0.0 && blankId < row.count {
+                row[blankId] -= blankBias
+            }
+
+            logProbs.append(row)
+        }
+
+        return logProbs
+    }
+
     // MARK: - NeMo-compatible DP (delegated to CtcDPAlgorithm)
 
     func ctcWordSpotConstrained(
diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift
index 1d4d7e9fc..bb52aa574 100644
--- a/Sources/FluidAudio/ModelNames.swift
+++ b/Sources/FluidAudio/ModelNames.swift
@@ -191,6 +191,7 @@ public enum ModelNames {
         public static let encoder = "Encoder"
         public static let decoder = "Decoder"
         public static let joint = "JointDecision"
+        public static let ctcHead = "CtcHead"
 
         // Shared vocabulary file across all model versions
         public static let vocabularyFile = "parakeet_vocab.json"
@@ -199,6 +200,7 @@ public enum ModelNames {
         public static let encoderFile = encoder + ".mlmodelc"
         public static let decoderFile = decoder + ".mlmodelc"
         public static let jointFile = joint + ".mlmodelc"
+        public static let ctcHeadFile = ctcHead + ".mlmodelc"
 
         public static let requiredModels: Set<String> = [
             preprocessorFile,
diff --git a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
index ec2d3ec35..4bb5da505 100644
--- a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
@@ -90,8 +90,15 @@ public enum CtcEarningsBenchmark {
                 }
             case "--tdt-version":
                 if i + 1 < arguments.count {
-                    if arguments[i + 1] == "v2" || arguments[i + 1] == "2" {
+                    switch arguments[i + 1].lowercased() {
+                    case "v2", "2":
                         tdtVersion = .v2
+                    case "v3", "3":
+                        tdtVersion = .v3
+                    case "110m", "ctc-110m", "tdt-ctc-110m":
+                        tdtVersion = .tdtCtc110m
+                    default:
+                        break
                     }
                     i += 1
                 }
@@ -144,7 +151,7 @@ public enum CtcEarningsBenchmark {
         print("Earnings Benchmark (TDT transcription + CTC keyword spotting)")
         print("  Data directory: \(dataDir ?? "not found")")
         print("  Output file: \(outputFile)")
-        print("  TDT version: \(tdtVersion == .v2 ? "v2" : "v3")")
+        print("  TDT version: \(tdtVersion == .v2 ? "v2" : tdtVersion == .tdtCtc110m ? "110m" : "v3")")
         print("  CTC variant: \(ctcVariant.displayName)")
         print("  CTC model: \(ctcModelPath ?? "not found")")
         print("  Keywords mode: \(keywordsMode.rawValue)")
@@ -171,7 +178,7 @@ public enum CtcEarningsBenchmark {
 
         do {
             // Load TDT models for transcription
-            print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : "v3")) for transcription...")
+            print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : tdtVersion == .tdtCtc110m ? "110m" : "v3")) for transcription...")
             let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion)
             let asrManager = AsrManager(config: .default)
             try await asrManager.initialize(models: tdtModels)
@@ -499,22 +506,29 @@ public enum CtcEarningsBenchmark {
         let customVocab = CustomVocabularyContext(terms: vocabTerms)
 
         // 3. CTC keyword spotting for high recall dictionary detection
-        let spotResult = try await spotter.spotKeywordsWithLogProbs(
-            audioSamples: samples,
-            customVocabulary: customVocab,
-            minScore: nil
-        )
-
-        // Debug: Show CTC detections with timestamps
-        if debugTimings && !spotResult.detections.isEmpty {
-            print("  CTC Detections:")
-            for detection in spotResult.detections {
-                print(
-                    "    [\(String(format: "%.2f", detection.startTime))-\(String(format: "%.2f", detection.endTime))s] \"\(detection.term.text)\" (score: \(String(format: "%.2f", detection.score)))"
-                )
-            }
+        // Use cached CTC logits from unified Preprocessor if available (no separate encoder run needed)
+        let logProbs: [[Float]]
+        let frameDuration: Double
+        if let cached = await asrManager.getCachedCtcLogProbs() {
+            // Cached values are raw logits - apply log-softmax + temperature + blank bias
+            logProbs = CtcKeywordSpotter.applyLogSoftmax(
+                rawLogits: cached.logProbs,
+                blankId: spotter.blankId
+            )
+            frameDuration = cached.frameDuration
+        } else {
+            let spotResult = try await spotter.spotKeywordsWithLogProbs(
+                audioSamples: samples,
+                customVocabulary: customVocab,
+                minScore: nil
+            )
+            logProbs = spotResult.logProbs
+            frameDuration = spotResult.frameDuration
         }
 
+        // Note: the former debug printout of CTC detections was dropped here; the fallback's spot result is
+        // scoped to the else branch above. Detections are computed further below for the dictionary counts.
+
         // 4. Post-process: Use VocabularyRescorer with timestamp-based matching (NeMo CTC-WS)
         // Set USE_TIMESTAMP_RESCORING=1 to use timestamp-based matching (default)
         // Set USE_TIMESTAMP_RESCORING=0 to use legacy string-similarity based matching
@@ -558,8 +572,8 @@ public enum CtcEarningsBenchmark {
                 let rescoreResult = rescorer.ctcTokenRescore(
                     transcript: tdtResult.text,
                     tokenTimings: tokenTimings,
-                    logProbs: spotResult.logProbs,
-                    frameDuration: spotResult.frameDuration,
+                    logProbs: logProbs,
+                    frameDuration: frameDuration,
                     cbw: cbw,
                     marginSeconds: 0.5,
                     minSimilarity: minSimilarity
@@ -602,19 +616,37 @@ public enum CtcEarningsBenchmark {
         let checkWordsLowerSet = Set(checkWords.map { $0.lowercased() })
 
         // 1. CTC detections (deduplicate - only count each word once, only if in checkWords)
+        // Use pre-computed logProbs for keyword detection when available (unified Preprocessor path)
+        let spotResult: CtcKeywordSpotter.SpotKeywordsResult
+        if !logProbs.isEmpty, await asrManager.hasCachedCtcLogits {
+            // Unified path: run DP keyword detection on cached logProbs (no CTC inference)
+            spotResult = spotter.spotKeywordsFromLogProbs(
+                logProbs: logProbs,
+                frameDuration: frameDuration,
+                customVocabulary: customVocab,
+                minScore: nil
+            )
+        } else {
+            // Separate CTC path: run full CTC inference + keyword detection
+            spotResult = try await spotter.spotKeywordsWithLogProbs(
+                audioSamples: samples,
+                customVocabulary: customVocab,
+                minScore: nil
+            )
+        }
+
         for detection in spotResult.detections {
             let detail: [String: Any] = [
                 "word": detection.term.text,
                 "score": round(Double(detection.score) * 100) / 100,
                 "startTime": round(detection.startTime * 100) / 100,
                 "endTime": round(detection.endTime * 100) / 100,
-                "source": "ctc",
+                "source": await asrManager.hasCachedCtcLogits ? "ctc-unified" : "ctc",
             ]
             detectionDetails.append(detail)
 
             if detection.score >= minCtcScore {
                 let wordLower = detection.term.text.lowercased()
-                // Only count if word is in checkWords and not already counted
                 if checkWordsLowerSet.contains(wordLower) && !ctcFoundWords.contains(wordLower) {
                     dictFound += 1
                     ctcFoundWords.insert(wordLower)

From d7fa8880f461b23f66455eebd592b253b52fff34 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 13:45:18 -0400
Subject: [PATCH 02/12] Auto-download CTC head from parakeet-ctc-110m HF repo

Instead of only loading CtcHead.mlmodelc if manually placed in the model
directory, download it on demand from FluidInference/parakeet-ctc-110m-coreml
via DownloadUtils.loadModels when the tdtCtc110m model version is used.
---
 .../FluidAudio/ASR/Parakeet/AsrModels.swift   | 28 +++++++++++--------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
index 01b5f1e8c..19e1786f8 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
@@ -211,18 +211,24 @@ extension AsrModels {
             throw AsrModelsError.loadingFailed("Failed to load decoder or joint model")
         }
 
-        // Optionally load CTC head model if present (for custom vocabulary)
-        let repoDir = repoPath(from: directory, version: version)
-        let ctcHeadPath = repoDir.appendingPathComponent(Names.ctcHeadFile)
+        // Optionally load CTC head model for custom vocabulary.
+        // The CTC head lives in the parakeetCtc110m HF repo and is downloaded on demand.
         var ctcHeadModel: MLModel?
-        if FileManager.default.fileExists(atPath: ctcHeadPath.path) {
-            let ctcConfig = MLModelConfiguration()
-            ctcConfig.computeUnits = config.computeUnits
-            ctcHeadModel = try? MLModel(contentsOf: ctcHeadPath, configuration: ctcConfig)
-            if ctcHeadModel != nil {
-                logger.info("Loaded optional CTC head model for custom vocabulary")
-            } else {
-                logger.warning("CTC head model found but failed to load: \(ctcHeadPath.path)")
+        if version == .tdtCtc110m {
+            do {
+                let ctcModels = try await DownloadUtils.loadModels(
+                    .parakeetCtc110m,
+                    modelNames: [Names.ctcHeadFile],
+                    directory: parentDirectory,
+                    computeUnits: config.computeUnits,
+                    progressHandler: progressHandler
+                )
+                ctcHeadModel = ctcModels[Names.ctcHeadFile]
+                if ctcHeadModel != nil {
+                    logger.info("Loaded CTC head model for custom vocabulary")
+                }
+            } catch {
+                logger.warning("CTC head model not available: \(error.localizedDescription)")
             }
         }
 

From 6a2e4d23aff03e8946a9b74e4d0e71d61a0ac49f Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 13:49:21 -0400
Subject: [PATCH 03/12] Support both local and HF download paths for CTC head
 (beta)

Try loading CtcHead.mlmodelc from the local TDT model directory first
(v1), then fall back to auto-downloading from the parakeet-ctc-110m HF
repo (v2). Mark CTC head loading as beta in log messages.
---
 .../FluidAudio/ASR/Parakeet/AsrModels.swift   | 47 +++++++++++++------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
index 19e1786f8..cffead51d 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift
@@ -211,24 +211,43 @@ extension AsrModels {
             throw AsrModelsError.loadingFailed("Failed to load decoder or joint model")
         }
 
-        // Optionally load CTC head model for custom vocabulary.
-        // The CTC head lives in the parakeetCtc110m HF repo and is downloaded on demand.
+        // [Beta] Optionally load CTC head model for custom vocabulary.
+        // Supports two paths:
+        //   v1: CtcHead.mlmodelc placed manually in the TDT model directory
+        //   v2: Auto-download from FluidInference/parakeet-ctc-110m-coreml HF repo
         var ctcHeadModel: MLModel?
         if version == .tdtCtc110m {
-            do {
-                let ctcModels = try await DownloadUtils.loadModels(
-                    .parakeetCtc110m,
-                    modelNames: [Names.ctcHeadFile],
-                    directory: parentDirectory,
-                    computeUnits: config.computeUnits,
-                    progressHandler: progressHandler
-                )
-                ctcHeadModel = ctcModels[Names.ctcHeadFile]
+            // v1: Check local TDT model directory first
+            let repoDir = repoPath(from: directory, version: version)
+            let ctcHeadPath = repoDir.appendingPathComponent(Names.ctcHeadFile)
+            if FileManager.default.fileExists(atPath: ctcHeadPath.path) {
+                let ctcConfig = MLModelConfiguration()
+                ctcConfig.computeUnits = config.computeUnits
+                ctcHeadModel = try? MLModel(contentsOf: ctcHeadPath, configuration: ctcConfig)
                 if ctcHeadModel != nil {
-                    logger.info("Loaded CTC head model for custom vocabulary")
+                    logger.info("[Beta] Loaded CTC head model from local directory")
+                } else {
+                    logger.warning("CTC head model found but failed to load: \(ctcHeadPath.path)")
+                }
+            }
+
+            // v2: Fall back to downloading from parakeet-ctc-110m HF repo
+            if ctcHeadModel == nil {
+                do {
+                    let ctcModels = try await DownloadUtils.loadModels(
+                        .parakeetCtc110m,
+                        modelNames: [Names.ctcHeadFile],
+                        directory: parentDirectory,
+                        computeUnits: config.computeUnits,
+                        progressHandler: progressHandler
+                    )
+                    ctcHeadModel = ctcModels[Names.ctcHeadFile]
+                    if ctcHeadModel != nil {
+                        logger.info("[Beta] Loaded CTC head model from HF repo")
+                    }
+                } catch {
+                    logger.warning("CTC head model not available: \(error.localizedDescription)")
                 }
-            } catch {
-                logger.warning("CTC head model not available: \(error.localizedDescription)")
             }
         }
 

From 55941bf80258aac704df3d0154534d3efdd0259b Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 13:57:32 -0400
Subject: [PATCH 04/12] Document standalone CTC head for custom vocabulary
 (beta)

- Update CustomVocabulary.md with dual architecture diagrams (standalone
  CTC head vs separate CTC encoder) and approach comparison table
- Add CTC head section to TDT-CTC-110M.md covering architecture, loading
  paths, performance, conversion, and beta status
- Update benchmarks100.md with standalone CTC head results (70.29x RTFx,
  1MB model, 99.4% Dict Recall)
---
 Documentation/ASR/CustomVocabulary.md | 83 ++++++++++++++++++++++++---
 Documentation/ASR/TDT-CTC-110M.md     | 69 ++++++++++++++++++++++
 Documentation/ASR/benchmarks100.md    | 27 +++++----
 3 files changed, 159 insertions(+), 20 deletions(-)

diff --git a/Documentation/ASR/CustomVocabulary.md b/Documentation/ASR/CustomVocabulary.md
index b68b55ef2..715eb9cfc 100644
--- a/Documentation/ASR/CustomVocabulary.md
+++ b/Documentation/ASR/CustomVocabulary.md
@@ -17,6 +17,58 @@ The paper introduces a dynamic programming algorithm for CTC-based keyword spott
 
 ## Architecture Overview
 
+FluidAudio supports two approaches for CTC-based custom vocabulary boosting:
+
+### Approach 1: Standalone CTC Head (Beta, Recommended for TDT-CTC-110M)
+
+```
+                  ┌─────────────────────────────────────────┐
+                  │            Audio Input                  │
+                  │           (16kHz, mono)                 │
+                  └─────────────────┬───────────────────────┘
+                                    │
+                                    ▼
+                          ┌─────────────────┐
+                          │  TDT-CTC-110M   │
+                          │  Preprocessor   │
+                          │ (fused encoder) │
+                          └────────┬────────┘
+                                   │
+                          encoder output [1, 512, T]
+                                   │
+                    ┌──────────────┴──────────────┐
+                    │                             │
+                    ▼                             ▼
+          ┌─────────────────┐           ┌─────────────────┐
+          │   TDT Decoder   │           │    CTC Head     │
+          │  + Joint Network│           │ (1MB, beta)     │
+          └────────┬────────┘           └────────┬────────┘
+                   │                             │
+                   ▼                    ctc_logits [1, T, 1025]
+          ┌─────────────────┐                    │
+          │   Raw Transcript│                    ▼
+          │  "in video corp"│           ┌─────────────────┐
+          └────────┬────────┘  Custom   │ Keyword Spotter │
+                   │         Vocabulary►│  (DP Algorithm) │
+                   │                    └────────┬────────┘
+                   └──────────────┬──────────────┘
+                                  ▼
+                        ┌─────────────────┐
+                        │   Vocabulary    │
+                        │    Rescorer     │
+                        └────────┬────────┘
+                                 │
+                                 ▼
+                        ┌─────────────────┐
+                        │ Final Transcript│
+                        │   "NVIDIA Corp" │
+                        └─────────────────┘
+```
+
+The standalone CTC head is a single linear projection (512 → 1025) extracted from the hybrid TDT-CTC-110M model. It reuses the TDT encoder output, requiring only ~1MB of additional model weight and no second encoder pass.
+
+### Approach 2: Separate CTC Encoder (Original)
+
 ```
                   ┌─────────────────────────────────────────┐
                   │            Audio Input                  │
@@ -58,24 +110,37 @@ The paper introduces a dynamic programming algorithm for CTC-based keyword spott
                             └─────────────────┘
 ```
 
-## Dual Encoder Alignment
+### Approach Comparison
+
+| | Standalone CTC Head (beta) | Separate CTC Encoder |
+|---|---|---|
+| **Additional model size** | 1 MB | 97.5 MB |
+| **Second encoder pass** | No | Yes |
+| **RTFx (earnings benchmark)** | 70.29x | 25.98x |
+| **Dict Recall** | 99.4% | 99.4% |
+| **TDT model requirement** | TDT-CTC-110M only | Any TDT model |
+| **Status** | Beta | Stable |
+
+The standalone CTC head is available only with the TDT-CTC-110M model because both the TDT and CTC heads share the same encoder in the hybrid architecture. For Parakeet TDT v2/v3 (0.6B), the separate CTC encoder approach is required.
+
+## Encoder Alignment
+
+### Separate CTC Encoder (Approach 2)
 
 The system uses two separate neural network encoders that process the same audio:
 
-### 1. TDT Encoder (Primary Transcription)
+#### TDT Encoder (Primary Transcription)
 - **Model**: Parakeet TDT 0.6B (600M parameters)
 - **Architecture**: Token Duration Transducer with FastConformer
 - **Output**: High-quality transcription with word timestamps
 - **Frame Rate**: ~40ms per frame
 
-### 2. CTC Encoder (Keyword Spotting)
+#### CTC Encoder (Keyword Spotting)
 - **Model**: Parakeet CTC 110M (110M parameters)
 - **Architecture**: FastConformer with CTC head
 - **Output**: Per-frame log-probabilities over 1024 tokens
 - **Frame Rate**: ~40ms per frame (aligned with TDT)
 
-### Frame Alignment
-
 Both encoders use the same audio preprocessing (mel spectrogram with identical parameters), producing frames at the same rate. This enables direct timestamp comparison between:
 - TDT decoder word timestamps
 - CTC keyword detection timestamps
@@ -88,18 +153,20 @@ CTC Frames: [0] [1] [2] ... [374] (375 frames @ 40ms)
             Aligned timestamps
 ```
 
-### Memory Usage
+#### Memory Usage
 
 Running two encoders in parallel increases peak memory consumption:
 
 | Configuration | Peak RAM | Notes |
 |---------------|----------|-------|
 | TDT encoder only | ~66 MB | Standard transcription |
-| TDT + CTC encoders | ~130 MB | With vocabulary boosting |
+| TDT + CTC encoders | ~130 MB | With vocabulary boosting (separate encoder) |
+| TDT + CTC head | ~67 MB | With vocabulary boosting (standalone head, beta) |
 
 *Measured on iPhone 17 Pro. Memory settles after initial model loading.*
 
-The additional ~64 MB overhead comes from the CTC encoder (Parakeet 110M) being loaded alongside the primary TDT encoder. For memory-constrained scenarios, consider:
+The standalone CTC head adds negligible memory (~1MB) since it reuses the existing encoder output. The separate CTC encoder adds ~64MB overhead. For memory-constrained scenarios, consider:
+- Using the standalone CTC head with TDT-CTC-110M (beta)
 - Loading the CTC encoder on-demand rather than at startup
 - Unloading the CTC encoder after transcription completes
 - Using vocabulary boosting only for files where domain terms are expected
diff --git a/Documentation/ASR/TDT-CTC-110M.md b/Documentation/ASR/TDT-CTC-110M.md
index 894efebae..c628a06f4 100644
--- a/Documentation/ASR/TDT-CTC-110M.md
+++ b/Documentation/ASR/TDT-CTC-110M.md
@@ -465,9 +465,78 @@ Tested on iPhone (iOS 17+):
 - Highest accuracy required
 - Extra model size acceptable
 
+## Standalone CTC Head for Custom Vocabulary (Beta)
+
+The TDT-CTC-110M hybrid model shares one FastConformer encoder between its TDT and CTC decoder heads. FluidAudio exploits this by exporting the CTC decoder head as a standalone 1MB CoreML model (`CtcHead.mlmodelc`) that runs on the existing TDT encoder output, enabling custom vocabulary keyword spotting without a second encoder pass.
+
+### How It Works
+
+```
+TDT Preprocessor (fused encoder)
+        │
+        ▼
+encoder output [1, 512, T]
+        │
+   ┌────┴────┐
+   │         │
+   ▼         ▼
+TDT Decoder  CtcHead (1MB, beta)
+   │         │
+   ▼         ▼
+transcript   ctc_logits [1, T, 1025]
+                  │
+                  ▼
+         Keyword Spotter / VocabularyRescorer
+```
+
+The CTC head is a single linear projection (512 → 1025) that maps the 512-dimensional encoder features to raw logits over 1024 BPE tokens + 1 blank token. Log-softmax is applied downstream (`CtcKeywordSpotter.applyLogSoftmax()`) to obtain log-probabilities.
+
+### Performance
+
+Benchmarked on 772 earnings call files (Earnings22-KWS):
+
+| Approach | Model Size | Dict Recall | RTFx |
+|----------|-----------|-------------|------|
+| Separate CTC encoder | 97.5 MB | 99.4% | 25.98x |
+| **Standalone CTC head** | **1 MB** | **99.4%** | **70.29x** |
+
+The standalone CTC head achieves identical keyword detection quality at 2.7x the speed, using 97x less model weight.
+
+### Loading
+
+The CTC head model auto-downloads from [FluidInference/parakeet-ctc-110m-coreml](https://huggingface.co/FluidInference/parakeet-ctc-110m-coreml) when loading the TDT-CTC-110M model. It also supports manual placement in the TDT model directory.
+
+Two loading paths are supported:
+1. **Local (v1):** Place `CtcHead.mlmodelc` in the TDT model directory (`parakeet-tdt-ctc-110m/`)
+2. **Auto-download (v2):** Automatically downloaded from the `parakeet-ctc-110m-coreml` HuggingFace repo
+
+```swift
+// CTC head loads automatically with TDT-CTC-110M models
+let models = try await AsrModels.downloadAndLoad(version: .tdtCtc110m)
+// models.ctcHead is non-nil when CtcHead.mlmodelc is available
+```
+
+### Conversion
+
+The CTC head is exported using the conversion script in the mobius repo:
+
+```bash
+cd mobius/models/stt/parakeet-tdt-ctc-110m/coreml/
+uv run python export-ctc-head.py --output-dir ./ctc-head-build
+xcrun coremlcompiler compile ctc-head-build/CtcHead.mlpackage ctc-head-build/
+```
+
+See [mobius PR #36](https://github.com/FluidInference/mobius/pull/36) for the conversion script.
+
+### Status
+
+This feature is **beta**. The CTC head matches the separate CTC encoder on Dict Recall (99.4%), but the auto-download pathway and integration are new. See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450) for details.
+
 ## Resources
 
 - **Model:** [FluidInference/parakeet-tdt-ctc-110m-coreml](https://huggingface.co/FluidInference/parakeet-tdt-ctc-110m-coreml)
+- **CTC Head model:** [FluidInference/parakeet-ctc-110m-coreml](https://huggingface.co/FluidInference/parakeet-ctc-110m-coreml) (includes CtcHead.mlmodelc)
 - **Benchmark results:** See `benchmarks.md`
 - **PR:** [#433 - Add TDT-CTC-110M support](https://github.com/FluidInference/FluidAudio/pull/433)
+- **CTC Head PR:** [#450 - Add standalone CTC head for custom vocabulary](https://github.com/FluidInference/FluidAudio/pull/450)
 - **Original NVIDIA model:** [nvidia/parakeet-tdt-1.1b](https://huggingface.co/nvidia/parakeet-tdt-1.1b)
diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md
index ae806836b..2bb646b4f 100644
--- a/Documentation/ASR/benchmarks100.md
+++ b/Documentation/ASR/benchmarks100.md
@@ -44,10 +44,10 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru
 
 ---
 
-# Issue #435: Unified CTC Head Export (Full Dataset)
+# Issue #435: Standalone CTC Head for Custom Vocabulary (Beta)
 
-Benchmark comparing separate CTC encoder vs unified CTC head exported from TDT-CTC-110M Preprocessor.
-See [#435](https://github.com/FluidInference/FluidAudio/issues/435).
+Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model.
+See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450).
 
 ## Environment
 
@@ -58,25 +58,28 @@ See [#435](https://github.com/FluidInference/FluidAudio/issues/435).
 
 ## CTC Earnings (Earnings22-KWS, 772 files)
 
-| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Unified CTC (110m TDT) |
+| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) |
 |---|---|---|---|
 | WER | 14.67% | 16.08% | 16.88% |
 | Dict Recall | 99.3% | 99.4% | 99.4% |
 | Vocab Precision | 99.8% | 99.7% | 99.6% |
 | Vocab Recall | 73.7% | 70.0% | 59.6% |
 | Vocab F-score | 84.8% | 82.0% | 74.6% |
-| RTFx | 43.94x | 25.98x | **48.35x** |
+| RTFx | 43.94x | 25.98x | **70.29x** |
+| Additional model size | 97.5 MB | 97.5 MB | **1 MB** |
 
 ## Analysis
 
-- **Dict Recall**: Identical at 99.4% between separate and unified 110m paths. The unified CTC head produces equivalent keyword detection quality.
-- **RTFx**: **48.35x** (unified) vs **25.98x** (separate 110m) = **86% speedup**. Eliminating the separate CTC encoder run nearly doubles throughput.
-- **WER**: Slight increase (16.08% → 16.88%) because the unified CTC head's logits have different characteristics than the separately-trained CTC model, affecting vocabulary rescoring decisions.
+- **Dict Recall**: Identical at 99.4% between separate CTC encoder and standalone CTC head. The CTC head produces equivalent keyword detection quality.
+- **RTFx**: **70.29x** (standalone head) vs **25.98x** (separate encoder) = **2.7x speedup**. The CTC head runs on the existing TDT encoder output with no second encoder pass.
+- **Model size**: 1 MB (standalone head) vs 97.5 MB (separate CTC encoder) = **97x smaller**.
+- **WER**: Slight increase (16.08% → 16.88%) because the CTC head's logits have different characteristics than the separately-trained CTC encoder, affecting vocabulary rescoring decisions.
 - **Vocab Recall**: Lower (70.0% → 59.6%) for the same reason — the CTC head's logit distribution differs from the standalone CTC model, leading to fewer vocabulary replacements being applied. This is a rescoring tuning issue, not a detection issue.
 
 ## Key Takeaways
 
-1. **Unified model eliminates separate CTC encoder** — single Preprocessor outputs both TDT encoder features and CTC logits
-2. **Memory reduction**: ~40MB saved by removing duplicate encoder weights
-3. **Dict Recall preserved**: Keyword detection quality is identical
-4. **RTFx nearly doubled**: No second encoder pass needed for custom vocabulary workloads
+1. **Standalone CTC head eliminates separate CTC encoder** — a 1MB linear projection on the shared TDT encoder output
+2. **97x smaller**: 1 MB vs 97.5 MB additional model weight
+3. **Dict Recall preserved**: Keyword detection quality is identical at 99.4%
+4. **2.7x faster**: No second encoder pass needed for custom vocabulary workloads
+5. **Beta status**: Auto-download from HuggingFace and local file loading both supported

From 4e787f78656fe613ff8f6d15d8f6f7890ba3ea95 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 13:59:11 -0400
Subject: [PATCH 05/12] Format CtcEarningsBenchmark.swift

---
 .../ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
index 4bb5da505..784e9f6fe 100644
--- a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
@@ -178,7 +178,9 @@ public enum CtcEarningsBenchmark {
 
         do {
             // Load TDT models for transcription
-            print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : tdtVersion == .tdtCtc110m ? "110m" : "v3")) for transcription...")
+            print(
+                "Loading TDT models (\(tdtVersion == .v2 ? "v2" : tdtVersion == .tdtCtc110m ? "110m" : "v3")) for transcription..."
+            )
             let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion)
             let asrManager = AsrManager(config: .default)
             try await asrManager.initialize(models: tdtModels)

From d83a893958dbc7a0bfe5acbe649e020cda7600c3 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 14:24:14 -0400
Subject: [PATCH 06/12] Fix review issues: stale cache, memory leak, naming,
 duplicate inference

- Skip CTC head caching for multi-chunk audio (>15s) to prevent stale
  logits from last chunk being used for full-audio rescoring
- Clear cachedCtcLogits in resetState() and cleanup() to prevent leak
- Rename getCachedCtcLogProbs() to getCachedCtcRawLogits() to accurately
  reflect that values are raw logits, not log-probabilities
- Remove duplicate CTC inference in benchmark by reusing pre-computed
  logProbs via spotKeywordsFromLogProbs() for both paths
---
 .../FluidAudio/ASR/Parakeet/AsrManager.swift  | 16 +++++++---
 .../ASR/Parakeet/AsrTranscription.swift       | 13 +++++---
 .../SlidingWindow/CtcEarningsBenchmark.swift  | 31 ++++++-------------
 3 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
index a61e335ec..1b83ba99f 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
@@ -60,9 +60,11 @@ public actor AsrManager {
     /// Whether the Preprocessor outputs CTC logits (unified custom vocabulary model).
     public var hasCachedCtcLogits: Bool { cachedCtcLogits != nil }
 
-    /// Get cached CTC logits as [[Float]] for external use (e.g. benchmarks).
-    /// Returns nil if the Preprocessor doesn't output CTC logits.
-    public func getCachedCtcLogProbs() -> (logProbs: [[Float]], frameDuration: Double)? {
+    /// Get cached CTC raw logits as [[Float]] for external use (e.g. benchmarks).
+    /// These are raw logits — callers must apply `CtcKeywordSpotter.applyLogSoftmax()`
+    /// to convert to log-probabilities before use in keyword detection.
+    /// Returns nil if the CTC head model is not available or audio was multi-chunk.
+    public func getCachedCtcRawLogits() -> (rawLogits: [[Float]], frameDuration: Double)? {
         guard let logits = cachedCtcLogits, let duration = cachedCtcFrameDuration else { return nil }
         let shape = logits.shape
         guard shape.count == 3 else { return nil }
@@ -78,7 +80,7 @@ public actor AsrManager {
             }
             result.append(frame)
         }
-        return (logProbs: result, frameDuration: duration)
+        return (rawLogits: result, frameDuration: duration)
     }
 
     // Cached prediction options for reuse
@@ -336,6 +338,8 @@ public actor AsrManager {
         let layers = asrModels?.version.decoderLayers ?? 2
         microphoneDecoderState = TdtDecoderState.make(decoderLayers: layers)
         systemDecoderState = TdtDecoderState.make(decoderLayers: layers)
+        cachedCtcLogits = nil
+        cachedCtcFrameDuration = nil
         Task { await sharedMLArrayCache.clear() }
     }
 
@@ -350,7 +354,9 @@ public actor AsrManager {
         // Reset decoder states using fresh allocations for deterministic behavior
         microphoneDecoderState = TdtDecoderState.make(decoderLayers: layers)
         systemDecoderState = TdtDecoderState.make(decoderLayers: layers)
-        // Release vocabulary boosting resources
+        // Release vocabulary boosting resources and cached CTC data
+        cachedCtcLogits = nil
+        cachedCtcFrameDuration = nil
         disableVocabularyBoosting()
         Task { await sharedMLArrayCache.clear() }
         logger.info("AsrManager resources cleaned up")
diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
index 914beb3ae..c49f918ea 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
@@ -150,8 +150,10 @@ extension AsrManager {
 
             let encoderSequenceLength = encoderLength[0].intValue
 
-            // Run CTC head on encoder output if available (for custom vocabulary)
-            if let ctcHeadModel = asrModels?.ctcHead {
+            // Run CTC head on encoder output if available (for custom vocabulary).
+            // Only cache for single-chunk audio — multi-chunk would overwrite per chunk,
+            // leaving only the last chunk's logits which is incorrect for full-audio rescoring.
+            if let ctcHeadModel = asrModels?.ctcHead, isLastChunk {
                 do {
                     let ctcInput = try MLDictionaryFeatureProvider(
                         dictionary: ["encoder_output": MLFeatureValue(multiArray: rawEncoderOutput)]
@@ -159,9 +161,12 @@ extension AsrManager {
                     let ctcOutput = try await ctcHeadModel.compatPrediction(
                         from: ctcInput, options: predictionOptions
                     )
-                    if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue {
+                    if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue,
+                        globalFrameOffset == 0
+                    {
+                        // Only cache when this is both the first and last chunk (single-chunk audio)
                         cachedCtcLogits = ctcLogits
-                        cachedCtcFrameDuration = 0.04  // 40ms per frame (80ms encoder / 2x CTC subsampling)
+                        cachedCtcFrameDuration = 0.04  // 40ms per frame
                     } else {
                         cachedCtcLogits = nil
                         cachedCtcFrameDuration = nil
diff --git a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
index 784e9f6fe..05d774b0e 100644
--- a/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
+++ b/Sources/FluidAudioCLI/Commands/ASR/Parakeet/SlidingWindow/CtcEarningsBenchmark.swift
@@ -511,10 +511,10 @@ public enum CtcEarningsBenchmark {
         // Use cached CTC logits from unified Preprocessor if available (no separate encoder run needed)
         let logProbs: [[Float]]
         let frameDuration: Double
-        if let cached = await asrManager.getCachedCtcLogProbs() {
+        if let cached = await asrManager.getCachedCtcRawLogits() {
             // Cached values are raw logits - apply log-softmax + temperature + blank bias
             logProbs = CtcKeywordSpotter.applyLogSoftmax(
-                rawLogits: cached.logProbs,
+                rawLogits: cached.rawLogits,
                 blankId: spotter.blankId
             )
             frameDuration = cached.frameDuration
@@ -618,24 +618,13 @@ public enum CtcEarningsBenchmark {
         let checkWordsLowerSet = Set(checkWords.map { $0.lowercased() })
 
         // 1. CTC detections (deduplicate - only count each word once, only if in checkWords)
-        // Use pre-computed logProbs for keyword detection when available (unified Preprocessor path)
-        let spotResult: CtcKeywordSpotter.SpotKeywordsResult
-        if !logProbs.isEmpty, await asrManager.hasCachedCtcLogits {
-            // Unified path: run DP keyword detection on cached logProbs (no CTC inference)
-            spotResult = spotter.spotKeywordsFromLogProbs(
-                logProbs: logProbs,
-                frameDuration: frameDuration,
-                customVocabulary: customVocab,
-                minScore: nil
-            )
-        } else {
-            // Separate CTC path: run full CTC inference + keyword detection
-            spotResult = try await spotter.spotKeywordsWithLogProbs(
-                audioSamples: samples,
-                customVocabulary: customVocab,
-                minScore: nil
-            )
-        }
+        // Reuse pre-computed logProbs for keyword detection (avoids duplicate CTC inference)
+        let spotResult = spotter.spotKeywordsFromLogProbs(
+            logProbs: logProbs,
+            frameDuration: frameDuration,
+            customVocabulary: customVocab,
+            minScore: nil
+        )
 
         for detection in spotResult.detections {
             let detail: [String: Any] = [
@@ -643,7 +632,7 @@ public enum CtcEarningsBenchmark {
                 "score": round(Double(detection.score) * 100) / 100,
                 "startTime": round(detection.startTime * 100) / 100,
                 "endTime": round(detection.endTime * 100) / 100,
-                "source": await asrManager.hasCachedCtcLogits ? "ctc-unified" : "ctc",
+                "source": await asrManager.hasCachedCtcLogits ? "ctc-head" : "ctc",
             ]
             detectionDetails.append(detail)
 

From f0e3dab03c40c11707036c0b47e7fa41c9c7e707 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 15:07:52 -0400
Subject: [PATCH 07/12] Pass isLastChunk: true in single-chunk transcription
 path

The CTC head guard requires isLastChunk to be true, but the single-chunk
path in transcribeWithState did not pass it, causing the CTC head to
never execute for single-chunk audio (the primary use case).
---
 Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
index c49f918ea..4c5238e36 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
@@ -41,7 +41,8 @@ extension AsrManager {
                 paddedAudio,
                 originalLength: frameAlignedLength,
                 actualAudioFrames: nil,  // Will be calculated from originalLength
-                decoderState: &decoderState
+                decoderState: &decoderState,
+                isLastChunk: true  // Single-chunk: always first and last
             )
 
             var result = processTranscriptionResult(

From e8c0a7139f79c740478dcb0adcdc95fe9ef65758 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 15:09:12 -0400
Subject: [PATCH 08/12] Remove #435 benchmark section from benchmarks100.md

---
 Documentation/ASR/benchmarks100.md | 41 ------------------------------
 1 file changed, 41 deletions(-)

diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md
index 2bb646b4f..99c1113f1 100644
--- a/Documentation/ASR/benchmarks100.md
+++ b/Documentation/ASR/benchmarks100.md
@@ -42,44 +42,3 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru
 
 **No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes.
 
----
-
-# Issue #435: Standalone CTC Head for Custom Vocabulary (Beta)
-
-Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model.
-See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450).
-
-## Environment
-
-- **Hardware**: MacBook Air M2, 16 GB
-- **Build**: `swift build -c release`
-- **Date**: 2026-03-28
-- **Branch**: `ctc-head-export`
-
-## CTC Earnings (Earnings22-KWS, 772 files)
-
-| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) |
-|---|---|---|---|
-| WER | 14.67% | 16.08% | 16.88% |
-| Dict Recall | 99.3% | 99.4% | 99.4% |
-| Vocab Precision | 99.8% | 99.7% | 99.6% |
-| Vocab Recall | 73.7% | 70.0% | 59.6% |
-| Vocab F-score | 84.8% | 82.0% | 74.6% |
-| RTFx | 43.94x | 25.98x | **70.29x** |
-| Additional model size | 97.5 MB | 97.5 MB | **1 MB** |
-
-## Analysis
-
-- **Dict Recall**: Identical at 99.4% between separate CTC encoder and standalone CTC head. The CTC head produces equivalent keyword detection quality.
-- **RTFx**: **70.29x** (standalone head) vs **25.98x** (separate encoder) = **2.7x speedup**. The CTC head runs on the existing TDT encoder output with no second encoder pass.
-- **Model size**: 1 MB (standalone head) vs 97.5 MB (separate CTC encoder) = **97x smaller**.
-- **WER**: Slight increase (16.08% → 16.88%) because the CTC head's logits have different characteristics than the separately-trained CTC encoder, affecting vocabulary rescoring decisions.
-- **Vocab Recall**: Lower (70.0% → 59.6%) for the same reason — the CTC head's logit distribution differs from the standalone CTC model, leading to fewer vocabulary replacements being applied. This is a rescoring tuning issue, not a detection issue.
-
-## Key Takeaways
-
-1. **Standalone CTC head eliminates separate CTC encoder** — a 1MB linear projection on the shared TDT encoder output
-2. **97x smaller**: 1 MB vs 97.5 MB additional model weight
-3. **Dict Recall preserved**: Keyword detection quality is identical at 99.4%
-4. **2.7x faster**: No second encoder pass needed for custom vocabulary workloads
-5. **Beta status**: Auto-download from HuggingFace and local file loading both supported

From d9fbbb005bc21126d2d2c43951b3b02ca976412d Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 15:12:00 -0400
Subject: [PATCH 09/12] Add CTC head benchmark data to Benchmarks.md

---
 Documentation/Benchmarks.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index 7a5976a41..580cb8776 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -150,6 +150,17 @@ Derived metrics:
 | Recall    | TP / (TP + FN)      | "Of words that should appear, how many did we find?" |
 | F-Score   | 2 × P × R / (P + R) | Harmonic mean of precision and recall                |
 
+### Issue #435: Standalone CTC Head for Custom Vocabulary (Beta)
+
+Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model.
+See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450).
+
+| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) |
+|---|---|---|---|
+| Dict Recall | 99.3% | 99.4% | 99.4% |
+| RTFx | 43.94x | 25.98x | 70.29x |
+| Additional model size | 97.5 MB | 97.5 MB | 1 MB |
+
 ## Text-to-Speech
 
 We generated the same strings with to generate audio between 1s to ~300s in order to test the speed across a range of varying inputs on Pytorch CPU, MPS, and MLX pipeline, and compared it against the native Swift version with Core ML models.

From cb4f293c3ad2cf8efd29c4e89f16f47e3ce1bff1 Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 15:15:53 -0400
Subject: [PATCH 10/12] Move CTC head benchmarks to benchmarks100.md

---
 Documentation/ASR/benchmarks100.md | 11 +++++++++++
 Documentation/Benchmarks.md        | 11 -----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md
index 99c1113f1..6220c153e 100644
--- a/Documentation/ASR/benchmarks100.md
+++ b/Documentation/ASR/benchmarks100.md
@@ -42,3 +42,14 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru
 
 **No regressions.** WER is identical across all 6 benchmarks. RTFx differences are within normal system noise (M2 thermals, background processes). The directory restructuring is a pure file move with no behavioral changes.
 
+## Issue #435: Standalone CTC Head for Custom Vocabulary (Beta)
+
+Benchmark comparing a separate CTC encoder with the standalone CTC head extracted from the TDT-CTC-110M hybrid model.
+See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450).
+
+| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) |
+|---|---|---|---|
+| Dict Recall | 99.3% | 99.4% | 99.4% |
+| RTFx | 43.94x | 25.98x | 70.29x |
+| Additional model size | 97.5 MB | 97.5 MB | 1 MB |
+
diff --git a/Documentation/Benchmarks.md b/Documentation/Benchmarks.md
index 580cb8776..7a5976a41 100644
--- a/Documentation/Benchmarks.md
+++ b/Documentation/Benchmarks.md
@@ -150,17 +150,6 @@ Derived metrics:
 | Recall    | TP / (TP + FN)      | "Of words that should appear, how many did we find?" |
 | F-Score   | 2 × P × R / (P + R) | Harmonic mean of precision and recall                |
 
-### Issue #435: Standalone CTC Head for Custom Vocabulary (Beta)
-
-Benchmark comparing separate CTC encoder vs standalone CTC head extracted from the TDT-CTC-110M hybrid model.
-See [#435](https://github.com/FluidInference/FluidAudio/issues/435) and [PR #450](https://github.com/FluidInference/FluidAudio/pull/450).
-
-| Metric | Separate CTC (v2 TDT) | Separate CTC (110m TDT) | Standalone CTC Head (110m TDT) |
-|---|---|---|---|
-| Dict Recall | 99.3% | 99.4% | 99.4% |
-| RTFx | 43.94x | 25.98x | 70.29x |
-| Additional model size | 97.5 MB | 97.5 MB | 1 MB |
-
 ## Text-to-Speech
 
 We generated the same strings with to generate audio between 1s to ~300s in order to test the speed across a range of varying inputs on Pytorch CPU, MPS, and MLX pipeline, and compared it against the native Swift version with Core ML models.

From e093ed2b078d85b4405b55bab3da44f278e2581a Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 15:36:54 -0400
Subject: [PATCH 11/12] Trim CTC head logits to valid encoder frames, excluding
 padding

---
 Sources/FluidAudio/ASR/Parakeet/AsrManager.swift       | 5 ++++-
 Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift | 6 +++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
index 1b83ba99f..5d82a8678 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift
@@ -56,6 +56,7 @@ public actor AsrManager {
     // Cached CTC logits from fused Preprocessor (unified custom vocabulary)
     internal var cachedCtcLogits: MLMultiArray?
     internal var cachedCtcFrameDuration: Double?
+    internal var cachedCtcValidFrames: Int?
 
     /// Whether the Preprocessor outputs CTC logits (unified custom vocabulary model).
     public var hasCachedCtcLogits: Bool { cachedCtcLogits != nil }
@@ -68,7 +69,7 @@ public actor AsrManager {
         guard let logits = cachedCtcLogits, let duration = cachedCtcFrameDuration else { return nil }
         let shape = logits.shape
         guard shape.count == 3 else { return nil }
-        let numFrames = shape[1].intValue
+        let numFrames = min(shape[1].intValue, cachedCtcValidFrames ?? shape[1].intValue)
         let vocabSize = shape[2].intValue
         var result: [[Float]] = []
         result.reserveCapacity(numFrames)
@@ -340,6 +341,7 @@ public actor AsrManager {
         systemDecoderState = TdtDecoderState.make(decoderLayers: layers)
         cachedCtcLogits = nil
         cachedCtcFrameDuration = nil
+        cachedCtcValidFrames = nil
         Task { await sharedMLArrayCache.clear() }
     }
 
@@ -357,6 +359,7 @@ public actor AsrManager {
         // Release vocabulary boosting resources and cached CTC data
         cachedCtcLogits = nil
         cachedCtcFrameDuration = nil
+        cachedCtcValidFrames = nil
         disableVocabularyBoosting()
         Task { await sharedMLArrayCache.clear() }
         logger.info("AsrManager resources cleaned up")
diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
index 4c5238e36..2b7b81276 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
@@ -168,18 +168,22 @@ extension AsrManager {
                         // Only cache when this is both the first and last chunk (single-chunk audio)
                         cachedCtcLogits = ctcLogits
                         cachedCtcFrameDuration = 0.04  // 40ms per frame
+                        cachedCtcValidFrames = encoderSequenceLength
                     } else {
                         cachedCtcLogits = nil
                         cachedCtcFrameDuration = nil
+                        cachedCtcValidFrames = nil
                     }
                 } catch {
                     logger.warning("CTC head inference failed: \(error.localizedDescription)")
                     cachedCtcLogits = nil
                     cachedCtcFrameDuration = nil
+                    cachedCtcValidFrames = nil
                 }
             } else {
                 cachedCtcLogits = nil
                 cachedCtcFrameDuration = nil
+                cachedCtcValidFrames = nil
             }
 
             // Calculate actual audio frames if not provided using shared constants
@@ -659,7 +663,7 @@ extension AsrManager {
             return []
         }
 
-        let numFrames = shape[1].intValue
+        let numFrames = min(shape[1].intValue, cachedCtcValidFrames ?? shape[1].intValue)
         let vocabSize = shape[2].intValue
 
         // Extract raw logits

From 1adfd8f16a2f8fbffafcafeb54002b91b71dc63b Mon Sep 17 00:00:00 2001
From: Alex-Wengg <hanweng9@gmail.com>
Date: Sat, 28 Mar 2026 16:04:30 -0400
Subject: [PATCH 12/12] Skip CTC head inference on multi-chunk audio

---
 Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
index 2b7b81276..55f94be18 100644
--- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
+++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift
@@ -154,7 +154,7 @@ extension AsrManager {
             // Run CTC head on encoder output if available (for custom vocabulary).
             // Only cache for single-chunk audio — multi-chunk would overwrite per chunk,
             // leaving only the last chunk's logits which is incorrect for full-audio rescoring.
-            if let ctcHeadModel = asrModels?.ctcHead, isLastChunk {
+            if let ctcHeadModel = asrModels?.ctcHead, isLastChunk, globalFrameOffset == 0 {
                 do {
                     let ctcInput = try MLDictionaryFeatureProvider(
                         dictionary: ["encoder_output": MLFeatureValue(multiArray: rawEncoderOutput)]
@@ -162,10 +162,7 @@ extension AsrManager {
                     let ctcOutput = try await ctcHeadModel.compatPrediction(
                         from: ctcInput, options: predictionOptions
                     )
-                    if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue,
-                        globalFrameOffset == 0
-                    {
-                        // Only cache when this is both the first and last chunk (single-chunk audio)
+                    if let ctcLogits = ctcOutput.featureValue(for: "ctc_logits")?.multiArrayValue {
                         cachedCtcLogits = ctcLogits
                         cachedCtcFrameDuration = 0.04  // 40ms per frame
                         cachedCtcValidFrames = encoderSequenceLength