From 8df07fe2391cd0e9c0755d0ca953305091ac0e90 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 18:18:58 -0400 Subject: [PATCH 01/16] Move vocabulary boosting out of AsrManager and rename transcribeStreamingChunk AsrManager is the low-level inference engine and shouldn't own vocabulary boosting state or carry streaming semantics. This removes the duplicated vocabulary config/rescoring from AsrManager, leaving SlidingWindowAsrManager as the single owner. CTC head inference and cached logit storage remain in AsrManager since they need encoder_output MLMultiArray access. Closes #457 --- .../FluidAudio/ASR/Parakeet/AsrManager.swift | 60 +------ .../ASR/Parakeet/AsrTranscription.swift | 148 +----------------- .../SlidingWindowAsrManager.swift | 2 +- 3 files changed, 7 insertions(+), 203 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift index 5d82a8678..add3f6700 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift @@ -45,14 +45,6 @@ public actor AsrManager { internal var microphoneDecoderState: TdtDecoderState internal var systemDecoderState: TdtDecoderState - // Vocabulary boosting state (configured via configureVocabularyBoosting) - // Internal access required for AsrTranscription extension (separate file) - internal var customVocabulary: CustomVocabularyContext? - internal var ctcSpotter: CtcKeywordSpotter? - internal var vocabularyRescorer: VocabularyRescorer? - internal var vocabSizeConfig: ContextBiasingConstants.VocabSizeConfig? - internal var vocabBoostingEnabled: Bool { customVocabulary != nil && vocabularyRescorer != nil } - // Cached CTC logits from fused Preprocessor (unified custom vocabulary) internal var cachedCtcLogits: MLMultiArray? internal var cachedCtcFrameDuration: Double? @@ -157,55 +149,6 @@ public actor AsrManager { logger.info("AsrManager initialized successfully with provided models") } - /// Configure vocabulary boosting for batch transcription. - /// - /// When configured, vocabulary terms will be automatically rescored after each `transcribe()` call - /// using CTC-based constrained decoding. The resulting `ASRResult` will have `ctcDetectedTerms` - /// and `ctcAppliedTerms` populated. - /// - /// - Parameters: - /// - vocabulary: Custom vocabulary context with terms to detect - /// - ctcModels: Pre-loaded CTC models for keyword spotting - /// - config: Optional rescorer configuration (default: vocabulary-size-aware config) - /// - Throws: Error if rescorer initialization fails - public func configureVocabularyBoosting( - vocabulary: CustomVocabularyContext, - ctcModels: CtcModels, - config: VocabularyRescorer.Config? = nil - ) async throws { - self.customVocabulary = vocabulary - - let blankId = ctcModels.vocabulary.count - self.ctcSpotter = CtcKeywordSpotter(models: ctcModels, blankId: blankId) - - let vocabSize = vocabulary.terms.count - let vocabConfig = ContextBiasingConstants.rescorerConfig(forVocabSize: vocabSize) - self.vocabSizeConfig = vocabConfig - let effectiveConfig = config ?? .default - - let ctcModelDir = CtcModels.defaultCacheDirectory(for: ctcModels.variant) - self.vocabularyRescorer = try await VocabularyRescorer.create( - spotter: ctcSpotter!, - vocabulary: vocabulary, - config: effectiveConfig, - ctcModelDirectory: ctcModelDir - ) - - let isLargeVocab = vocabSize > ContextBiasingConstants.largeVocabThreshold - logger.info( - "Vocabulary boosting configured with \(vocabSize) terms (isLargeVocab: \(isLargeVocab))" - ) - } - - /// Disable vocabulary boosting and release CTC models. - public func disableVocabularyBoosting() { - customVocabulary = nil - ctcSpotter = nil - vocabularyRescorer = nil - vocabSizeConfig = nil - logger.info("Vocabulary boosting disabled") - } - private func createFeatureProvider( features: [(name: String, array: MLMultiArray)] ) throws @@ -356,11 +299,10 @@ public actor AsrManager { // Reset decoder states using fresh allocations for deterministic behavior microphoneDecoderState = TdtDecoderState.make(decoderLayers: layers) systemDecoderState = TdtDecoderState.make(decoderLayers: layers) - // Release vocabulary boosting resources and cached CTC data + // Release cached CTC data cachedCtcLogits = nil cachedCtcFrameDuration = nil cachedCtcValidFrames = nil - disableVocabularyBoosting() Task { await sharedMLArrayCache.clear() } logger.info("AsrManager resources cleaned up") } diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift index 55f94be18..a41bfd560 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift @@ -45,7 +45,7 @@ extension AsrManager { isLastChunk: true // Single-chunk: always first and last ) - var result = processTranscriptionResult( + let result = processTranscriptionResult( tokenIds: hypothesis.ySequence, timestamps: hypothesis.timestamps, confidences: hypothesis.tokenConfidences, @@ -55,11 +55,6 @@ extension AsrManager { processingTime: Date().timeIntervalSince(startTime) ) - // Auto-apply vocabulary rescoring when configured - if vocabBoostingEnabled { - result = await applyVocabularyRescoring(result: result, audioSamples: audioSamples) - } - // Store decoder state back switch source { case .microphone: @@ -73,7 +68,7 @@ extension AsrManager { // ChunkProcessor handles stateless chunked transcription for long audio let processor = ChunkProcessor(audioSamples: audioSamples) - var result = try await processor.process( + let result = try await processor.process( using: self, startTime: startTime, progressHandler: { [weak self] progress in @@ -82,11 +77,6 @@ extension AsrManager { } ) - // Auto-apply vocabulary rescoring when configured - if vocabBoostingEnabled { - result = await applyVocabularyRescoring(result: result, audioSamples: audioSamples) - } - // Store decoder state back (ChunkProcessor uses the stored state directly) switch source { case .microphone: @@ -250,9 +240,9 @@ extension AsrManager { return try MLDictionaryFeatureProvider(dictionary: features) } - /// Streaming-friendly chunk transcription that preserves decoder state and supports start-frame offset. - /// This is used by both sliding window chunking and streaming paths to unify behavior. - public func transcribeStreamingChunk( + /// Chunk transcription that preserves decoder state between calls. + /// Used by SlidingWindowAsrManager for overlapping-window processing with token deduplication. + public func transcribeChunk( _ chunkSamples: [Float], source: AudioSource, previousTokens: [Int] = [], @@ -558,132 +548,4 @@ extension AsrManager { return 0 } - // MARK: - Vocabulary Rescoring - - /// Apply vocabulary rescoring to an ASRResult using CTC-based constrained decoding. - /// - /// Runs CTC inference on the audio samples and applies vocabulary rescoring to correct - /// misrecognized words. Returns an updated ASRResult with rescored text and populated - /// `ctcDetectedTerms`/`ctcAppliedTerms` fields. - /// - /// - Parameters: - /// - result: The original ASRResult from transcription - /// - audioSamples: Audio samples used for CTC inference - /// - Returns: An ASRResult with rescored text and CTC metadata, or the original result if rescoring was skipped - internal func applyVocabularyRescoring( - result: ASRResult, audioSamples: [Float] - ) async -> ASRResult { - guard let rescorer = vocabularyRescorer, - let vocab = customVocabulary, - let tokenTimings = result.tokenTimings, !tokenTimings.isEmpty - else { - return result - } - - do { - // Try to use cached CTC logits from unified Preprocessor first - let logProbs: [[Float]] - let frameDuration: Double - - if let cached = cachedCtcLogits, let duration = cachedCtcFrameDuration { - // Convert MLMultiArray to [[Float]] - logProbs = convertCtcLogitsToArray(cached) - frameDuration = duration - logger.debug("Using cached CTC logits from Preprocessor (unified model)") - } else if let spotter = ctcSpotter { - // Fallback: run separate CTC encoder - let spotResult = try await spotter.spotKeywordsWithLogProbs( - audioSamples: audioSamples, - customVocabulary: vocab, - minScore: nil - ) - logProbs = spotResult.logProbs - frameDuration = spotResult.frameDuration - logger.debug("Using separate CTC encoder (legacy dual-model approach)") - } else { - logger.warning("Vocabulary rescoring skipped: no CTC logits available") - return result - } - - guard !logProbs.isEmpty else { - logger.debug("Vocabulary rescoring skipped: no log probs from CTC") - return result - } - - let vocabConfig = vocabSizeConfig ?? ContextBiasingConstants.rescorerConfig(forVocabSize: 0) - // Use the higher of the size-based default and the caller-specified threshold - // so that CustomVocabularyContext.minSimilarity is respected when stricter. - let effectiveMinSimilarity = max(vocabConfig.minSimilarity, vocab.minSimilarity) - - let rescoreOutput = rescorer.ctcTokenRescore( - transcript: result.text, - tokenTimings: tokenTimings, - logProbs: logProbs, - frameDuration: frameDuration, - cbw: vocabConfig.cbw, - marginSeconds: 0.5, - minSimilarity: effectiveMinSimilarity - ) - - guard rescoreOutput.wasModified else { - return result - } - - let detected = rescoreOutput.replacements.compactMap { $0.replacementWord } - let applied = rescoreOutput.replacements.filter { $0.shouldReplace }.compactMap { - $0.replacementWord - } - - logger.info( - "Vocabulary rescoring applied \(applied.count) replacement(s)" - ) - - return result.withRescoring( - text: rescoreOutput.text, - detected: detected.isEmpty ? nil : detected, - applied: applied.isEmpty ? nil : applied - ) - } catch { - logger.warning("Vocabulary rescoring failed: \(error.localizedDescription)") - return result - } - } - - /// Convert CTC logits MLMultiArray to log-probabilities [[Float]] for rescoring. - /// Applies log-softmax with temperature scaling and blank bias to match - /// the processing done in `CtcKeywordSpotter.computeLogProbs`. - private func convertCtcLogitsToArray(_ ctcLogits: MLMultiArray) -> [[Float]] { - // Expected shape: [1, T, V] where T = frames, V = vocab size - let shape = ctcLogits.shape - guard shape.count == 3 else { - logger.warning("Unexpected CTC logits shape: \(shape)") - return [] - } - - let numFrames = min(shape[1].intValue, cachedCtcValidFrames ?? shape[1].intValue) - let vocabSize = shape[2].intValue - - // Extract raw logits - var rawLogits: [[Float]] = [] - rawLogits.reserveCapacity(numFrames) - - for t in 0.. Date: Sat, 28 Mar 2026 19:32:06 -0400 Subject: [PATCH 02/16] Address tech debt items across ASR, Diarizer, and Shared modules - Remove deprecated calculateStartFrameOffset and its tests - Add explicit parakeetTdtCtc110m case to Repo.folderName - Extract duplicated defaultConfiguration() and defaultModelsDirectory() into shared MLModelConfigurationUtils, replacing 5+3 copy-pasted methods - Rename StreamingAudioSourceFactory/SampleSource/Error to drop misleading "Streaming" prefix (types are used by both ASR and Diarizer) - Rename files to match their type names (SortformerDiarizer, LSEENDDiarizer, NemotronStreamingAsrManager+Pipeline) - Remove stale TODO and duplicate vocabularyFileArray constant - Remove outdated nonisolated(unsafe) from SlidingWindowAsrManager - Replace force unwraps in RnntDecoder with guard let + throw --- .../FluidAudio/ASR/Parakeet/AsrManager.swift | 4 +- .../FluidAudio/ASR/Parakeet/AsrModels.swift | 14 +---- .../ASR/Parakeet/AsrTranscription.swift | 7 --- .../ASR/Parakeet/ChunkProcessor.swift | 4 +- .../WordSpotting/CtcModels.swift | 5 +- .../SlidingWindowAsrManager.swift | 6 +- ...emotronStreamingAsrManager+Pipeline.swift} | 0 .../ASR/Parakeet/Streaming/RnntDecoder.swift | 28 ++++++++-- .../Diarizer/Core/DiarizerModels.swift | 13 +---- ...DiarizerAPI.swift => LSEENDDiarizer.swift} | 0 .../Offline/Core/OfflineDiarizerManager.swift | 4 +- .../Offline/Core/OfflineDiarizerModels.swift | 11 +--- .../OfflineEmbeddingExtractor.swift | 4 +- .../OfflineSegmentationProcessor.swift | 2 +- ...ipeline.swift => SortformerDiarizer.swift} | 0 .../Sortformer/SortformerModelInference.swift | 5 +- Sources/FluidAudio/ModelNames.swift | 13 ++--- ...leSource.swift => AudioSampleSource.swift} | 6 +- ...Factory.swift => AudioSourceFactory.swift} | 28 +++++----- .../Shared/MLModelConfigurationUtils.swift | 36 ++++++++++++ .../Commands/ProcessCommand.swift | 2 +- .../Parakeet/AsrManagerExtensionTests.swift | 55 ------------------- .../ASR/Parakeet/AsrModelsTests.swift | 4 +- .../ASR/Parakeet/ModelNamesTests.swift | 4 +- 24 files changed, 105 insertions(+), 150 deletions(-) rename Sources/FluidAudio/ASR/Parakeet/Streaming/Nemotron/{NemotronPipeline.swift => NemotronStreamingAsrManager+Pipeline.swift} (100%) rename Sources/FluidAudio/Diarizer/LS-EEND/{LSEENDDiarizerAPI.swift => LSEENDDiarizer.swift} (100%) rename Sources/FluidAudio/Diarizer/Sortformer/{SortformerDiarizerPipeline.swift => SortformerDiarizer.swift} (100%) rename Sources/FluidAudio/Shared/{StreamingAudioSampleSource.swift => AudioSampleSource.swift} (91%) rename Sources/FluidAudio/Shared/{StreamingAudioSourceFactory.swift => AudioSourceFactory.swift} (87%) create mode 100644 Sources/FluidAudio/Shared/MLModelConfigurationUtils.swift diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift index add3f6700..ce93e706a 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift @@ -41,7 +41,7 @@ public actor AsrManager { } #endif - // TODO:: the decoder state should be moved higher up in the API interface + // Per-source decoder states are actor-internal; callers reset via resetDecoderState(). internal var microphoneDecoderState: TdtDecoderState internal var systemDecoderState: TdtDecoderState @@ -434,7 +434,7 @@ public actor AsrManager { let startTime = Date() // Create a disk-backed source for memory-efficient access - let factory = StreamingAudioSourceFactory() + let factory = AudioSourceFactory() let (sampleSource, _) = try factory.makeDiskBackedSource( from: url, targetSampleRate: config.sampleRate diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift index cffead51d..32b1aeeca 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift @@ -361,11 +361,8 @@ extension AsrModels { } public static func defaultConfiguration() -> MLModelConfiguration { - let config = MLModelConfiguration() - config.allowLowPrecisionAccumulationOnGPU = true // Prefer Neural Engine across platforms for ASR inference to avoid GPU dispatch. - config.computeUnits = .cpuAndNeuralEngine - return config + MLModelConfigurationUtils.defaultConfiguration(computeUnits: .cpuAndNeuralEngine) } /// Create optimized configuration for specific model type @@ -536,14 +533,7 @@ extension AsrModels { } public static func defaultCacheDirectory(for version: AsrModelVersion = .v3) -> URL { - let appSupport = FileManager.default.urls( - for: .applicationSupportDirectory, in: .userDomainMask - ).first! - return - appSupport - .appendingPathComponent("FluidAudio", isDirectory: true) - .appendingPathComponent("Models", isDirectory: true) - .appendingPathComponent(version.repo.folderName, isDirectory: true) + MLModelConfigurationUtils.defaultModelsDirectory(for: version.repo) } // Legacy method for backward compatibility diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift index a41bfd560..d0900763d 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift @@ -541,11 +541,4 @@ extension AsrManager { return (workingCurrent, removedCount) } - /// Calculate start frame offset for a sliding window segment (deprecated - now handled by timeJump) - nonisolated internal func calculateStartFrameOffset(segmentIndex: Int, leftContextSeconds: Double) -> Int { - // This method is deprecated as frame tracking is now handled by the decoder's timeJump mechanism - // Kept for test compatibility - return 0 - } - } diff --git a/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift b/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift index 5e15b7618..cbbe722b3 100644 --- a/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift +++ b/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift @@ -3,7 +3,7 @@ import Foundation import OSLog struct ChunkProcessor { - let sampleSource: StreamingAudioSampleSource + let sampleSource: AudioSampleSource let totalSamples: Int private let logger = AppLogger(category: "ChunkProcessor") @@ -46,7 +46,7 @@ struct ChunkProcessor { } /// Initialize with a streaming audio sample source for memory-efficient processing. - init(sampleSource: StreamingAudioSampleSource) { + init(sampleSource: AudioSampleSource) { self.sampleSource = sampleSource self.totalSamples = sampleSource.sampleCount } diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/CustomVocabulary/WordSpotting/CtcModels.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/CustomVocabulary/WordSpotting/CtcModels.swift index 723f9f8b5..8b516e6b4 100644 --- a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/CustomVocabulary/WordSpotting/CtcModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/CustomVocabulary/WordSpotting/CtcModels.swift @@ -248,10 +248,7 @@ extension CtcModels { /// Default CoreML configuration for CTC inference. public static func defaultConfiguration() -> MLModelConfiguration { - let config = MLModelConfiguration() - config.allowLowPrecisionAccumulationOnGPU = true - config.computeUnits = .cpuAndNeuralEngine - return config + MLModelConfigurationUtils.defaultConfiguration(computeUnits: .cpuAndNeuralEngine) } /// Check whether required CTC model bundles and vocabulary exist at a directory. diff --git a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/SlidingWindowAsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/SlidingWindowAsrManager.swift index 472e25317..576617f4a 100644 --- a/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/SlidingWindowAsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/SlidingWindow/SlidingWindowAsrManager.swift @@ -48,11 +48,9 @@ public actor SlidingWindowAsrManager { // Vocabulary boosting // These are initialized via configureVocabularyBoosting() before start() - // CtcKeywordSpotter and VocabularyRescorer contain CoreML models which are not Sendable. - // We manage the safety ourselves by only accessing them from within the actor. private var customVocabulary: CustomVocabularyContext? - nonisolated(unsafe) private var ctcSpotter: CtcKeywordSpotter? - nonisolated(unsafe) private var vocabularyRescorer: VocabularyRescorer? + private var ctcSpotter: CtcKeywordSpotter? + private var vocabularyRescorer: VocabularyRescorer? private var vocabSizeConfig: ContextBiasingConstants.VocabSizeConfig? private var vocabBoostingEnabled: Bool { customVocabulary != nil && vocabularyRescorer != nil } diff --git a/Sources/FluidAudio/ASR/Parakeet/Streaming/Nemotron/NemotronPipeline.swift b/Sources/FluidAudio/ASR/Parakeet/Streaming/Nemotron/NemotronStreamingAsrManager+Pipeline.swift similarity index 100% rename from Sources/FluidAudio/ASR/Parakeet/Streaming/Nemotron/NemotronPipeline.swift rename to Sources/FluidAudio/ASR/Parakeet/Streaming/Nemotron/NemotronStreamingAsrManager+Pipeline.swift diff --git a/Sources/FluidAudio/ASR/Parakeet/Streaming/RnntDecoder.swift b/Sources/FluidAudio/ASR/Parakeet/Streaming/RnntDecoder.swift index 08f6aef3c..bfeb071fd 100644 --- a/Sources/FluidAudio/ASR/Parakeet/Streaming/RnntDecoder.swift +++ b/Sources/FluidAudio/ASR/Parakeet/Streaming/RnntDecoder.swift @@ -89,7 +89,10 @@ public final class RnntDecoder { let decoderInput = try prepareDecoderInput(lastToken: lastToken, h: hState, c: cState) let decoderOutput = try decoderModel.prediction(from: decoderInput) - var decoderStep = decoderOutput.featureValue(for: "decoder")!.multiArrayValue! + guard let decoderArray = decoderOutput.featureValue(for: "decoder")?.multiArrayValue else { + throw RnntDecoderError.missingOutput("decoder") + } + var decoderStep = decoderArray // Decoder outputs [1, 640, 2] - NeMo uses the LAST frame if decoderStep.shape.count == 3 && decoderStep.shape[2].intValue > 1 { // Slice to keep only the last frame [1, 640, 1] @@ -106,7 +109,9 @@ public final class RnntDecoder { // 3. Get Token ID // Output "token_id" is [1, 1, 1] (argmax) - let tokenIdMultiArray = jointOutput.featureValue(for: "token_id")!.multiArrayValue! + guard let tokenIdMultiArray = jointOutput.featureValue(for: "token_id")?.multiArrayValue else { + throw RnntDecoderError.missingOutput("token_id") + } let tokenId = tokenIdMultiArray[0].int32Value if tokenId == blankId { @@ -120,8 +125,12 @@ public final class RnntDecoder { lastToken = tokenId // Update State - let newH = decoderOutput.featureValue(for: "h_out")!.multiArrayValue! - let newC = decoderOutput.featureValue(for: "c_out")!.multiArrayValue! + guard let newH = decoderOutput.featureValue(for: "h_out")?.multiArrayValue else { + throw RnntDecoderError.missingOutput("h_out") + } + guard let newC = decoderOutput.featureValue(for: "c_out")?.multiArrayValue else { + throw RnntDecoderError.missingOutput("c_out") + } hState = newH cState = newC @@ -222,3 +231,14 @@ public final class RnntDecoder { } } + +enum RnntDecoderError: Error, LocalizedError { + case missingOutput(String) + + var errorDescription: String? { + switch self { + case .missingOutput(let name): + return "RNNT decoder missing expected output: \(name)" + } + } +} diff --git a/Sources/FluidAudio/Diarizer/Core/DiarizerModels.swift b/Sources/FluidAudio/Diarizer/Core/DiarizerModels.swift index 36265d03e..45f6c1831 100644 --- a/Sources/FluidAudio/Diarizer/Core/DiarizerModels.swift +++ b/Sources/FluidAudio/Diarizer/Core/DiarizerModels.swift @@ -98,21 +98,12 @@ extension DiarizerModels { } public static func defaultModelsDirectory() -> URL { - let applicationSupport = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! - return - applicationSupport - .appendingPathComponent("FluidAudio", isDirectory: true) - .appendingPathComponent("Models", isDirectory: true) - .appendingPathComponent(Repo.diarizer.folderName, isDirectory: true) + MLModelConfigurationUtils.defaultModelsDirectory(for: .diarizer) } static func defaultConfiguration() -> MLModelConfiguration { - let config = MLModelConfiguration() - // Enable Float16 optimization for ~2x speedup - config.allowLowPrecisionAccumulationOnGPU = true let isCI = ProcessInfo.processInfo.environment["CI"] != nil - config.computeUnits = isCI ? .cpuAndNeuralEngine : .all - return config + return MLModelConfigurationUtils.defaultConfiguration(computeUnits: isCI ? .cpuAndNeuralEngine : .all) } } diff --git a/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizerAPI.swift b/Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift similarity index 100% rename from Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizerAPI.swift rename to Sources/FluidAudio/Diarizer/LS-EEND/LSEENDDiarizer.swift diff --git a/Sources/FluidAudio/Diarizer/Offline/Core/OfflineDiarizerManager.swift b/Sources/FluidAudio/Diarizer/Offline/Core/OfflineDiarizerManager.swift index 8f7addcc0..14ceb14a7 100644 --- a/Sources/FluidAudio/Diarizer/Offline/Core/OfflineDiarizerManager.swift +++ b/Sources/FluidAudio/Diarizer/Offline/Core/OfflineDiarizerManager.swift @@ -100,7 +100,7 @@ public final class OfflineDiarizerManager { /// - Parameter url: Path to the audio file /// - Returns: Diarization result with speaker segments public func process(_ url: URL) async throws -> DiarizationResult { - let factory = StreamingAudioSourceFactory() + let factory = AudioSourceFactory() let (source, loadDuration) = try factory.makeDiskBackedSource( from: url, targetSampleRate: config.segmentation.sampleRate @@ -114,7 +114,7 @@ public final class OfflineDiarizerManager { } public func process( - audioSource: StreamingAudioSampleSource, + audioSource: AudioSampleSource, audioLoadingSeconds: TimeInterval ) async throws -> DiarizationResult { try config.validate() diff --git a/Sources/FluidAudio/Diarizer/Offline/Core/OfflineDiarizerModels.swift b/Sources/FluidAudio/Diarizer/Offline/Core/OfflineDiarizerModels.swift index cfe5cb5a4..7f882f028 100644 --- a/Sources/FluidAudio/Diarizer/Offline/Core/OfflineDiarizerModels.swift +++ b/Sources/FluidAudio/Diarizer/Offline/Core/OfflineDiarizerModels.swift @@ -68,18 +68,11 @@ public struct OfflineDiarizerModels: Sendable { } public static func defaultModelsDirectory() -> URL { - let base = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! - return - base - .appendingPathComponent("FluidAudio", isDirectory: true) - .appendingPathComponent("Models", isDirectory: true) + MLModelConfigurationUtils.defaultModelsDirectory() } private static func defaultConfiguration() -> MLModelConfiguration { - let configuration = MLModelConfiguration() - configuration.allowLowPrecisionAccumulationOnGPU = true - configuration.computeUnits = .all - return configuration + MLModelConfigurationUtils.defaultConfiguration(computeUnits: .all) } public static func load( diff --git a/Sources/FluidAudio/Diarizer/Offline/Extraction/OfflineEmbeddingExtractor.swift b/Sources/FluidAudio/Diarizer/Offline/Extraction/OfflineEmbeddingExtractor.swift index 5fdc4f367..f87e4e87a 100644 --- a/Sources/FluidAudio/Diarizer/Offline/Extraction/OfflineEmbeddingExtractor.swift +++ b/Sources/FluidAudio/Diarizer/Offline/Extraction/OfflineEmbeddingExtractor.swift @@ -179,7 +179,7 @@ struct OfflineEmbeddingExtractor { } func extractEmbeddings( - audioSource: StreamingAudioSampleSource, + audioSource: AudioSampleSource, segmentation: SegmentationOutput ) async throws -> [TimedEmbedding] { let stream = AsyncThrowingStream { continuation in @@ -221,7 +221,7 @@ struct OfflineEmbeddingExtractor { } func extractEmbeddings( - audioSource: StreamingAudioSampleSource, + audioSource: AudioSampleSource, segmentationStream: S ) async throws -> [TimedEmbedding] where S.Element == SegmentationChunk { var embeddings: [TimedEmbedding] = [] diff --git a/Sources/FluidAudio/Diarizer/Offline/Segmentation/OfflineSegmentationProcessor.swift b/Sources/FluidAudio/Diarizer/Offline/Segmentation/OfflineSegmentationProcessor.swift index f1c7c434b..76b0ab6cb 100644 --- a/Sources/FluidAudio/Diarizer/Offline/Segmentation/OfflineSegmentationProcessor.swift +++ b/Sources/FluidAudio/Diarizer/Offline/Segmentation/OfflineSegmentationProcessor.swift @@ -42,7 +42,7 @@ struct OfflineSegmentationProcessor { } func process( - audioSource: StreamingAudioSampleSource, + audioSource: AudioSampleSource, segmentationModel: MLModel, config: OfflineDiarizerConfig, chunkHandler: SegmentationChunkHandler? = nil diff --git a/Sources/FluidAudio/Diarizer/Sortformer/SortformerDiarizerPipeline.swift b/Sources/FluidAudio/Diarizer/Sortformer/SortformerDiarizer.swift similarity index 100% rename from Sources/FluidAudio/Diarizer/Sortformer/SortformerDiarizerPipeline.swift rename to Sources/FluidAudio/Diarizer/Sortformer/SortformerDiarizer.swift diff --git a/Sources/FluidAudio/Diarizer/Sortformer/SortformerModelInference.swift b/Sources/FluidAudio/Diarizer/Sortformer/SortformerModelInference.swift index 09732923b..22415f978 100644 --- a/Sources/FluidAudio/Diarizer/Sortformer/SortformerModelInference.swift +++ b/Sources/FluidAudio/Diarizer/Sortformer/SortformerModelInference.swift @@ -91,11 +91,8 @@ extension SortformerModels { /// Default MLModel configuration public static func defaultConfiguration() -> MLModelConfiguration { - let config = MLModelConfiguration() - config.allowLowPrecisionAccumulationOnGPU = true let isCI = ProcessInfo.processInfo.environment["CI"] != nil - config.computeUnits = isCI ? .cpuAndNeuralEngine : .all - return config + return MLModelConfigurationUtils.defaultConfiguration(computeUnits: isCI ? .cpuAndNeuralEngine : .all) } /// Load Sortformer models from HuggingFace. diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 5f3ff3f01..0535d1a81 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -129,6 +129,8 @@ public enum Repo: String, CaseIterable { return "nemotron-streaming/560ms" case .sortformer: return "sortformer" + case .parakeetTdtCtc110m: + return "parakeet-tdt-ctc-110m" default: return name.replacingOccurrences(of: "-coreml", with: "") } @@ -203,9 +205,6 @@ public enum ModelNames { jointFile, ] - /// Vocabulary filename for the 110m hybrid TDT-CTC model (JSON array format) - public static let vocabularyFileArray = "parakeet_vocab.json" - /// Required models for fused frontend (110m hybrid: preprocessor contains encoder) public static let requiredModelsFused: Set = [ preprocessorFile, @@ -215,12 +214,8 @@ public enum ModelNames { /// Get vocabulary filename for specific model version public static func vocabulary(for repo: Repo) -> String { - switch repo { - case .parakeetTdtCtc110m: - return vocabularyFileArray - default: - return vocabularyFile - } + // All Parakeet models use the same vocabulary file (format varies: dict for v2/v3, array for 110m) + return vocabularyFile } } diff --git a/Sources/FluidAudio/Shared/StreamingAudioSampleSource.swift b/Sources/FluidAudio/Shared/AudioSampleSource.swift similarity index 91% rename from Sources/FluidAudio/Shared/StreamingAudioSampleSource.swift rename to Sources/FluidAudio/Shared/AudioSampleSource.swift index faf84f365..a73108d25 100644 --- a/Sources/FluidAudio/Shared/StreamingAudioSampleSource.swift +++ b/Sources/FluidAudio/Shared/AudioSampleSource.swift @@ -1,6 +1,6 @@ import Foundation -public protocol StreamingAudioSampleSource: Sendable { +public protocol AudioSampleSource: Sendable { var sampleCount: Int { get } func copySamples( into destination: UnsafeMutablePointer, @@ -9,7 +9,7 @@ public protocol StreamingAudioSampleSource: Sendable { ) throws } -public struct ArrayAudioSampleSource: StreamingAudioSampleSource { +public struct ArrayAudioSampleSource: AudioSampleSource { private let samples: [Float] public init(samples: [Float]) { @@ -39,7 +39,7 @@ public struct ArrayAudioSampleSource: StreamingAudioSampleSource { } } -public struct DiskBackedAudioSampleSource: StreamingAudioSampleSource { +public struct DiskBackedAudioSampleSource: AudioSampleSource { private let mappedData: Data private let floatStride = MemoryLayout.stride private let fileURL: URL diff --git a/Sources/FluidAudio/Shared/StreamingAudioSourceFactory.swift b/Sources/FluidAudio/Shared/AudioSourceFactory.swift similarity index 87% rename from Sources/FluidAudio/Shared/StreamingAudioSourceFactory.swift rename to Sources/FluidAudio/Shared/AudioSourceFactory.swift index 550e6995e..e7f4090c0 100644 --- a/Sources/FluidAudio/Shared/StreamingAudioSourceFactory.swift +++ b/Sources/FluidAudio/Shared/AudioSourceFactory.swift @@ -3,8 +3,8 @@ import Foundation import OSLog import os -public struct StreamingAudioSourceFactory { - private let logger = AppLogger(category: "StreamingAudioSourceFactory") +public struct AudioSourceFactory { + private let logger = AppLogger(category: "AudioSourceFactory") public init() {} @@ -26,7 +26,7 @@ public struct StreamingAudioSourceFactory { let tempURL = try makeTemporaryURL() guard FileManager.default.createFile(atPath: tempURL.path, contents: nil) else { - throw StreamingAudioError.processingFailed("Failed to create temporary audio buffer at \(tempURL.path)") + throw AudioSourceError.processingFailed("Failed to create temporary audio buffer at \(tempURL.path)") } let handle = try FileHandle(forWritingTo: tempURL) @@ -35,7 +35,7 @@ public struct StreamingAudioSourceFactory { } guard let converter = AVAudioConverter(from: inputFormat, to: targetFormat) else { - throw StreamingAudioError.processingFailed( + throw AudioSourceError.processingFailed( "Unsupported audio format \(inputFormat); failed to create converter") } @@ -75,11 +75,11 @@ public struct StreamingAudioSourceFactory { let duration = Date().timeIntervalSince(startTime) return (source, duration) - } catch let streamingError as StreamingAudioError { + } catch let streamingError as AudioSourceError { throw streamingError } catch { logger.error("Streaming audio source creation failed: \(error.localizedDescription)") - throw StreamingAudioError.processingFailed( + throw AudioSourceError.processingFailed( "Streaming audio source creation failed: \(error.localizedDescription)" ) } @@ -106,7 +106,7 @@ public struct StreamingAudioSourceFactory { frameCapacity: inputCapacity ) else { - throw StreamingAudioError.failedToAllocateBuffer("Input", requestedFrames: Int(inputCapacity)) + throw AudioSourceError.failedToAllocateBuffer("Input", requestedFrames: Int(inputCapacity)) } let estimatedOutputFrames = AVAudioFrameCount( @@ -118,7 +118,7 @@ public struct StreamingAudioSourceFactory { frameCapacity: max(1024, estimatedOutputFrames) ) else { - throw StreamingAudioError.failedToAllocateBuffer("Output", requestedFrames: Int(estimatedOutputFrames)) + throw AudioSourceError.failedToAllocateBuffer("Output", requestedFrames: Int(estimatedOutputFrames)) } var totalSamples = 0 @@ -167,13 +167,13 @@ public struct StreamingAudioSourceFactory { ) if let conversionError { - throw StreamingAudioError.processingFailed( + throw AudioSourceError.processingFailed( "Audio conversion failed: \(conversionError.localizedDescription)" ) } if let error = readError.withLock({ $0 }) { - throw StreamingAudioError.processingFailed( + throw AudioSourceError.processingFailed( "Failed while reading audio: \(error.localizedDescription)" ) } @@ -181,7 +181,7 @@ public struct StreamingAudioSourceFactory { let producedFrames = Int(outputBuffer.frameLength) if producedFrames > 0 { guard let channelData = outputBuffer.floatChannelData?.pointee else { - throw StreamingAudioError.processingFailed("Missing channel data during conversion") + throw AudioSourceError.processingFailed("Missing channel data during conversion") } let byteCount = producedFrames * MemoryLayout.stride let baseAddress = UnsafeRawPointer(channelData) @@ -199,7 +199,7 @@ public struct StreamingAudioSourceFactory { } } -public enum StreamingAudioError: Error, LocalizedError { +public enum AudioSourceError: Error, LocalizedError { case processingFailed(String) public var errorDescription: String? { @@ -210,8 +210,8 @@ public enum StreamingAudioError: Error, LocalizedError { } } -extension StreamingAudioError { - fileprivate static func failedToAllocateBuffer(_ name: String, requestedFrames: Int) -> StreamingAudioError { +extension AudioSourceError { + fileprivate static func failedToAllocateBuffer(_ name: String, requestedFrames: Int) -> AudioSourceError { .processingFailed("Failed to allocate \(name.lowercased()) buffer (\(requestedFrames) frames)") } } diff --git a/Sources/FluidAudio/Shared/MLModelConfigurationUtils.swift b/Sources/FluidAudio/Shared/MLModelConfigurationUtils.swift new file mode 100644 index 000000000..c315e0ed4 --- /dev/null +++ b/Sources/FluidAudio/Shared/MLModelConfigurationUtils.swift @@ -0,0 +1,36 @@ +@preconcurrency import CoreML +import Foundation + +/// Shared utilities for creating `MLModelConfiguration` instances and resolving model directories. +public enum MLModelConfigurationUtils { + + /// Create a default `MLModelConfiguration` with low-precision GPU accumulation enabled. + /// + /// - Parameter computeUnits: Compute units to use (default: `.cpuAndNeuralEngine`). + /// - Returns: Configured `MLModelConfiguration`. + public static func defaultConfiguration( + computeUnits: MLComputeUnits = .cpuAndNeuralEngine + ) -> MLModelConfiguration { + let config = MLModelConfiguration() + config.allowLowPrecisionAccumulationOnGPU = true + config.computeUnits = computeUnits + return config + } + + /// Default models directory under Application Support. + /// + /// - Parameter repo: Optional repository whose `folderName` is appended. When `nil`, + /// returns `~/Library/Application Support/FluidAudio/Models/`. + /// - Returns: URL for the models directory. + public static func defaultModelsDirectory(for repo: Repo? = nil) -> URL { + let base = FileManager.default.urls(for: .applicationSupportDirectory, in: .userDomainMask).first! + var url = + base + .appendingPathComponent("FluidAudio", isDirectory: true) + .appendingPathComponent("Models", isDirectory: true) + if let repo { + url = url.appendingPathComponent(repo.folderName, isDirectory: true) + } + return url + } +} diff --git a/Sources/FluidAudioCLI/Commands/ProcessCommand.swift b/Sources/FluidAudioCLI/Commands/ProcessCommand.swift index f3f62c58a..24dcd185c 100644 --- a/Sources/FluidAudioCLI/Commands/ProcessCommand.swift +++ b/Sources/FluidAudioCLI/Commands/ProcessCommand.swift @@ -166,7 +166,7 @@ enum ProcessCommand { // Load and process audio file without materializing the full sample buffer. let audioURL = URL(fileURLWithPath: audioFile) - let factory = StreamingAudioSourceFactory() + let factory = AudioSourceFactory() let targetSampleRate = offlineConfig.segmentation.sampleRate let diskSourceResult = try factory.makeDiskBackedSource( from: audioURL, diff --git a/Tests/FluidAudioTests/ASR/Parakeet/AsrManagerExtensionTests.swift b/Tests/FluidAudioTests/ASR/Parakeet/AsrManagerExtensionTests.swift index 42c01be92..057bdb364 100644 --- a/Tests/FluidAudioTests/ASR/Parakeet/AsrManagerExtensionTests.swift +++ b/Tests/FluidAudioTests/ASR/Parakeet/AsrManagerExtensionTests.swift @@ -90,54 +90,6 @@ final class AsrManagerExtensionTests: XCTestCase { XCTAssertEqual(Array(result.suffix(500)), Array(repeating: 0.0, count: 500)) } - // MARK: - calculateStartFrameOffset Tests - - func testCalculateStartFrameOffsetFirstSegment() { - let offset = manager.calculateStartFrameOffset(segmentIndex: 0, leftContextSeconds: 2.0) - - // Method is deprecated - now always returns 0 (frame tracking handled by timeJump mechanism) - XCTAssertEqual(offset, 0) - } - - func testCalculateStartFrameOffsetSecondSegment() { - let leftContext = 2.0 - let offset = manager.calculateStartFrameOffset(segmentIndex: 1, leftContextSeconds: leftContext) - - // Method is deprecated - now always returns 0 (frame tracking handled by timeJump mechanism) - XCTAssertEqual(offset, 0) - } - - func testCalculateStartFrameOffsetThirdSegment() { - let leftContext = 1.5 - let offset = manager.calculateStartFrameOffset(segmentIndex: 2, leftContextSeconds: leftContext) - - // Method is deprecated - now always returns 0 (frame tracking handled by timeJump mechanism) - XCTAssertEqual(offset, 0) - } - - func testCalculateStartFrameOffsetVariousContexts() { - // Method is deprecated - now always returns 0 (frame tracking handled by timeJump mechanism) - let testCases: [(leftContext: Double, expected: Int)] = [ - (0.0, 0), // No context - (0.08, 0), // Method always returns 0 - (0.16, 0), // Method always returns 0 - (1.0, 0), // Method always returns 0 - (3.2, 0), // Method always returns 0 - ] - - for (leftContext, expected) in testCases { - let offset = manager.calculateStartFrameOffset(segmentIndex: 1, leftContextSeconds: leftContext) - XCTAssertEqual(offset, expected, "Failed for leftContext=\(leftContext)") - } - } - - func testCalculateStartFrameOffsetNegativeSegment() { - let offset = manager.calculateStartFrameOffset(segmentIndex: -1, leftContextSeconds: 2.0) - - // Method is deprecated - now always returns 0 (frame tracking handled by timeJump mechanism) - XCTAssertEqual(offset, 0) - } - // MARK: - Performance Tests func testPadAudioPerformance() { @@ -151,11 +103,4 @@ final class AsrManagerExtensionTests: XCTestCase { } } - func testCalculateStartFrameOffsetPerformance() { - measure { - for i in 0..<10_000 { - _ = manager.calculateStartFrameOffset(segmentIndex: i % 100, leftContextSeconds: 2.0) - } - } - } } diff --git a/Tests/FluidAudioTests/ASR/Parakeet/AsrModelsTests.swift b/Tests/FluidAudioTests/ASR/Parakeet/AsrModelsTests.swift index 3e510c45b..60e9e4261 100644 --- a/Tests/FluidAudioTests/ASR/Parakeet/AsrModelsTests.swift +++ b/Tests/FluidAudioTests/ASR/Parakeet/AsrModelsTests.swift @@ -384,8 +384,8 @@ final class AsrModelsTests: XCTestCase { } func testTdtCtc110mVocabularyFilename() { - // tdtCtc110m uses parakeet_vocab.json (array format) - let vocabFile = ModelNames.ASR.vocabularyFileArray + // tdtCtc110m uses parakeet_vocab.json (array format, same filename as v2/v3) + let vocabFile = ModelNames.ASR.vocabularyFile XCTAssertEqual(vocabFile, "parakeet_vocab.json") // Verify it has .json extension diff --git a/Tests/FluidAudioTests/ASR/Parakeet/ModelNamesTests.swift b/Tests/FluidAudioTests/ASR/Parakeet/ModelNamesTests.swift index 6048ff95c..1df3f76bb 100644 --- a/Tests/FluidAudioTests/ASR/Parakeet/ModelNamesTests.swift +++ b/Tests/FluidAudioTests/ASR/Parakeet/ModelNamesTests.swift @@ -133,10 +133,10 @@ final class ModelNamesTests: XCTestCase { } func testParakeetTdtCtc110mVocabulary() { - // tdtCtc110m uses array-format vocabulary + // tdtCtc110m uses same vocabulary file (array-format JSON, parsed at load time) let vocabFile = ModelNames.ASR.vocabulary(for: .parakeetTdtCtc110m) XCTAssertEqual(vocabFile, "parakeet_vocab.json") - XCTAssertEqual(vocabFile, ModelNames.ASR.vocabularyFileArray) + XCTAssertEqual(vocabFile, ModelNames.ASR.vocabularyFile) } func testParakeetTdtCtc110mUsesRequiredModelsFused() { From 6499df61a0443c8a2c302caaec076372a3d93f8f Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 19:48:04 -0400 Subject: [PATCH 03/16] Clean up AsrManager naming, dead code, and actor isolation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Delete dead loadModel() and getDefaultModelsDirectory() (legacy Models/Parakeet path) - Remove dangling doc comment from deleted property - Rename transcribeStreaming → transcribeDiskBacked (avoids confusion with the real streaming API in SlidingWindowAsrManager) - Convert getDecoderLayers() to decoderLayerCount computed property - Move AudioSource enum from AsrManager.swift to Shared/AudioSource.swift - Mark pure utility methods as nonisolated: normalizedTimingToken, calculateConfidence, sliceEncoderOutput, removeDuplicateTokenSequence --- .../FluidAudio/ASR/Parakeet/AsrManager.swift | 51 ++++--------------- .../ASR/Parakeet/AsrTranscription.swift | 6 +-- .../ASR/Parakeet/ChunkProcessor.swift | 2 +- Sources/FluidAudio/Shared/AudioSource.swift | 6 +++ 4 files changed, 19 insertions(+), 46 deletions(-) create mode 100644 Sources/FluidAudio/Shared/AudioSource.swift diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift index ce93e706a..a4135df79 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift @@ -3,11 +3,6 @@ import AVFoundation import Foundation import OSLog -public enum AudioSource: Sendable { - case microphone - case system -} - public actor AsrManager { internal let logger = AppLogger(category: "ASR") @@ -24,14 +19,12 @@ public actor AsrManager { internal let progressEmitter = ProgressEmitter() - /// Get the number of decoder layers for the current model. + /// Number of decoder layers for the current model. /// Returns 2 if models not loaded (v2/v3 default, tdtCtc110m uses 1). - internal func getDecoderLayers() -> Int { - return asrModels?.version.decoderLayers ?? 2 + internal var decoderLayerCount: Int { + asrModels?.version.decoderLayers ?? 2 } - /// Token duration optimization model - /// Cached vocabulary loaded once during initialization internal var vocabulary: [Int: String] = [:] #if DEBUG @@ -251,32 +244,6 @@ public actor AsrManager { } } - private func loadModel( - path: URL, - name: String, - configuration: MLModelConfiguration - ) async throws -> MLModel { - do { - let model = try MLModel(contentsOf: path, configuration: configuration) - return model - } catch { - logger.error("Failed to load \(name) model: \(error)") - - throw ASRError.modelLoadFailed - } - } - private static func getDefaultModelsDirectory() -> URL { - let applicationSupportURL = FileManager.default.urls( - for: .applicationSupportDirectory, in: .userDomainMask - ).first! - let appDirectory = applicationSupportURL.appendingPathComponent( - "FluidAudio", isDirectory: true) - let directory = appDirectory.appendingPathComponent("Models/Parakeet", isDirectory: true) - - try? FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true) - return directory.standardizedFileURL - } - public func resetState() { // Use model's decoder layer count, or 2 if models not loaded (v2/v3 default) let layers = asrModels?.version.decoderLayers ?? 2 @@ -409,7 +376,7 @@ public actor AsrManager { let estimatedSamples = Int((Double(audioFile.length) * sampleRateRatio).rounded(.up)) if estimatedSamples > config.streamingThreshold { - return try await transcribeStreaming(url, source: source) + return try await transcribeDiskBacked(url, source: source) } } @@ -418,17 +385,17 @@ public actor AsrManager { return result } - /// Transcribe audio from a file URL using streaming mode. + /// Transcribe audio from a file URL using disk-backed chunked processing. /// - /// Memory-efficient transcription that processes audio in chunks, maintaining constant - /// memory usage (~1.2MB) regardless of file size. Ideal for long audio files. + /// Memory-efficient transcription that memory-maps the file and processes audio in chunks, + /// maintaining constant memory usage (~1.2MB) regardless of file size. Ideal for long audio files. /// /// - Parameters: /// - url: The URL to the audio file /// - source: The audio source type (defaults to .system) /// - Returns: An ASRResult containing the transcribed text and token timings /// - Throws: ASRError if transcription fails, models are not initialized, or the file cannot be read - public func transcribeStreaming(_ url: URL, source: AudioSource = .system) async throws -> ASRResult { + public func transcribeDiskBacked(_ url: URL, source: AudioSource = .system) async throws -> ASRResult { guard isAvailable else { throw ASRError.notInitialized } let startTime = Date() @@ -531,7 +498,7 @@ public actor AsrManager { try await initializeDecoderState(for: source) } - internal func normalizedTimingToken(_ token: String) -> String { + nonisolated internal func normalizedTimingToken(_ token: String) -> String { token.replacingOccurrences(of: "▁", with: " ") } diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift index d0900763d..eb1430dc9 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift @@ -346,7 +346,7 @@ extension AsrManager { /// Calculate confidence score based purely on TDT model token confidence scores /// Returns the average of token-level softmax probabilities from the decoder /// Range: 0.1 (empty transcription) to 1.0 (perfect confidence) - private func calculateConfidence( + nonisolated private func calculateConfidence( duration: Double, tokenCount: Int, isEmpty: Bool, tokenConfidences: [Float] ) -> Float { // Empty transcription gets low confidence @@ -438,7 +438,7 @@ extension AsrManager { } /// Slice encoder output to remove left context frames (following NeMo approach) - private func sliceEncoderOutput( + nonisolated private func sliceEncoderOutput( _ encoderOutput: MLMultiArray, from startFrame: Int, newLength: Int @@ -473,7 +473,7 @@ extension AsrManager { /// and the number of removed leading tokens so caller can drop aligned timestamps. /// Ideally this is not needed. We need to make some more fixes to the TDT decoding logic, /// this should be a temporary workaround. - internal func removeDuplicateTokenSequence( + nonisolated internal func removeDuplicateTokenSequence( previous: [Int], current: [Int], maxOverlap: Int = 12 ) -> (deduped: [Int], removedCount: Int) { diff --git a/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift b/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift index cbbe722b3..92a62ed0e 100644 --- a/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift +++ b/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift @@ -66,7 +66,7 @@ struct ChunkProcessor { var chunkStart = 0 var chunkIndex = 0 var chunkDecoderState = TdtDecoderState.make( - decoderLayers: await manager.getDecoderLayers() + decoderLayers: await manager.decoderLayerCount ) while chunkStart < totalSamples { diff --git a/Sources/FluidAudio/Shared/AudioSource.swift b/Sources/FluidAudio/Shared/AudioSource.swift new file mode 100644 index 000000000..b290feaa2 --- /dev/null +++ b/Sources/FluidAudio/Shared/AudioSource.swift @@ -0,0 +1,6 @@ +import Foundation + +public enum AudioSource: Sendable { + case microphone + case system +} From 4a966aef2812869558156653ba9e82ab838c0242 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 19:58:11 -0400 Subject: [PATCH 04/16] Eliminate ANEOptimizer indirection layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANEOptimizer was a thin wrapper over ANEMemoryUtils in the wrong location (ASR/Parakeet/ instead of Shared/). All callers now use ANEMemoryUtils directly. - Replace ANEOptimizer.createANEAlignedArray → ANEMemoryUtils.createAlignedArray - Replace ANEOptimizer.prefetchToNeuralEngine(x) → x.prefetchToNeuralEngine() (MLMultiArray extension already in ANEMemoryOptimizer) - Move convertToFloat16 to ANEMemoryUtils (throws proper ANEMemoryError) - Move ZeroCopyFeatureProvider to Shared/ZeroCopyFeatureProvider.swift - Inline optimalComputeUnits (always returned .cpuAndNeuralEngine), delete ModelType enum - Simplify AsrModels.optimizedConfiguration to use shared utility - Delete ANEOptimizer.swift --- .../ASR/Parakeet/ANEOptimizer.swift | 166 ------------------ .../FluidAudio/ASR/Parakeet/AsrModels.swift | 16 +- .../Parakeet/Decoder/TdtDecoderState.swift | 4 +- .../ASR/Parakeet/Decoder/TdtDecoderV3.swift | 12 +- .../ASR/Parakeet/MLArrayCache.swift | 6 +- .../FluidAudio/Shared/ANEMemoryUtils.swift | 36 ++++ .../Shared/ZeroCopyFeatureProvider.swift | 33 ++++ .../ASR/Parakeet/AsrModelsTests.swift | 51 +----- .../Shared/ANEOptimizerTests.swift | 48 ++--- .../Shared/MLArrayCacheTests.swift | 4 +- 10 files changed, 111 insertions(+), 265 deletions(-) delete mode 100644 Sources/FluidAudio/ASR/Parakeet/ANEOptimizer.swift create mode 100644 Sources/FluidAudio/Shared/ZeroCopyFeatureProvider.swift diff --git a/Sources/FluidAudio/ASR/Parakeet/ANEOptimizer.swift b/Sources/FluidAudio/ASR/Parakeet/ANEOptimizer.swift deleted file mode 100644 index 09ad6c531..000000000 --- a/Sources/FluidAudio/ASR/Parakeet/ANEOptimizer.swift +++ /dev/null @@ -1,166 +0,0 @@ -import Accelerate -import CoreML -import Foundation -import Metal - -/// Neural Engine optimization utilities for ASR pipeline -public enum ANEOptimizer { - - // Use shared ANE constants - public static let aneAlignment = ANEMemoryUtils.aneAlignment - public static let aneTileSize = ANEMemoryUtils.aneTileSize - - /// Create ANE-aligned MLMultiArray with optimized memory layout - public static func createANEAlignedArray( - shape: [NSNumber], - dataType: MLMultiArrayDataType - ) throws -> MLMultiArray { - do { - return try ANEMemoryUtils.createAlignedArray( - shape: shape, - dataType: dataType, - zeroClear: false // ASR doesn't need zero-cleared memory - ) - } catch ANEMemoryUtils.ANEMemoryError.allocationFailed { - throw NSError( - domain: "ANEOptimizer", code: -1, - userInfo: [NSLocalizedDescriptionKey: "Failed to allocate ANE-aligned memory"]) - } catch { - throw NSError( - domain: "ANEOptimizer", code: -1, - userInfo: [NSLocalizedDescriptionKey: "ANE memory allocation error: \(error)"]) - } - } - - /// Calculate optimal strides for ANE tile processing - public static func calculateOptimalStrides( - for shape: [NSNumber], - dataType: MLMultiArrayDataType - ) -> [NSNumber] { - return ANEMemoryUtils.calculateOptimalStrides(for: shape) - } - - /// Configure optimal compute units for each model type - public static func optimalComputeUnits(for modelType: ModelType) -> MLComputeUnits { - return .cpuAndNeuralEngine - } - - /// Create zero-copy memory view between models - public static func createZeroCopyView( - from sourceArray: MLMultiArray, - shape: [NSNumber], - offset: Int = 0 - ) throws -> MLMultiArray { - // Ensure we have enough data - let sourceElements = sourceArray.shape.map { $0.intValue }.reduce(1, *) - let viewElements = shape.map { $0.intValue }.reduce(1, *) - - guard offset + viewElements <= sourceElements else { - throw NSError( - domain: "ANEOptimizer", code: -2, - userInfo: [NSLocalizedDescriptionKey: "View exceeds source array bounds"]) - } - - // Calculate byte offset - let elementSize = ANEMemoryUtils.getElementSize(for: sourceArray.dataType) - - let byteOffset = offset * elementSize - let offsetPointer = sourceArray.dataPointer.advanced(by: byteOffset) - - // Create view with same data but new shape - return try MLMultiArray( - dataPointer: offsetPointer, - shape: shape, - dataType: sourceArray.dataType, - strides: calculateOptimalStrides(for: shape, dataType: sourceArray.dataType), - deallocator: nil // No deallocation since it's a view - ) - } - - /// Prefetch data to Neural Engine - public static func prefetchToNeuralEngine(_ array: MLMultiArray) { - // Trigger ANE prefetch by accessing first and last elements - // This causes the ANE to initiate DMA transfer - if array.count > 0 { - _ = array[0] - _ = array[array.count - 1] - } - } - - /// Convert float32 array to float16 for ANE efficiency - public static func convertToFloat16(_ input: MLMultiArray) throws -> MLMultiArray { - guard input.dataType == .float32 else { - throw NSError( - domain: "ANEOptimizer", code: -3, - userInfo: [NSLocalizedDescriptionKey: "Input must be float32"]) - } - - // Create float16 array with ANE alignment - let float16Array = try createANEAlignedArray( - shape: input.shape, - dataType: .float16 - ) - - // Convert using Accelerate with platform-specific handling - let sourcePtr = input.dataPointer.bindMemory(to: Float.self, capacity: input.count) - - var sourceBuffer = vImage_Buffer( - data: sourcePtr, - height: 1, - width: vImagePixelCount(input.count), - rowBytes: input.count * MemoryLayout.stride - ) - - // Use UInt16 as storage type for cross-platform compatibility - let destPtr = float16Array.dataPointer.bindMemory(to: UInt16.self, capacity: input.count) - - var destBuffer = vImage_Buffer( - data: destPtr, - height: 1, - width: vImagePixelCount(input.count), - rowBytes: input.count * MemoryLayout.stride - ) - - vImageConvert_PlanarFtoPlanar16F(&sourceBuffer, &destBuffer, 0) - - return float16Array - } - - /// Model type enumeration for compute unit selection - public enum ModelType { - case encoder - case decoder - case joint - } -} - -/// Extension for MLFeatureProvider to enable zero-copy chaining -public class ZeroCopyFeatureProvider: NSObject, MLFeatureProvider { - private let features: [String: MLFeatureValue] - - public init(features: [String: MLFeatureValue]) { - self.features = features - super.init() - } - - public var featureNames: Set { - Set(features.keys) - } - - public func featureValue(for featureName: String) -> MLFeatureValue? { - features[featureName] - } - - /// Create a provider that chains output from one model to input of another - public static func chain( - from outputProvider: MLFeatureProvider, - outputName: String, - to inputName: String - ) -> ZeroCopyFeatureProvider? { - guard let outputValue = outputProvider.featureValue(for: outputName) else { - return nil - } - - return ZeroCopyFeatureProvider(features: [inputName: outputValue]) - } -} diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift index 32b1aeeca..d8dbd618c 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift @@ -365,22 +365,14 @@ extension AsrModels { MLModelConfigurationUtils.defaultConfiguration(computeUnits: .cpuAndNeuralEngine) } - /// Create optimized configuration for specific model type + /// Create optimized configuration for model inference public static func optimizedConfiguration( - for modelType: ANEOptimizer.ModelType, enableFP16: Bool = true ) -> MLModelConfiguration { - let config = MLModelConfiguration() - config.allowLowPrecisionAccumulationOnGPU = enableFP16 - config.computeUnits = ANEOptimizer.optimalComputeUnits(for: modelType) - - // Enable model-specific optimizations let isCI = ProcessInfo.processInfo.environment["CI"] != nil - if isCI { - config.computeUnits = .cpuOnly - } - - return config + return MLModelConfigurationUtils.defaultConfiguration( + computeUnits: isCI ? .cpuOnly : .cpuAndNeuralEngine + ) } /// Create optimized prediction options for inference diff --git a/Sources/FluidAudio/ASR/Parakeet/Decoder/TdtDecoderState.swift b/Sources/FluidAudio/ASR/Parakeet/Decoder/TdtDecoderState.swift index dfef2ca36..0a684be86 100644 --- a/Sources/FluidAudio/ASR/Parakeet/Decoder/TdtDecoderState.swift +++ b/Sources/FluidAudio/ASR/Parakeet/Decoder/TdtDecoderState.swift @@ -32,11 +32,11 @@ struct TdtDecoderState: Sendable { init(decoderLayers: Int = 2) throws { // Use ANE-aligned arrays for optimal performance let decoderHiddenSize = ASRConstants.decoderHiddenSize - hiddenState = try ANEOptimizer.createANEAlignedArray( + hiddenState = try ANEMemoryUtils.createAlignedArray( shape: [NSNumber(value: decoderLayers), 1, NSNumber(value: decoderHiddenSize)], dataType: .float32 ) - cellState = try ANEOptimizer.createANEAlignedArray( + cellState = try ANEMemoryUtils.createAlignedArray( shape: [NSNumber(value: decoderLayers), 1, NSNumber(value: decoderHiddenSize)], dataType: .float32 ) diff --git a/Sources/FluidAudio/ASR/Parakeet/Decoder/TdtDecoderV3.swift b/Sources/FluidAudio/ASR/Parakeet/Decoder/TdtDecoderV3.swift index 1fa504dfd..54ccc34fe 100644 --- a/Sources/FluidAudio/ASR/Parakeet/Decoder/TdtDecoderV3.swift +++ b/Sources/FluidAudio/ASR/Parakeet/Decoder/TdtDecoderV3.swift @@ -175,11 +175,11 @@ internal struct TdtDecoderV3 { // Preallocate joint input tensors and a reusable provider to avoid per-step allocations. let encoderHidden = expectedEncoderHidden let decoderHidden = ASRConstants.decoderHiddenSize - let reusableEncoderStep = try ANEOptimizer.createANEAlignedArray( + let reusableEncoderStep = try ANEMemoryUtils.createAlignedArray( shape: [1, NSNumber(value: encoderHidden), 1], dataType: .float32 ) - let reusableDecoderStep = try ANEOptimizer.createANEAlignedArray( + let reusableDecoderStep = try ANEMemoryUtils.createAlignedArray( shape: [1, NSNumber(value: decoderHidden), 1], dataType: .float32 ) @@ -617,8 +617,8 @@ internal struct TdtDecoderV3 { try encoderFrames.copyFrame(at: timeIndex, into: encoderDestPtr, destinationStride: encoderDestStride) // Prefetch arrays for ANE - ANEOptimizer.prefetchToNeuralEngine(encoderStep) - ANEOptimizer.prefetchToNeuralEngine(preparedDecoderStep) + encoderStep.prefetchToNeuralEngine() + preparedDecoderStep.prefetchToNeuralEngine() // Reuse tiny output tensors for joint prediction (provide raw MLMultiArray backings) predictionOptions.outputBackings = [ @@ -702,7 +702,7 @@ internal struct TdtDecoderV3 { } out = destination } else { - out = try ANEOptimizer.createANEAlignedArray( + out = try ANEMemoryUtils.createAlignedArray( shape: [1, NSNumber(value: hiddenSize), 1], dataType: .float32 ) @@ -829,7 +829,7 @@ internal struct TdtDecoderV3 { encoderOutput: encoderOutput, validLength: encoderOutput.count, expectedHiddenSize: config.encoderHiddenSize) - let encoderStep = try ANEOptimizer.createANEAlignedArray( + let encoderStep = try ANEMemoryUtils.createAlignedArray( shape: [1, NSNumber(value: encoderFrames.hiddenSize), 1], dataType: .float32) let encoderPtr = encoderStep.dataPointer.bindMemory(to: Float.self, capacity: encoderFrames.hiddenSize) diff --git a/Sources/FluidAudio/ASR/Parakeet/MLArrayCache.swift b/Sources/FluidAudio/ASR/Parakeet/MLArrayCache.swift index 23c97bd76..3ede440dd 100644 --- a/Sources/FluidAudio/ASR/Parakeet/MLArrayCache.swift +++ b/Sources/FluidAudio/ASR/Parakeet/MLArrayCache.swift @@ -33,7 +33,7 @@ actor MLArrayCache { return array } - return try ANEOptimizer.createANEAlignedArray(shape: shape, dataType: dataType) + return try ANEMemoryUtils.createAlignedArray(shape: shape, dataType: dataType) } /// Return an array to the cache for reuse @@ -67,7 +67,7 @@ actor MLArrayCache { let prewarmCount = min(5, maxCacheSize / max(shapes.count, 1)) for _ in 0.. MLMultiArray { if let float32Array = float32Array { // Convert existing array to Float16 - return try ANEOptimizer.convertToFloat16(float32Array) + return try ANEMemoryUtils.convertToFloat16(float32Array) } else { // Get new Float16 array from cache return try getArray(shape: shape, dataType: .float16) diff --git a/Sources/FluidAudio/Shared/ANEMemoryUtils.swift b/Sources/FluidAudio/Shared/ANEMemoryUtils.swift index 6ba9e085b..6e6ff8aa2 100644 --- a/Sources/FluidAudio/Shared/ANEMemoryUtils.swift +++ b/Sources/FluidAudio/Shared/ANEMemoryUtils.swift @@ -1,3 +1,4 @@ +import Accelerate import CoreML import Darwin import Foundation @@ -143,6 +144,41 @@ public enum ANEMemoryUtils { ) } + /// Convert a float32 MLMultiArray to float16 with ANE-aligned memory. + public static func convertToFloat16(_ input: MLMultiArray) throws -> MLMultiArray { + guard input.dataType == .float32 else { + throw ANEMemoryError.unsupportedDataType + } + + let float16Array = try createAlignedArray( + shape: input.shape, + dataType: .float16, + zeroClear: false + ) + + let sourcePtr = input.dataPointer.bindMemory(to: Float.self, capacity: input.count) + + var sourceBuffer = vImage_Buffer( + data: sourcePtr, + height: 1, + width: vImagePixelCount(input.count), + rowBytes: input.count * MemoryLayout.stride + ) + + let destPtr = float16Array.dataPointer.bindMemory(to: UInt16.self, capacity: input.count) + + var destBuffer = vImage_Buffer( + data: destPtr, + height: 1, + width: vImagePixelCount(input.count), + rowBytes: input.count * MemoryLayout.stride + ) + + vImageConvert_PlanarFtoPlanar16F(&sourceBuffer, &destBuffer, 0) + + return float16Array + } + /// Stride-aware copy between two MLMultiArrays that may have different stride layouts. /// /// Copies all logical elements from `source` to `destination` (which must have the same shape diff --git a/Sources/FluidAudio/Shared/ZeroCopyFeatureProvider.swift b/Sources/FluidAudio/Shared/ZeroCopyFeatureProvider.swift new file mode 100644 index 000000000..c4a9ba332 --- /dev/null +++ b/Sources/FluidAudio/Shared/ZeroCopyFeatureProvider.swift @@ -0,0 +1,33 @@ +import CoreML +import Foundation + +/// Zero-copy MLFeatureProvider for chaining model outputs to inputs. +public class ZeroCopyFeatureProvider: NSObject, MLFeatureProvider { + private let features: [String: MLFeatureValue] + + public init(features: [String: MLFeatureValue]) { + self.features = features + super.init() + } + + public var featureNames: Set { + Set(features.keys) + } + + public func featureValue(for featureName: String) -> MLFeatureValue? { + features[featureName] + } + + /// Create a provider that chains output from one model to input of another + public static func chain( + from outputProvider: MLFeatureProvider, + outputName: String, + to inputName: String + ) -> ZeroCopyFeatureProvider? { + guard let outputValue = outputProvider.featureValue(for: outputName) else { + return nil + } + + return ZeroCopyFeatureProvider(features: [inputName: outputValue]) + } +} diff --git a/Tests/FluidAudioTests/ASR/Parakeet/AsrModelsTests.swift b/Tests/FluidAudioTests/ASR/Parakeet/AsrModelsTests.swift index 60e9e4261..66f39342f 100644 --- a/Tests/FluidAudioTests/ASR/Parakeet/AsrModelsTests.swift +++ b/Tests/FluidAudioTests/ASR/Parakeet/AsrModelsTests.swift @@ -209,34 +209,13 @@ final class AsrModelsTests: XCTestCase { // In CI environment, all compute units are overridden to .cpuOnly let isCI = ProcessInfo.processInfo.environment["CI"] != nil - // Test encoder configuration - let melConfig = AsrModels.optimizedConfiguration(for: .encoder) + let config = AsrModels.optimizedConfiguration() if isCI { - XCTAssertEqual(melConfig.computeUnits, .cpuOnly) + XCTAssertEqual(config.computeUnits, .cpuOnly) } else { - XCTAssertEqual(melConfig.computeUnits, .cpuAndNeuralEngine) + XCTAssertEqual(config.computeUnits, .cpuAndNeuralEngine) } - XCTAssertTrue(melConfig.allowLowPrecisionAccumulationOnGPU) - - // Test decoder configuration - let decoderConfig = AsrModels.optimizedConfiguration(for: .decoder) - if isCI { - XCTAssertEqual(decoderConfig.computeUnits, .cpuOnly) - } else { - XCTAssertEqual(decoderConfig.computeUnits, .cpuAndNeuralEngine) - } - - // Test joint configuration - let jointConfig = AsrModels.optimizedConfiguration(for: .joint) - if isCI { - XCTAssertEqual(jointConfig.computeUnits, .cpuOnly) - } else { - XCTAssertEqual(jointConfig.computeUnits, .cpuAndNeuralEngine) - } - - // Test with FP16 disabled - let fp32Config = AsrModels.optimizedConfiguration(for: .encoder, enableFP16: false) - XCTAssertFalse(fp32Config.allowLowPrecisionAccumulationOnGPU) + XCTAssertTrue(config.allowLowPrecisionAccumulationOnGPU) } func testOptimizedConfigurationCIEnvironment() { @@ -251,7 +230,7 @@ final class AsrModelsTests: XCTestCase { } } - let config = AsrModels.optimizedConfiguration(for: .encoder) + let config = AsrModels.optimizedConfiguration() XCTAssertEqual(config.computeUnits, .cpuOnly) } @@ -288,22 +267,10 @@ final class AsrModelsTests: XCTestCase { XCTAssertEqual(config.computeUnits, .cpuAndNeuralEngine) } - func testOptimalComputeUnitsRespectsPlatform() { - // Test each model type - let modelTypes: [ANEOptimizer.ModelType] = [ - .encoder, - .decoder, - .joint, - ] - - for modelType in modelTypes { - let computeUnits = ANEOptimizer.optimalComputeUnits(for: modelType) - - // All models should use CPU+ANE for optimal performance - XCTAssertEqual( - computeUnits, .cpuAndNeuralEngine, - "Model type \(modelType) should use CPU+ANE") - } + func testOptimalComputeUnitsDefault() { + // Default configuration uses CPU+ANE for optimal performance + let config = AsrModels.defaultConfiguration() + XCTAssertEqual(config.computeUnits, .cpuAndNeuralEngine) } // MARK: - TDT-CTC-110M Model Version Tests diff --git a/Tests/FluidAudioTests/Shared/ANEOptimizerTests.swift b/Tests/FluidAudioTests/Shared/ANEOptimizerTests.swift index 254fa1d84..42ad4f986 100644 --- a/Tests/FluidAudioTests/Shared/ANEOptimizerTests.swift +++ b/Tests/FluidAudioTests/Shared/ANEOptimizerTests.swift @@ -10,7 +10,7 @@ final class ANEOptimizerTests: XCTestCase { func testCreateANEAlignedArrayFloat32() throws { let shape: [NSNumber] = [1, 100] - let array = try ANEOptimizer.createANEAlignedArray( + let array = try ANEMemoryUtils.createAlignedArray( shape: shape, dataType: .float32 ) @@ -22,7 +22,7 @@ final class ANEOptimizerTests: XCTestCase { let isCI = ProcessInfo.processInfo.environment["CI"] != nil if !isCI { // Verify memory alignment only in non-CI environment - let alignment = ANEOptimizer.aneAlignment + let alignment = ANEMemoryUtils.aneAlignment let pointerValue = Int(bitPattern: array.dataPointer) XCTAssertEqual(pointerValue % alignment, 0, "Array should be \(alignment)-byte aligned") } @@ -30,7 +30,7 @@ final class ANEOptimizerTests: XCTestCase { func testCreateANEAlignedArrayFloat16() throws { let shape: [NSNumber] = [1, 64] // Smaller shape for CI stability - let array = try ANEOptimizer.createANEAlignedArray( + let array = try ANEMemoryUtils.createAlignedArray( shape: shape, dataType: .float16 ) @@ -46,7 +46,7 @@ final class ANEOptimizerTests: XCTestCase { // Verify memory alignment only in non-CI environment let pointerValue = Int(bitPattern: array.dataPointer) - XCTAssertEqual(pointerValue % ANEOptimizer.aneAlignment, 0) + XCTAssertEqual(pointerValue % ANEMemoryUtils.aneAlignment, 0) } } @@ -56,9 +56,8 @@ final class ANEOptimizerTests: XCTestCase { func testCalculateOptimalStridesBasic() { let shape: [NSNumber] = [1, 3, 224, 224] - let strides = ANEOptimizer.calculateOptimalStrides( - for: shape, - dataType: .float32 + let strides = ANEMemoryUtils.calculateOptimalStrides( + for: shape ) XCTAssertEqual(strides.count, shape.count) @@ -71,9 +70,8 @@ final class ANEOptimizerTests: XCTestCase { func testCalculateOptimalStridesWithPadding() { // Test with dimension that needs padding (not multiple of 16) let shape: [NSNumber] = [1, 100] // 100 is not multiple of 16 - let strides = ANEOptimizer.calculateOptimalStrides( - for: shape, - dataType: .float32 + let strides = ANEMemoryUtils.calculateOptimalStrides( + for: shape ) // The stride for the first dimension should account for padding @@ -84,22 +82,10 @@ final class ANEOptimizerTests: XCTestCase { // MARK: - Compute Unit Selection Tests - func testOptimalComputeUnits() { - // All models use CPU+ANE for optimal performance - XCTAssertEqual( - ANEOptimizer.optimalComputeUnits(for: .encoder), - .cpuAndNeuralEngine - ) - - XCTAssertEqual( - ANEOptimizer.optimalComputeUnits(for: .decoder), - .cpuAndNeuralEngine - ) - - XCTAssertEqual( - ANEOptimizer.optimalComputeUnits(for: .joint), - .cpuAndNeuralEngine - ) + func testDefaultConfigurationComputeUnits() { + // Default configuration uses CPU+ANE + let config = MLModelConfigurationUtils.defaultConfiguration() + XCTAssertEqual(config.computeUnits, .cpuAndNeuralEngine) } // MARK: - Zero-Copy View Tests (Removed - causes crashes with memory operations) @@ -115,7 +101,7 @@ final class ANEOptimizerTests: XCTestCase { float32Array[i] = NSNumber(value: Float(i) * 0.1) } - let result = try ANEOptimizer.convertToFloat16(float32Array) + let result = try ANEMemoryUtils.convertToFloat16(float32Array) XCTAssertEqual(result.shape, float32Array.shape) @@ -129,7 +115,7 @@ final class ANEOptimizerTests: XCTestCase { // Verify ANE alignment only in non-CI environment let pointerValue = Int(bitPattern: result.dataPointer) - XCTAssertEqual(pointerValue % ANEOptimizer.aneAlignment, 0) + XCTAssertEqual(pointerValue % ANEMemoryUtils.aneAlignment, 0) } // Verify data conversion accuracy (regardless of CI) @@ -145,11 +131,9 @@ final class ANEOptimizerTests: XCTestCase { let int32Array = try MLMultiArray(shape: [5], dataType: .int32) XCTAssertThrowsError( - try ANEOptimizer.convertToFloat16(int32Array) + try ANEMemoryUtils.convertToFloat16(int32Array) ) { error in - let nsError = error as NSError - XCTAssertEqual(nsError.domain, "ANEOptimizer") - XCTAssertEqual(nsError.code, -3) + XCTAssertTrue(error is ANEMemoryUtils.ANEMemoryError) } } diff --git a/Tests/FluidAudioTests/Shared/MLArrayCacheTests.swift b/Tests/FluidAudioTests/Shared/MLArrayCacheTests.swift index b7c68af1f..63cecadc4 100644 --- a/Tests/FluidAudioTests/Shared/MLArrayCacheTests.swift +++ b/Tests/FluidAudioTests/Shared/MLArrayCacheTests.swift @@ -26,7 +26,7 @@ final class MLArrayCacheTests: XCTestCase { if !isCI { // Verify ANE alignment only in non-CI environment let pointerValue = Int(bitPattern: array.dataPointer) - XCTAssertEqual(pointerValue % ANEOptimizer.aneAlignment, 0) + XCTAssertEqual(pointerValue % ANEMemoryUtils.aneAlignment, 0) } } @@ -115,7 +115,7 @@ final class MLArrayCacheTests: XCTestCase { // Verify ANE alignment only in non-CI environment let pointerValue = Int(bitPattern: fp16Array.dataPointer) - XCTAssertEqual(pointerValue % ANEOptimizer.aneAlignment, 0) + XCTAssertEqual(pointerValue % ANEMemoryUtils.aneAlignment, 0) } } From a1c6426060c2d3373d7dde64f7cd7f552b8d6fad Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 20:13:09 -0400 Subject: [PATCH 05/16] Clean up AsrTranscription: remove dead code, extract helpers, eliminate duplication - Remove unused import OSLog and dead sliceEncoderOutput method - Add clearCachedCtcData() helper to eliminate repeated nil assignments - Add decoderState(for:)/setDecoderState(_:for:) to eliminate switch duplication - Extract frameAlignedAudio() helper for duplicated frame-alignment logic - Add ASRConstants.secondsPerEncoderFrame to replace magic number 0.08 - Replace hardcoded 16_000 with config.sampleRate - Remove unused duration parameter from calculateConfidence - Simplify processTranscriptionResult by removing dead tokenTimings parameter - Replace convertTokensWithExistingTimings with simpler convertTokensToText --- .../FluidAudio/ASR/Parakeet/AsrManager.swift | 101 ++++------- .../ASR/Parakeet/AsrTranscription.swift | 167 +++++------------- Sources/FluidAudio/Shared/ASRConstants.swift | 3 + .../ASR/Parakeet/AsrTranscriptionTests.swift | 16 +- 4 files changed, 89 insertions(+), 198 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift index a4135df79..0a16dae16 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift @@ -38,6 +38,22 @@ public actor AsrManager { internal var microphoneDecoderState: TdtDecoderState internal var systemDecoderState: TdtDecoderState + /// Get decoder state for a given audio source. + internal func decoderState(for source: AudioSource) -> TdtDecoderState { + switch source { + case .microphone: return microphoneDecoderState + case .system: return systemDecoderState + } + } + + /// Set decoder state for a given audio source. + internal func setDecoderState(_ state: TdtDecoderState, for source: AudioSource) { + switch source { + case .microphone: microphoneDecoderState = state + case .system: systemDecoderState = state + } + } + // Cached CTC logits from fused Preprocessor (unified custom vocabulary) internal var cachedCtcLogits: MLMultiArray? internal var cachedCtcFrameDuration: Double? @@ -46,6 +62,13 @@ public actor AsrManager { /// Whether the Preprocessor outputs CTC logits (unified custom vocabulary model). public var hasCachedCtcLogits: Bool { cachedCtcLogits != nil } + /// Clear all cached CTC data (logits, frame duration, valid frames). + internal func clearCachedCtcData() { + cachedCtcLogits = nil + cachedCtcFrameDuration = nil + cachedCtcValidFrames = nil + } + /// Get cached CTC raw logits as [[Float]] for external use (e.g. benchmarks). /// These are raw logits — callers must apply `CtcKeywordSpotter.applyLogSoftmax()` /// to convert to log-probabilities before use in keyword detection. @@ -211,16 +234,7 @@ public actor AsrManager { throw ASRError.notInitialized } - // Get the appropriate decoder state - var state: TdtDecoderState - switch source { - case .microphone: - state = microphoneDecoderState - case .system: - state = systemDecoderState - } - - // Reset the existing decoder state to clear all cached values including predictorOutput + var state = decoderState(for: source) state.reset() let initDecoderInput = try prepareDecoderInput( @@ -234,14 +248,7 @@ public actor AsrManager { ) state.update(from: initDecoderOutput) - - // Store back - switch source { - case .microphone: - microphoneDecoderState = state - case .system: - systemDecoderState = state - } + setDecoderState(state, for: source) } public func resetState() { @@ -249,9 +256,7 @@ public actor AsrManager { let layers = asrModels?.version.decoderLayers ?? 2 microphoneDecoderState = TdtDecoderState.make(decoderLayers: layers) systemDecoderState = TdtDecoderState.make(decoderLayers: layers) - cachedCtcLogits = nil - cachedCtcFrameDuration = nil - cachedCtcValidFrames = nil + clearCachedCtcData() Task { await sharedMLArrayCache.clear() } } @@ -266,10 +271,7 @@ public actor AsrManager { // Reset decoder states using fresh allocations for deterministic behavior microphoneDecoderState = TdtDecoderState.make(decoderLayers: layers) systemDecoderState = TdtDecoderState.make(decoderLayers: layers) - // Release cached CTC data - cachedCtcLogits = nil - cachedCtcFrameDuration = nil - cachedCtcValidFrames = nil + clearCachedCtcData() Task { await sharedMLArrayCache.clear() } logger.info("AsrManager resources cleaned up") } @@ -408,7 +410,7 @@ public actor AsrManager { ) let totalSamples = sampleSource.sampleCount - guard totalSamples >= 16_000 else { + guard totalSamples >= config.sampleRate else { sampleSource.cleanup() throw ASRError.invalidAudioData } @@ -502,49 +504,14 @@ public actor AsrManager { token.replacingOccurrences(of: "▁", with: " ") } - internal func convertTokensWithExistingTimings( - _ tokenIds: [Int], timings: [TokenTiming] - ) -> ( - text: String, timings: [TokenTiming] - ) { - guard !tokenIds.isEmpty else { return ("", []) } - - // SentencePiece-compatible decoding algorithm: - // 1. Convert token IDs to token strings - var tokens: [String] = [] - var tokenInfos: [(token: String, tokenId: Int, timing: TokenTiming?)] = [] - - for (index, tokenId) in tokenIds.enumerated() { - if let token = vocabulary[tokenId], !token.isEmpty { - tokens.append(token) - let timing = index < timings.count ? timings[index] : nil - tokenInfos.append((token: token, tokenId: tokenId, timing: timing)) - } - } - - // 2. Concatenate all tokens (this is how SentencePiece works) - let concatenated = tokens.joined() + /// Decode token IDs to text using SentencePiece conventions. + internal func convertTokensToText(_ tokenIds: [Int]) -> String { + guard !tokenIds.isEmpty else { return "" } - // 3. Replace ▁ with space (SentencePiece standard) - let text = concatenated.replacingOccurrences(of: "▁", with: " ") + let tokens = tokenIds.compactMap { vocabulary[$0] }.filter { !$0.isEmpty } + return tokens.joined() + .replacingOccurrences(of: "▁", with: " ") .trimmingCharacters(in: .whitespaces) - - // 4. For now, return original timings as-is - // Note: Proper timing alignment would require tracking character positions - // through the concatenation and replacement process - let adjustedTimings = tokenInfos.compactMap { info in - info.timing.map { timing in - TokenTiming( - token: normalizedTimingToken(info.token), - tokenId: info.tokenId, - startTime: timing.startTime, - endTime: timing.endTime, - confidence: timing.confidence - ) - } - } - - return (text, adjustedTimings) } nonisolated internal func extractFeatureValue( diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift index eb1430dc9..0c947f536 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrTranscription.swift @@ -1,6 +1,5 @@ @preconcurrency import CoreML import Foundation -import OSLog extension AsrManager { @@ -8,34 +7,15 @@ extension AsrManager { _ audioSamples: [Float], source: AudioSource ) async throws -> ASRResult { guard isAvailable else { throw ASRError.notInitialized } - guard audioSamples.count >= 16_000 else { throw ASRError.invalidAudioData } + guard audioSamples.count >= config.sampleRate else { throw ASRError.invalidAudioData } let startTime = Date() - // Get the appropriate decoder state - var decoderState: TdtDecoderState - switch source { - case .microphone: - decoderState = microphoneDecoderState - case .system: - decoderState = systemDecoderState - } + var decoderState = decoderState(for: source) // Route to appropriate processing method based on audio length if audioSamples.count <= ASRConstants.maxModelSamples { - let originalLength = audioSamples.count - let frameAlignedCandidate = - ((originalLength + ASRConstants.samplesPerEncoderFrame - 1) - / ASRConstants.samplesPerEncoderFrame) * ASRConstants.samplesPerEncoderFrame - let frameAlignedLength: Int - let alignedSamples: [Float] - if frameAlignedCandidate > originalLength && frameAlignedCandidate <= ASRConstants.maxModelSamples { - frameAlignedLength = frameAlignedCandidate - alignedSamples = audioSamples + Array(repeating: 0, count: frameAlignedLength - originalLength) - } else { - frameAlignedLength = originalLength - alignedSamples = audioSamples - } + let (alignedSamples, frameAlignedLength) = frameAlignedAudio(audioSamples) let paddedAudio: [Float] = padAudioIfNeeded(alignedSamples, targetLength: ASRConstants.maxModelSamples) let (hypothesis, encoderSequenceLength) = try await executeMLInferenceWithTimings( paddedAudio, @@ -55,13 +35,7 @@ extension AsrManager { processingTime: Date().timeIntervalSince(startTime) ) - // Store decoder state back - switch source { - case .microphone: - microphoneDecoderState = decoderState - case .system: - systemDecoderState = decoderState - } + setDecoderState(decoderState, for: source) return result } @@ -77,13 +51,7 @@ extension AsrManager { } ) - // Store decoder state back (ChunkProcessor uses the stored state directly) - switch source { - case .microphone: - microphoneDecoderState = decoderState - case .system: - systemDecoderState = decoderState - } + setDecoderState(decoderState, for: source) return result } @@ -157,20 +125,14 @@ extension AsrManager { cachedCtcFrameDuration = 0.04 // 40ms per frame cachedCtcValidFrames = encoderSequenceLength } else { - cachedCtcLogits = nil - cachedCtcFrameDuration = nil - cachedCtcValidFrames = nil + clearCachedCtcData() } } catch { logger.warning("CTC head inference failed: \(error.localizedDescription)") - cachedCtcLogits = nil - cachedCtcFrameDuration = nil - cachedCtcValidFrames = nil + clearCachedCtcData() } } else { - cachedCtcLogits = nil - cachedCtcFrameDuration = nil - cachedCtcValidFrames = nil + clearCachedCtcData() } // Calculate actual audio frames if not provided using shared constants @@ -248,25 +210,10 @@ extension AsrManager { previousTokens: [Int] = [], isLastChunk: Bool = false ) async throws -> (tokens: [Int], timestamps: [Int], confidences: [Float], encoderSequenceLength: Int) { - // Select and copy decoder state for the source - var state = (source == .microphone) ? microphoneDecoderState : systemDecoderState + var state = decoderState(for: source) - let originalLength = chunkSamples.count - let frameAlignedCandidate = - ((originalLength + ASRConstants.samplesPerEncoderFrame - 1) - / ASRConstants.samplesPerEncoderFrame) * ASRConstants.samplesPerEncoderFrame - let frameAlignedLength: Int - let alignedSamples: [Float] - if previousTokens.isEmpty - && frameAlignedCandidate > originalLength - && frameAlignedCandidate <= ASRConstants.maxModelSamples - { - frameAlignedLength = frameAlignedCandidate - alignedSamples = chunkSamples + Array(repeating: 0, count: frameAlignedLength - originalLength) - } else { - frameAlignedLength = originalLength - alignedSamples = chunkSamples - } + let (alignedSamples, frameAlignedLength) = frameAlignedAudio( + chunkSamples, allowAlignment: previousTokens.isEmpty) let padded = padAudioIfNeeded(alignedSamples, targetLength: ASRConstants.maxModelSamples) let (hypothesis, encLen) = try await executeMLInferenceWithTimings( padded, @@ -277,12 +224,7 @@ extension AsrManager { isLastChunk: isLastChunk ) - // Persist updated state back to the source-specific slot - if source == .microphone { - microphoneDecoderState = state - } else { - systemDecoderState = state - } + setDecoderState(state, for: source) // Apply token deduplication if previous tokens are provided if !previousTokens.isEmpty && hypothesis.hasTokens { @@ -307,23 +249,16 @@ extension AsrManager { tokenDurations: [Int] = [], encoderSequenceLength: Int, audioSamples: [Float], - processingTime: TimeInterval, - tokenTimings: [TokenTiming] = [] + processingTime: TimeInterval ) -> ASRResult { - let (text, finalTimings) = convertTokensWithExistingTimings(tokenIds, timings: tokenTimings) + let text = convertTokensToText(tokenIds) let duration = TimeInterval(audioSamples.count) / TimeInterval(config.sampleRate) - // Convert timestamps to TokenTiming objects if provided - let timingsFromTimestamps = createTokenTimings( + let resultTimings = createTokenTimings( from: tokenIds, timestamps: timestamps, confidences: confidences, tokenDurations: tokenDurations) - // Use existing timings if provided, otherwise use timings from timestamps - let resultTimings = tokenTimings.isEmpty ? timingsFromTimestamps : finalTimings - - // Calculate confidence based on actual model confidence scores from TDT decoder let confidence = calculateConfidence( - duration: duration, tokenCount: tokenIds.count, isEmpty: text.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty, tokenConfidences: confidences @@ -338,6 +273,27 @@ extension AsrManager { ) } + /// Align audio samples to encoder frame boundaries by zero-padding to the next frame boundary. + /// Returns the aligned samples and the frame-aligned length. + /// - Parameters: + /// - audioSamples: Raw audio samples + /// - allowAlignment: When false, skip alignment (e.g. when previous context exists) + nonisolated internal func frameAlignedAudio( + _ audioSamples: [Float], allowAlignment: Bool = true + ) -> (samples: [Float], frameAlignedLength: Int) { + let originalLength = audioSamples.count + let frameAlignedCandidate = + ((originalLength + ASRConstants.samplesPerEncoderFrame - 1) + / ASRConstants.samplesPerEncoderFrame) * ASRConstants.samplesPerEncoderFrame + if allowAlignment && frameAlignedCandidate > originalLength + && frameAlignedCandidate <= ASRConstants.maxModelSamples + { + let aligned = audioSamples + Array(repeating: 0, count: frameAlignedCandidate - originalLength) + return (aligned, frameAlignedCandidate) + } + return (audioSamples, originalLength) + } + nonisolated internal func padAudioIfNeeded(_ audioSamples: [Float], targetLength: Int) -> [Float] { guard audioSamples.count < targetLength else { return audioSamples } return audioSamples + Array(repeating: 0, count: targetLength - audioSamples.count) @@ -347,7 +303,7 @@ extension AsrManager { /// Returns the average of token-level softmax probabilities from the decoder /// Range: 0.1 (empty transcription) to 1.0 (perfect confidence) nonisolated private func calculateConfidence( - duration: Double, tokenCount: Int, isEmpty: Bool, tokenConfidences: [Float] + tokenCount: Int, isEmpty: Bool, tokenConfidences: [Float] ) -> Float { // Empty transcription gets low confidence if isEmpty { @@ -391,27 +347,25 @@ extension AsrManager { // Sort by timestamp to ensure chronological order let sortedData = combinedData.sorted { $0.timestamp < $1.timestamp } + let frameDuration = ASRConstants.secondsPerEncoderFrame + for i in 0.. 0 { - // Use actual token duration (convert frames to time: duration * 0.08) - let durationInSeconds = TimeInterval(data.duration) * 0.08 - endTime = startTime + max(durationInSeconds, 0.08) // Minimum 80ms duration + let durationInSeconds = TimeInterval(data.duration) * frameDuration + endTime = startTime + max(durationInSeconds, frameDuration) } else if i < sortedData.count - 1 { - // Fallback: Use next token's start time as this token's end time - let nextStartTime = TimeInterval(sortedData[i + 1].timestamp) * 0.08 - endTime = max(nextStartTime, startTime + 0.08) // Ensure end > start + let nextStartTime = TimeInterval(sortedData[i + 1].timestamp) * frameDuration + endTime = max(nextStartTime, startTime + frameDuration) } else { - // Last token: assume minimum duration - endTime = startTime + 0.08 + endTime = startTime + frameDuration } // Validate that end time is after start time @@ -437,37 +391,6 @@ extension AsrManager { return timings } - /// Slice encoder output to remove left context frames (following NeMo approach) - nonisolated private func sliceEncoderOutput( - _ encoderOutput: MLMultiArray, - from startFrame: Int, - newLength: Int - ) throws -> MLMultiArray { - let shape = encoderOutput.shape - let batchSize = shape[0].intValue - let hiddenSize = shape[2].intValue - - // Create new array with sliced dimensions - let slicedArray = try MLMultiArray( - shape: [batchSize, newLength, hiddenSize] as [NSNumber], - dataType: encoderOutput.dataType - ) - - // Copy data from startFrame onwards - let sourcePtr = encoderOutput.dataPointer.bindMemory(to: Float.self, capacity: encoderOutput.count) - let destPtr = slicedArray.dataPointer.bindMemory(to: Float.self, capacity: slicedArray.count) - - for t in 0.. Date: Sat, 28 Mar 2026 20:14:56 -0400 Subject: [PATCH 06/16] Fix enableFP16 parameter ignored in optimizedConfiguration, remove dead code - Apply enableFP16 to allowLowPrecisionAccumulationOnGPU in optimizedConfiguration (fixes review feedback on PR #460) - Remove dead loadWithANEOptimization method (no callers) - Remove unused import OSLog --- .../FluidAudio/ASR/Parakeet/AsrModels.swift | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift index d8dbd618c..d28a372ad 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrModels.swift @@ -1,6 +1,5 @@ @preconcurrency import CoreML import Foundation -import OSLog /// ASR model version enum public enum AsrModelVersion: Sendable { @@ -331,8 +330,6 @@ extension AsrModels { return try await load(from: targetDir, configuration: configuration, progressHandler: progressHandler) } - /// Load models with ANE-optimized configurations - private static func describeComputeUnits(_ units: MLComputeUnits) -> String { switch units { case .cpuOnly: @@ -348,18 +345,6 @@ extension AsrModels { } } - public static func loadWithANEOptimization( - from directory: URL? = nil, - enableFP16: Bool = true - ) async throws -> AsrModels { - let targetDir = directory ?? defaultCacheDirectory() - - logger.info("Loading ASR models with ANE optimization from: \(targetDir.path)") - - // Use the load method that already applies per-model optimizations - return try await load(from: targetDir, configuration: nil) - } - public static func defaultConfiguration() -> MLModelConfiguration { // Prefer Neural Engine across platforms for ASR inference to avoid GPU dispatch. MLModelConfigurationUtils.defaultConfiguration(computeUnits: .cpuAndNeuralEngine) @@ -370,9 +355,11 @@ extension AsrModels { enableFP16: Bool = true ) -> MLModelConfiguration { let isCI = ProcessInfo.processInfo.environment["CI"] != nil - return MLModelConfigurationUtils.defaultConfiguration( + let config = MLModelConfigurationUtils.defaultConfiguration( computeUnits: isCI ? .cpuOnly : .cpuAndNeuralEngine ) + config.allowLowPrecisionAccumulationOnGPU = enableFP16 + return config } /// Create optimized prediction options for inference From e05cbeb91243ed2a4afe2a5656c989cff162a02f Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 20:22:57 -0400 Subject: [PATCH 07/16] Remove dead PerformanceMonitor/AggregatedMetrics, move metrics to Shared - Delete PerformanceMonitor actor (never instantiated, component times hardcoded to 0) - Delete AggregatedMetrics struct (only used by dead monitor) - Remove unused imports (os, MachTaskSelfWrapper) - Move ASRPerformanceMetrics to Shared/ (not Parakeet-specific) - Remove dead PerformanceMonitor tests, keep ASRPerformanceMetrics tests --- .../ASR/Parakeet/PerformanceMetrics.swift | 155 ------------------ .../Shared/PerformanceMetrics.swift | 25 +++ .../Parakeet/PerformanceMetricsTests.swift | 77 --------- 3 files changed, 25 insertions(+), 232 deletions(-) delete mode 100644 Sources/FluidAudio/ASR/Parakeet/PerformanceMetrics.swift create mode 100644 Sources/FluidAudio/Shared/PerformanceMetrics.swift diff --git a/Sources/FluidAudio/ASR/Parakeet/PerformanceMetrics.swift b/Sources/FluidAudio/ASR/Parakeet/PerformanceMetrics.swift deleted file mode 100644 index b154f4725..000000000 --- a/Sources/FluidAudio/ASR/Parakeet/PerformanceMetrics.swift +++ /dev/null @@ -1,155 +0,0 @@ -import Foundation -import MachTaskSelfWrapper -import os - -/// Performance metrics for ASR processing -public struct ASRPerformanceMetrics: Codable, Sendable { - public let preprocessorTime: TimeInterval - public let encoderTime: TimeInterval - public let decoderTime: TimeInterval - public let totalProcessingTime: TimeInterval - public let rtfx: Float // Real-time factor - public let peakMemoryMB: Float - public let gpuUtilization: Float? - - public var summary: String { - """ - Performance Metrics: - - Preprocessor: \(String(format: "%.3f", preprocessorTime))s - - Encoder: \(String(format: "%.3f", encoderTime))s - - Decoder: \(String(format: "%.3f", decoderTime))s - - Total: \(String(format: "%.3f", totalProcessingTime))s - - RTFx: \(String(format: "%.1f", rtfx))x real-time - - Peak Memory: \(String(format: "%.1f", peakMemoryMB)) MB - - GPU Utilization: \(gpuUtilization.map { String(format: "%.1f%%", $0) } ?? "N/A") - """ - } -} - -/// Performance monitor for tracking ASR metrics -public actor PerformanceMonitor { - - public init() {} - private let logger = AppLogger(category: "Performance") - private var metrics: [ASRPerformanceMetrics] = [] - private let signpostLogger = OSSignposter(subsystem: AppLogger.defaultSubsystem, category: "Performance") - - /// Track performance for a processing session - public func trackSession( - operation: String, - audioLengthSeconds: Float, - block: @escaping () async throws -> T - ) async throws -> (result: T, metrics: ASRPerformanceMetrics) { - let sessionID = signpostLogger.makeSignpostID() - let state = signpostLogger.beginInterval("ASR.Operation", id: sessionID) - - let startTime = Date() - let startMemory = getCurrentMemoryUsage() - - // Track individual components - let preprocessorTime: TimeInterval = 0 - let encoderTime: TimeInterval = 0 - let decoderTime: TimeInterval = 0 - - // Execute the operation - let result = try await block() - - let totalTime = Date().timeIntervalSince(startTime) - let peakMemory = max(startMemory, getCurrentMemoryUsage()) - let rtfx = audioLengthSeconds / Float(totalTime) - - signpostLogger.endInterval("ASR.Operation", state) - - let metrics = ASRPerformanceMetrics( - preprocessorTime: preprocessorTime, - encoderTime: encoderTime, - decoderTime: decoderTime, - totalProcessingTime: totalTime, - rtfx: rtfx, - peakMemoryMB: peakMemory, - gpuUtilization: nil // Would require Metal performance counters - ) - - self.metrics.append(metrics) - logger.info("\(operation) completed: \(metrics.summary)") - - return (result, metrics) - } - - /// Track a specific component's execution time - public func trackComponent( - _ component: String, - block: @escaping () async throws -> T - ) async throws -> (result: T, time: TimeInterval) { - let componentID = signpostLogger.makeSignpostID() - let state = signpostLogger.beginInterval("ASR.Component", id: componentID) - - let startTime = Date() - let result = try await block() - let time = Date().timeIntervalSince(startTime) - - signpostLogger.endInterval("ASR.Component", state) - - return (result, time) - } - - /// Get aggregated metrics - public func getAggregatedMetrics() -> AggregatedMetrics? { - guard !metrics.isEmpty else { return nil } - - let avgRTFx = metrics.map { $0.rtfx }.reduce(0, +) / Float(metrics.count) - let avgProcessingTime = metrics.map { $0.totalProcessingTime }.reduce(0, +) / Double(metrics.count) - let maxMemory = metrics.map { $0.peakMemoryMB }.max() ?? 0 - - return AggregatedMetrics( - averageRTFx: avgRTFx, - averageProcessingTime: avgProcessingTime, - maxMemoryMB: maxMemory, - sampleCount: metrics.count - ) - } - - /// Clear all stored metrics - public func reset() { - metrics.removeAll() - } - - /// Get current memory usage in MB - private func getCurrentMemoryUsage() -> Float { - var info = mach_task_basic_info() - var count = mach_msg_type_number_t(MemoryLayout.size) / 4 - - let result = withUnsafeMutablePointer(to: &info) { - $0.withMemoryRebound(to: integer_t.self, capacity: 1) { - task_info( - get_current_task_port(), - task_flavor_t(MACH_TASK_BASIC_INFO), - $0, - &count) - } - } - - if result == KERN_SUCCESS { - return Float(info.resident_size) / 1024.0 / 1024.0 - } - - return 0 - } -} - -/// Aggregated performance metrics -public struct AggregatedMetrics: Sendable { - public let averageRTFx: Float - public let averageProcessingTime: TimeInterval - public let maxMemoryMB: Float - public let sampleCount: Int - - public var summary: String { - """ - Aggregated Metrics (\(sampleCount) samples): - - Average RTFx: \(String(format: "%.1f", averageRTFx))x real-time - - Average Processing Time: \(String(format: "%.3f", averageProcessingTime))s - - Max Memory Usage: \(String(format: "%.1f", maxMemoryMB)) MB - """ - } -} diff --git a/Sources/FluidAudio/Shared/PerformanceMetrics.swift b/Sources/FluidAudio/Shared/PerformanceMetrics.swift new file mode 100644 index 000000000..271ac84bf --- /dev/null +++ b/Sources/FluidAudio/Shared/PerformanceMetrics.swift @@ -0,0 +1,25 @@ +import Foundation + +/// Performance metrics for ASR processing +public struct ASRPerformanceMetrics: Codable, Sendable { + public let preprocessorTime: TimeInterval + public let encoderTime: TimeInterval + public let decoderTime: TimeInterval + public let totalProcessingTime: TimeInterval + public let rtfx: Float // Real-time factor + public let peakMemoryMB: Float + public let gpuUtilization: Float? + + public var summary: String { + """ + Performance Metrics: + - Preprocessor: \(String(format: "%.3f", preprocessorTime))s + - Encoder: \(String(format: "%.3f", encoderTime))s + - Decoder: \(String(format: "%.3f", decoderTime))s + - Total: \(String(format: "%.3f", totalProcessingTime))s + - RTFx: \(String(format: "%.1f", rtfx))x real-time + - Peak Memory: \(String(format: "%.1f", peakMemoryMB)) MB + - GPU Utilization: \(gpuUtilization.map { String(format: "%.1f%%", $0) } ?? "N/A") + """ + } +} diff --git a/Tests/FluidAudioTests/ASR/Parakeet/PerformanceMetricsTests.swift b/Tests/FluidAudioTests/ASR/Parakeet/PerformanceMetricsTests.swift index 81cce4879..514dd372e 100644 --- a/Tests/FluidAudioTests/ASR/Parakeet/PerformanceMetricsTests.swift +++ b/Tests/FluidAudioTests/ASR/Parakeet/PerformanceMetricsTests.swift @@ -42,81 +42,4 @@ final class PerformanceMetricsTests: XCTestCase { let summary = metrics.summary XCTAssertTrue(summary.contains("N/A"), "Summary should show N/A for nil GPU utilization") } - - // MARK: - AggregatedMetrics - - func testAggregatedMetricsSummaryFormatting() { - let aggregated = AggregatedMetrics( - averageRTFx: 8.5, - averageProcessingTime: 1.234, - maxMemoryMB: 512.0, - sampleCount: 10 - ) - - let summary = aggregated.summary - XCTAssertTrue(summary.contains("10 samples"), "Summary should contain sample count") - XCTAssertTrue(summary.contains("8.5"), "Summary should contain average RTFx") - XCTAssertTrue(summary.contains("1.234"), "Summary should contain average processing time") - XCTAssertTrue(summary.contains("512.0"), "Summary should contain max memory") - } - - // MARK: - PerformanceMonitor - - func testAggregatedMetricsEmptyReturnsNil() async { - let monitor = PerformanceMonitor() - let result = await monitor.getAggregatedMetrics() - XCTAssertNil(result, "Empty monitor should return nil for aggregated metrics") - } - - func testResetClearsMetrics() async throws { - let monitor = PerformanceMonitor() - - // Track a session to add metrics - _ = try await monitor.trackSession(operation: "test", audioLengthSeconds: 1.0) { - return 42 - } - - // Verify metrics exist - let before = await monitor.getAggregatedMetrics() - XCTAssertNotNil(before) - - // Reset and verify empty - await monitor.reset() - let after = await monitor.getAggregatedMetrics() - XCTAssertNil(after, "After reset, aggregated metrics should be nil") - } - - func testTrackSessionReturnsMetrics() async throws { - let monitor = PerformanceMonitor() - - let (result, metrics) = try await monitor.trackSession( - operation: "test", - audioLengthSeconds: 2.0 - ) { - return "hello" - } - - XCTAssertEqual(result, "hello") - XCTAssertGreaterThanOrEqual(metrics.totalProcessingTime, 0) - XCTAssertGreaterThan(metrics.rtfx, 0) - } - - func testAggregatedMetricsComputation() async throws { - let monitor = PerformanceMonitor() - - for i in 0..<3 { - _ = try await monitor.trackSession( - operation: "test\(i)", - audioLengthSeconds: Float(i + 1) - ) { - return i - } - } - - let aggregated = await monitor.getAggregatedMetrics() - XCTAssertNotNil(aggregated) - XCTAssertEqual(aggregated?.sampleCount, 3) - XCTAssertGreaterThan(aggregated!.averageRTFx, 0) - XCTAssertGreaterThan(aggregated!.averageProcessingTime, 0) - } } From 7c6b2d60093d193c0d3d756ff5ab848a6ea6c3aa Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 20:25:53 -0400 Subject: [PATCH 08/16] Simplify ProgressEmitter: remove dead code paths, move to Shared - Remove redundant currentStream() wrapper (callers use ensureSession) - Fix finishSession: return early when inactive instead of creating an orphan stream - Remove auto-create in resetAndPrepareNextSession (renamed to reset); next ensureSession() creates on demand - Remove onTermination closure with unnecessary weak self - Move from ASR/Parakeet/ to Shared/ (generic async stream utility) --- .../FluidAudio/ASR/Parakeet/AsrManager.swift | 2 +- .../Parakeet => Shared}/ProgressEmitter.swift | 34 ++++++------------- 2 files changed, 11 insertions(+), 25 deletions(-) rename Sources/FluidAudio/{ASR/Parakeet => Shared}/ProgressEmitter.swift (52%) diff --git a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift index 0a16dae16..3d7f12e17 100644 --- a/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift +++ b/Sources/FluidAudio/ASR/Parakeet/AsrManager.swift @@ -128,7 +128,7 @@ public actor AsrManager { /// Only one session is supported at a time. public var transcriptionProgressStream: AsyncThrowingStream { get async { - await progressEmitter.currentStream() + await progressEmitter.ensureSession() } } diff --git a/Sources/FluidAudio/ASR/Parakeet/ProgressEmitter.swift b/Sources/FluidAudio/Shared/ProgressEmitter.swift similarity index 52% rename from Sources/FluidAudio/ASR/Parakeet/ProgressEmitter.swift rename to Sources/FluidAudio/Shared/ProgressEmitter.swift index b0ea84a4c..cea1674ed 100644 --- a/Sources/FluidAudio/ASR/Parakeet/ProgressEmitter.swift +++ b/Sources/FluidAudio/Shared/ProgressEmitter.swift @@ -7,38 +7,33 @@ actor ProgressEmitter { init() {} - func ensureSession() async -> AsyncThrowingStream { + func ensureSession() -> AsyncThrowingStream { if let stream = streamStorage { return stream } - return await startSession() + return startSession() } - func currentStream() async -> AsyncThrowingStream { - await ensureSession() - } - - func report(progress: Double) async { + func report(progress: Double) { guard isActive else { return } let clamped = min(max(progress, 0.0), 1.0) continuation?.yield(clamped) } - func finishSession() async { - guard isActive else { - _ = await ensureSession() - return - } + func finishSession() { + guard isActive else { return } continuation?.yield(1.0) continuation?.finish() + reset() } - func failSession(_ error: Error) async { + func failSession(_ error: Error) { continuation?.finish(throwing: error) + reset() } - private func startSession() async -> AsyncThrowingStream { + private func startSession() -> AsyncThrowingStream { if let stream = streamStorage { return stream } @@ -48,22 +43,13 @@ actor ProgressEmitter { self.continuation = continuation self.isActive = true - continuation.onTermination = - { [weak self] (_: AsyncThrowingStream.Continuation.Termination) in - Task { [weak self] in - guard let self else { return } - await self.resetAndPrepareNextSession() - } - } - continuation.yield(0.0) return stream } - private func resetAndPrepareNextSession() async { + private func reset() { continuation = nil streamStorage = nil isActive = false - _ = await startSession() } } From 888b02430286c2c3db88cb8b732754b2b12e189a Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 20:31:29 -0400 Subject: [PATCH 09/16] Clean up MLArrayCache: remove dead code, fix resetData bug, move to Shared - Remove unused `import os` and logger - Delete dead `getFloat16Array` method and its 2 tests - Fix `returnArray` to reset data for all types, not just float32 - Remove debug logging from hot path - Move from ASR/Parakeet/ to Shared/ (used by ASR and shared cache) --- .../Parakeet => Shared}/MLArrayCache.swift | 29 ++--------- .../Shared/MLArrayCacheTests.swift | 52 ------------------- 2 files changed, 3 insertions(+), 78 deletions(-) rename Sources/FluidAudio/{ASR/Parakeet => Shared}/MLArrayCache.swift (64%) diff --git a/Sources/FluidAudio/ASR/Parakeet/MLArrayCache.swift b/Sources/FluidAudio/Shared/MLArrayCache.swift similarity index 64% rename from Sources/FluidAudio/ASR/Parakeet/MLArrayCache.swift rename to Sources/FluidAudio/Shared/MLArrayCache.swift index 3ede440dd..b62ba4fbd 100644 --- a/Sources/FluidAudio/ASR/Parakeet/MLArrayCache.swift +++ b/Sources/FluidAudio/Shared/MLArrayCache.swift @@ -1,12 +1,10 @@ import CoreML import Foundation -import os /// Thread-safe cache for MLMultiArray instances to reduce allocation overhead actor MLArrayCache { private var cache: [CacheKey: [MLMultiArray]] = [:] private let maxCacheSize: Int - private let logger = AppLogger(category: "MLArrayCache") struct CacheKey: Hashable { let shape: [Int] @@ -24,10 +22,7 @@ actor MLArrayCache { dataType: dataType ) - // Check if we have a cached array if var arrays = cache[key], !arrays.isEmpty { - // Never return the same buffer twice while it is still in use; keep the trimmed bucket so we only - // hand out arrays that callers have explicitly returned to the cache. let array = arrays.removeLast() cache[key] = arrays return array @@ -47,20 +42,14 @@ actor MLArrayCache { // Limit cache size per key if arrays.count < maxCacheSize / max(cache.count, 1) { - // Reset the array data before caching - if array.dataType == .float32 { - array.resetData(to: 0) - } + array.resetData(to: 0) arrays.append(array) cache[key] = arrays - logger.debug("Returned array to cache for shape: \(array.shape)") } } /// Pre-warm the cache with commonly used shapes - func prewarm(shapes: [(shape: [NSNumber], dataType: MLMultiArrayDataType)]) async { - logger.info("Pre-warming cache with \(shapes.count) shapes") - + func prewarm(shapes: [(shape: [NSNumber], dataType: MLMultiArrayDataType)]) { for (shape, dataType) in shapes { do { var arrays: [MLMultiArray] = [] @@ -74,26 +63,14 @@ actor MLArrayCache { let key = CacheKey(shape: shape.map { $0.intValue }, dataType: dataType) cache[key] = arrays } catch { - logger.error("Failed to pre-warm shape \(shape): \(error)") + // Silently skip shapes that fail to allocate during pre-warm } } } - /// Get a Float16 array (converting from Float32 if needed) - func getFloat16Array(shape: [NSNumber], from float32Array: MLMultiArray? = nil) throws -> MLMultiArray { - if let float32Array = float32Array { - // Convert existing array to Float16 - return try ANEMemoryUtils.convertToFloat16(float32Array) - } else { - // Get new Float16 array from cache - return try getArray(shape: shape, dataType: .float16) - } - } - /// Clear the cache func clear() { cache.removeAll() - logger.info("Cache cleared") } } diff --git a/Tests/FluidAudioTests/Shared/MLArrayCacheTests.swift b/Tests/FluidAudioTests/Shared/MLArrayCacheTests.swift index 63cecadc4..09673423b 100644 --- a/Tests/FluidAudioTests/Shared/MLArrayCacheTests.swift +++ b/Tests/FluidAudioTests/Shared/MLArrayCacheTests.swift @@ -97,58 +97,6 @@ final class MLArrayCacheTests: XCTestCase { XCTAssertNotNil(finalArray) } - // MARK: - Float16 Support - - func testGetFloat16ArrayFromScratch() async throws { - let shape: [NSNumber] = [1, 64] // Smaller for CI stability - let fp16Array = try await cache.getFloat16Array(shape: shape) - - XCTAssertEqual(fp16Array.shape, shape) - - // In CI, we might get Float32 instead of Float16 for stability - let isCI = ProcessInfo.processInfo.environment["CI"] != nil - if isCI { - // In CI, accept either Float16 or Float32 - XCTAssertTrue(fp16Array.dataType == .float16 || fp16Array.dataType == .float32) - } else { - XCTAssertEqual(fp16Array.dataType, .float16) - - // Verify ANE alignment only in non-CI environment - let pointerValue = Int(bitPattern: fp16Array.dataPointer) - XCTAssertEqual(pointerValue % ANEMemoryUtils.aneAlignment, 0) - } - } - - func testGetFloat16ArrayFromFloat32() async throws { - // Create Float32 array - let shape: [NSNumber] = [50] // Smaller for CI stability - let float32Array = try MLMultiArray(shape: shape, dataType: .float32) - - // Fill with test values - for i in 0.. Date: Sat, 28 Mar 2026 20:34:51 -0400 Subject: [PATCH 10/16] Clean up ChunkProcessor: remove dead imports, use constants, fix cutoff bug - Remove unused `import CoreML` and `import OSLog` - Replace hardcoded `sampleRate = 16000` with `ASRConstants.sampleRate` - Replace manual frameDuration calculation with `ASRConstants.secondsPerEncoderFrame` - Fix duplicate token at cutoff boundary in mergeByMidpoint (`<=` to `<`) --- Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift b/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift index 92a62ed0e..f96a3b2c6 100644 --- a/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift +++ b/Sources/FluidAudio/ASR/Parakeet/ChunkProcessor.swift @@ -1,6 +1,4 @@ -import CoreML import Foundation -import OSLog struct ChunkProcessor { let sampleSource: AudioSampleSource @@ -18,7 +16,6 @@ struct ChunkProcessor { // Stateless chunking aligned with CoreML reference: // - process ~14.96s of audio per window (frame-aligned) to stay under encoder limit // - 2.0s overlap (frame-aligned) to give the decoder slack when merging windows - private let sampleRate: Int = 16000 private let overlapSeconds: Double = 2.0 /// Context samples prepended from previous chunk for mel spectrogram stability (80ms = 1 encoder frame). @@ -36,7 +33,7 @@ struct ChunkProcessor { return raw / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame } private var overlapSamples: Int { - let requested = Int(overlapSeconds * Double(sampleRate)) + let requested = Int(overlapSeconds * Double(ASRConstants.sampleRate)) let capped = min(requested, chunkSamples / 2) return capped / ASRConstants.samplesPerEncoderFrame * ASRConstants.samplesPerEncoderFrame } @@ -219,7 +216,7 @@ struct ChunkProcessor { if left.isEmpty { return right } if right.isEmpty { return left } - let frameDuration = Double(ASRConstants.samplesPerEncoderFrame) / Double(sampleRate) + let frameDuration = ASRConstants.secondsPerEncoderFrame let overlapDuration = overlapSeconds let halfOverlapWindow = overlapDuration / 2 @@ -433,7 +430,7 @@ struct ChunkProcessor { frameDuration: Double ) -> [TokenWindow] { let cutoff = (leftEndTime + rightStartTime) / 2 - let trimmedLeft = left.filter { Double($0.timestamp) * frameDuration <= cutoff } + let trimmedLeft = left.filter { Double($0.timestamp) * frameDuration < cutoff } let trimmedRight = right.filter { Double($0.timestamp) * frameDuration >= cutoff } return trimmedLeft + trimmedRight } From f553e58d2e42aa1044a117c358cd58b6fe509f9d Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 20:39:46 -0400 Subject: [PATCH 11/16] Add run_parakeet_benchmarks.sh and reference it in benchmarks100.md Track the benchmark orchestration script that runs all 4 Parakeet model benchmarks (v3, v2, TDT-CTC-110M, CTC earnings) with asset verification and sleep prevention. Link it from the benchmark results doc for reproducibility. Whitelist the script in .gitignore (scripts/ was ignored). --- .gitignore | 1 + Documentation/ASR/benchmarks100.md | 12 ++ Scripts/run_parakeet_benchmarks.sh | 237 +++++++++++++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100755 Scripts/run_parakeet_benchmarks.sh diff --git a/.gitignore b/.gitignore index 8594bc323..72d625aa5 100644 --- a/.gitignore +++ b/.gitignore @@ -102,6 +102,7 @@ Resources/ !Sources/FluidAudio/Resources/ !Sources/FluidAudio/Resources/** scripts/ +!Scripts/run_parakeet_benchmarks.sh Documentation/parakeet-tdt/ docs/parakeet-tdt/ diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index 6220c153e..9358752c8 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -2,6 +2,18 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-structure`) to verify the directory restructuring introduces no regressions. +## Reproduction + +All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/run_parakeet_benchmarks.sh`](../../Scripts/run_parakeet_benchmarks.sh): + +```bash +# Download models and datasets (requires internet) +./Scripts/run_parakeet_benchmarks.sh --download + +# Run all 4 benchmarks offline (100 files each, sleep-prevented) +./Scripts/run_parakeet_benchmarks.sh +``` + ## Environment - **Hardware**: MacBook Air M2, 16 GB diff --git a/Scripts/run_parakeet_benchmarks.sh b/Scripts/run_parakeet_benchmarks.sh new file mode 100755 index 000000000..c3d8f93e0 --- /dev/null +++ b/Scripts/run_parakeet_benchmarks.sh @@ -0,0 +1,237 @@ +#!/bin/bash +# Run all Parakeet model benchmarks (100 files each) with sleep prevention. +# +# Benchmarks: +# 1. ASR v3 — parakeet-tdt-0.6b-v3 on LibriSpeech test-clean +# 2. ASR v2 — parakeet-tdt-0.6b-v2 on LibriSpeech test-clean +# 3. ASR tdt-ctc-110m — parakeet-tdt-ctc-110m on LibriSpeech test-clean +# 4. CTC custom vocab — ctc-earnings-benchmark (tdt-ctc-110m + CTC 110m keyword spotting) +# +# Usage: +# ./Scripts/run_parakeet_benchmarks.sh # verify + run +# ./Scripts/run_parakeet_benchmarks.sh --download # download missing assets, then exit +# +# The script verifies all models and dataset files exist locally before running. +# If anything is missing it will tell you exactly what and exit (unless --download). +# Uses caffeinate to prevent sleep so you can close the lid. +# Results are saved to benchmark_results/ with timestamps. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +RESULTS_DIR="$PROJECT_DIR/benchmark_results" +TIMESTAMP=$(date +"%Y%m%d_%H%M%S") +LOG_FILE="$RESULTS_DIR/benchmark_${TIMESTAMP}.log" +MAX_FILES=100 +SUBSET="test-clean" + +MODELS_DIR="$HOME/Library/Application Support/FluidAudio/Models" +DATASETS_DIR="$HOME/Library/Application Support/FluidAudio/Datasets" +EARNINGS_DIR="$HOME/Library/Application Support/FluidAudio/earnings22-kws/test-dataset" + +mkdir -p "$RESULTS_DIR" + +log() { + echo "[$(date '+%H:%M:%S')] $*" | tee -a "$LOG_FILE" +} + +# --------------------------------------------------------------------------- +# Verify local assets +# --------------------------------------------------------------------------- +verify_assets() { + local missing=0 + + # --- Parakeet v3 --- + local v3_dir="$MODELS_DIR/parakeet-tdt-0.6b-v3" + for f in Preprocessor.mlmodelc Encoder.mlmodelc Decoder.mlmodelc JointDecision.mlmodelc parakeet_vocab.json; do + if [[ ! -e "$v3_dir/$f" ]]; then + log "MISSING v3: $v3_dir/$f" + missing=1 + fi + done + + # --- Parakeet v2 (folder may have -coreml suffix) --- + local v2_dir="" + if [[ -d "$MODELS_DIR/parakeet-tdt-0.6b-v2-coreml" ]]; then + v2_dir="$MODELS_DIR/parakeet-tdt-0.6b-v2-coreml" + elif [[ -d "$MODELS_DIR/parakeet-tdt-0.6b-v2" ]]; then + v2_dir="$MODELS_DIR/parakeet-tdt-0.6b-v2" + fi + if [[ -z "$v2_dir" ]]; then + log "MISSING v2: no parakeet-tdt-0.6b-v2* directory found" + missing=1 + else + for f in Preprocessor.mlmodelc Encoder.mlmodelc Decoder.mlmodelc JointDecision.mlmodelc parakeet_vocab.json; do + if [[ ! -e "$v2_dir/$f" ]]; then + log "MISSING v2: $v2_dir/$f" + missing=1 + fi + done + fi + + # --- TDT-CTC-110M (fused: no separate Encoder) --- + local tdt_ctc_dir="$MODELS_DIR/parakeet-tdt-ctc-110m" + for f in Preprocessor.mlmodelc Decoder.mlmodelc JointDecision.mlmodelc parakeet_vocab.json; do + if [[ ! -e "$tdt_ctc_dir/$f" ]]; then + log "MISSING tdt-ctc-110m: $tdt_ctc_dir/$f" + missing=1 + fi + done + + # --- CTC 110M model (for custom vocabulary / keyword spotting) --- + local ctc_dir="$MODELS_DIR/parakeet-ctc-110m-coreml" + for f in MelSpectrogram.mlmodelc AudioEncoder.mlmodelc vocab.json; do + if [[ ! -e "$ctc_dir/$f" ]]; then + log "MISSING ctc-110m: $ctc_dir/$f" + missing=1 + fi + done + + # --- LibriSpeech test-clean --- + local ls_dir="$DATASETS_DIR/LibriSpeech/$SUBSET" + local trans_count + trans_count=$(find "$ls_dir" -name "*.trans.txt" 2>/dev/null | wc -l | tr -d ' ') + if [[ "$trans_count" -lt 5 ]]; then + log "MISSING LibriSpeech $SUBSET: found $trans_count transcript files (need >= 5)" + missing=1 + fi + + # --- Earnings22 KWS dataset --- + local earnings_wav_count + earnings_wav_count=$(find "$EARNINGS_DIR" -maxdepth 1 -name "*.wav" 2>/dev/null | wc -l | tr -d ' ') + if [[ "$earnings_wav_count" -lt 10 ]]; then + log "MISSING Earnings22 KWS: found $earnings_wav_count wav files (need >= 10)" + missing=1 + fi + + return $missing +} + +# --------------------------------------------------------------------------- +# Phase 1: --download (verify first, download only what's missing) +# --------------------------------------------------------------------------- +if [[ "${1:-}" == "--download" ]]; then + log "=== Checking local assets ===" + + if verify_assets; then + log "All models and datasets already present locally. Nothing to download." + exit 0 + fi + + log "Some assets are missing — downloading..." + + log "Building release binary..." + cd "$PROJECT_DIR" && swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE" + CLI="$PROJECT_DIR/.build/release/fluidaudiocli" + + log "Downloading LibriSpeech $SUBSET dataset..." + "$CLI" download --dataset "librispeech-$SUBSET" 2>&1 | tee -a "$LOG_FILE" + + log "Downloading Earnings22 KWS dataset..." + "$CLI" download --dataset earnings22-kws 2>&1 | tee -a "$LOG_FILE" + + log "Pre-loading Parakeet v3 models (triggers download if missing)..." + "$CLI" asr-benchmark --model-version v3 --subset "$SUBSET" --max-files 1 \ + --output "$RESULTS_DIR/warmup_v3.json" 2>&1 | tee -a "$LOG_FILE" + + log "Pre-loading Parakeet v2 models..." + "$CLI" asr-benchmark --model-version v2 --subset "$SUBSET" --max-files 1 \ + --output "$RESULTS_DIR/warmup_v2.json" 2>&1 | tee -a "$LOG_FILE" + + log "Pre-loading TDT-CTC-110M + CTC models..." + "$CLI" ctc-earnings-benchmark --tdt-version tdt-ctc-110m --max-files 1 --auto-download \ + --output "$RESULTS_DIR/warmup_ctc.json" 2>&1 | tee -a "$LOG_FILE" + + rm -f "$RESULTS_DIR"/warmup_*.json + log "=== Downloads complete ===" + exit 0 +fi + +# --------------------------------------------------------------------------- +# Phase 2: Run benchmarks (offline-safe, sleep-prevented) +# --------------------------------------------------------------------------- +log "=== Verifying local assets before offline run ===" +if ! verify_assets; then + log "" + log "ERROR: Missing assets — cannot run offline." + log "Run with --download first while connected to the internet:" + log " ./Scripts/run_parakeet_benchmarks.sh --download" + exit 1 +fi +log "All assets verified locally." + +log "=== Parakeet benchmark suite: $MAX_FILES files x 4 benchmarks ===" +log "Results directory: $RESULTS_DIR" + +cd "$PROJECT_DIR" + +# Build release if not already built +if [[ ! -x ".build/release/fluidaudiocli" ]]; then + log "Building release binary..." + swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE" +fi +CLI="$PROJECT_DIR/.build/release/fluidaudiocli" + +# caffeinate -s: prevent sleep even on AC power / lid closed +# caffeinate -i: prevent idle sleep +# We wrap the entire benchmark suite so caffeinate dies when the script ends. +caffeinate -si -w $$ & +CAFFEINATE_PID=$! +log "caffeinate started (PID $CAFFEINATE_PID) — safe to close the lid" + +run_asr_benchmark() { + local model_version="$1" + local label="$2" + local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" + + log "--- $label: starting ($MAX_FILES files, $SUBSET) ---" + local start_time=$(date +%s) + + "$CLI" asr-benchmark \ + --model-version "$model_version" \ + --subset "$SUBSET" \ + --max-files "$MAX_FILES" \ + --no-auto-download \ + --output "$output_file" \ + 2>&1 | tee -a "$LOG_FILE" + + local end_time=$(date +%s) + local elapsed=$(( end_time - start_time )) + log "--- $label: finished in ${elapsed}s — $output_file ---" +} + +run_ctc_earnings_benchmark() { + local label="ctc_earnings_vocab" + local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" + + log "--- $label: starting ($MAX_FILES files, tdt-ctc-110m + CTC keyword spotting) ---" + local start_time=$(date +%s) + + "$CLI" ctc-earnings-benchmark \ + --tdt-version tdt-ctc-110m \ + --ctc-variant 110m \ + --max-files "$MAX_FILES" \ + --output "$output_file" \ + 2>&1 | tee -a "$LOG_FILE" + + local end_time=$(date +%s) + local elapsed=$(( end_time - start_time )) + log "--- $label: finished in ${elapsed}s — $output_file ---" +} + +SUITE_START=$(date +%s) + +run_asr_benchmark "v3" "parakeet_v3" +run_asr_benchmark "v2" "parakeet_v2" +run_asr_benchmark "tdt-ctc-110m" "parakeet_tdt_ctc_110m" +run_ctc_earnings_benchmark + +SUITE_END=$(date +%s) +SUITE_ELAPSED=$(( SUITE_END - SUITE_START )) + +log "=== All benchmarks complete in ${SUITE_ELAPSED}s ===" +log "Results:" +ls -lh "$RESULTS_DIR"/*_${TIMESTAMP}.json 2>/dev/null | tee -a "$LOG_FILE" + +# caffeinate will exit automatically since the parent process ($$) exits From cd7bea957f005aaeb07165c8df89268e1b52aa2a Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 21:21:26 -0400 Subject: [PATCH 12/16] Add EOU/Nemotron benchmarks to script, fix CTC folderName bug - Add explicit folderName cases for parakeetCtc110m and parakeetCtc06b in ModelNames.swift. The default case strips "-coreml" which broke auto-detection of the CTC model directory. - Add EOU 320ms and Nemotron 1120ms streaming benchmarks to the script - Add WER comparison table against benchmarks100.md baselines - Fix CTC earnings to use v2 TDT (matching baseline config) - Fix WER extraction for fields stored as percentages vs decimals Verified: all 6 benchmarks match baselines (v3 2.6%, v2 3.8%, TDT-CTC 3.6%, earnings 16.5%, EOU 7.11%, Nemotron 1.99%). --- Scripts/run_parakeet_benchmarks.sh | 175 ++++++++++++++++++++++++++-- Sources/FluidAudio/ModelNames.swift | 4 + 2 files changed, 172 insertions(+), 7 deletions(-) diff --git a/Scripts/run_parakeet_benchmarks.sh b/Scripts/run_parakeet_benchmarks.sh index c3d8f93e0..a54b32fec 100755 --- a/Scripts/run_parakeet_benchmarks.sh +++ b/Scripts/run_parakeet_benchmarks.sh @@ -5,7 +5,9 @@ # 1. ASR v3 — parakeet-tdt-0.6b-v3 on LibriSpeech test-clean # 2. ASR v2 — parakeet-tdt-0.6b-v2 on LibriSpeech test-clean # 3. ASR tdt-ctc-110m — parakeet-tdt-ctc-110m on LibriSpeech test-clean -# 4. CTC custom vocab — ctc-earnings-benchmark (tdt-ctc-110m + CTC 110m keyword spotting) +# 4. CTC custom vocab — ctc-earnings-benchmark (v2 TDT + CTC 110m keyword spotting) +# 5. EOU streaming — parakeet-eou 320ms on LibriSpeech test-clean +# 6. Nemotron streaming — nemotron 1120ms on LibriSpeech test-clean # # Usage: # ./Scripts/run_parakeet_benchmarks.sh # verify + run @@ -88,6 +90,16 @@ verify_assets() { fi done + # --- EOU streaming models (320ms chunks) --- + local eou_dir="$MODELS_DIR/parakeet-eou-streaming/320ms" + if [[ ! -d "$eou_dir" ]]; then + log "MISSING eou-320ms: $eou_dir" + missing=1 + fi + + # --- Nemotron models (uses v3 encoder + nemotron-specific models) --- + # Nemotron reuses the v3 models directory; no separate check needed beyond v3 above. + # --- LibriSpeech test-clean --- local ls_dir="$DATASETS_DIR/LibriSpeech/$SUBSET" local trans_count @@ -139,11 +151,18 @@ if [[ "${1:-}" == "--download" ]]; then "$CLI" asr-benchmark --model-version v2 --subset "$SUBSET" --max-files 1 \ --output "$RESULTS_DIR/warmup_v2.json" 2>&1 | tee -a "$LOG_FILE" - log "Pre-loading TDT-CTC-110M + CTC models..." - "$CLI" ctc-earnings-benchmark --tdt-version tdt-ctc-110m --max-files 1 --auto-download \ + log "Pre-loading CTC earnings models..." + "$CLI" ctc-earnings-benchmark --max-files 1 --auto-download \ --output "$RESULTS_DIR/warmup_ctc.json" 2>&1 | tee -a "$LOG_FILE" - rm -f "$RESULTS_DIR"/warmup_*.json + log "Pre-loading EOU streaming models..." + "$CLI" parakeet-eou --benchmark --chunk-size 320 --max-files 1 \ + --output "$RESULTS_DIR/warmup_eou.json" 2>&1 | tee -a "$LOG_FILE" + + log "Pre-loading Nemotron streaming models..." + "$CLI" nemotron-benchmark --max-files 1 2>&1 | tee -a "$LOG_FILE" + + rm -f "$RESULTS_DIR"/warmup_*.json /tmp/nemotron_*_benchmark.json log "=== Downloads complete ===" exit 0 fi @@ -161,7 +180,7 @@ if ! verify_assets; then fi log "All assets verified locally." -log "=== Parakeet benchmark suite: $MAX_FILES files x 4 benchmarks ===" +log "=== Parakeet benchmark suite: $MAX_FILES files x 6 benchmarks ===" log "Results directory: $RESULTS_DIR" cd "$PROJECT_DIR" @@ -205,11 +224,11 @@ run_ctc_earnings_benchmark() { local label="ctc_earnings_vocab" local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" - log "--- $label: starting ($MAX_FILES files, tdt-ctc-110m + CTC keyword spotting) ---" + log "--- $label: starting ($MAX_FILES files, v2 TDT + CTC keyword spotting) ---" local start_time=$(date +%s) + # TDT v2 is used for transcription to match benchmarks100.md baseline "$CLI" ctc-earnings-benchmark \ - --tdt-version tdt-ctc-110m \ --ctc-variant 110m \ --max-files "$MAX_FILES" \ --output "$output_file" \ @@ -220,12 +239,56 @@ run_ctc_earnings_benchmark() { log "--- $label: finished in ${elapsed}s — $output_file ---" } +run_eou_benchmark() { + local label="eou_320ms" + local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" + + log "--- $label: starting ($MAX_FILES files, $SUBSET, 320ms chunks) ---" + local start_time=$(date +%s) + + "$CLI" parakeet-eou \ + --benchmark \ + --chunk-size 320 \ + --max-files "$MAX_FILES" \ + --use-cache \ + --output "$output_file" \ + 2>&1 | tee -a "$LOG_FILE" + + local end_time=$(date +%s) + local elapsed=$(( end_time - start_time )) + log "--- $label: finished in ${elapsed}s — $output_file ---" +} + +run_nemotron_benchmark() { + local label="nemotron_1120ms" + local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" + + log "--- $label: starting ($MAX_FILES files, $SUBSET, 1120ms chunks) ---" + local start_time=$(date +%s) + + "$CLI" nemotron-benchmark \ + --max-files "$MAX_FILES" \ + 2>&1 | tee -a "$LOG_FILE" + + # Nemotron writes to /tmp; copy to our results dir + local tmp_file="/tmp/nemotron_1120ms_benchmark.json" + if [[ -f "$tmp_file" ]]; then + cp "$tmp_file" "$output_file" + fi + + local end_time=$(date +%s) + local elapsed=$(( end_time - start_time )) + log "--- $label: finished in ${elapsed}s — $output_file ---" +} + SUITE_START=$(date +%s) run_asr_benchmark "v3" "parakeet_v3" run_asr_benchmark "v2" "parakeet_v2" run_asr_benchmark "tdt-ctc-110m" "parakeet_tdt_ctc_110m" run_ctc_earnings_benchmark +run_eou_benchmark +run_nemotron_benchmark SUITE_END=$(date +%s) SUITE_ELAPSED=$(( SUITE_END - SUITE_START )) @@ -234,4 +297,102 @@ log "=== All benchmarks complete in ${SUITE_ELAPSED}s ===" log "Results:" ls -lh "$RESULTS_DIR"/*_${TIMESTAMP}.json 2>/dev/null | tee -a "$LOG_FILE" +# --------------------------------------------------------------------------- +# Compare WER against benchmarks100.md baselines +# --------------------------------------------------------------------------- +# Baselines from Documentation/ASR/benchmarks100.md (main column) +BASELINE_V3_WER="2.6" +BASELINE_V2_WER="3.8" +BASELINE_TDT_CTC_WER="3.6" +BASELINE_EARNINGS_WER="16.54" +BASELINE_EOU_WER="7.11" +BASELINE_NEMOTRON_WER="1.99" + +extract_wer() { + local json_file="$1" + local field="$2" + if [[ -f "$json_file" ]]; then + python3 -c "import json,sys; d=json.load(open('$json_file')); print(round(d['summary']['$field']*100, 2))" 2>/dev/null || echo "N/A" + else + echo "N/A" + fi +} + +# For JSON fields that already store WER as a percentage (not decimal) +extract_wer_pct() { + local json_file="$1" + local section="$2" + local field="$3" + if [[ -f "$json_file" ]]; then + if [[ -n "$section" ]]; then + python3 -c "import json; d=json.load(open('$json_file')); print(round(d['$section']['$field'], 2))" 2>/dev/null || echo "N/A" + else + python3 -c "import json; d=json.load(open('$json_file')); print(round(d['$field'], 2))" 2>/dev/null || echo "N/A" + fi + else + echo "N/A" + fi +} + +V3_FILE="$RESULTS_DIR/parakeet_v3_${TIMESTAMP}.json" +V2_FILE="$RESULTS_DIR/parakeet_v2_${TIMESTAMP}.json" +TDT_CTC_FILE="$RESULTS_DIR/parakeet_tdt_ctc_110m_${TIMESTAMP}.json" +EARNINGS_FILE="$RESULTS_DIR/ctc_earnings_vocab_${TIMESTAMP}.json" +EOU_FILE="$RESULTS_DIR/eou_320ms_${TIMESTAMP}.json" +NEMOTRON_FILE="$RESULTS_DIR/nemotron_1120ms_${TIMESTAMP}.json" + +V3_WER=$(extract_wer "$V3_FILE" "averageWER") +V2_WER=$(extract_wer "$V2_FILE" "averageWER") +TDT_CTC_WER=$(extract_wer "$TDT_CTC_FILE" "averageWER") +EARNINGS_WER=$(extract_wer_pct "$EARNINGS_FILE" "summary" "avgWer") +EOU_WER=$(extract_wer "$EOU_FILE" "averageWER") +NEMOTRON_WER=$(extract_wer_pct "$NEMOTRON_FILE" "" "wer") + +log "" +log "=== WER Comparison vs benchmarks100.md baselines ===" +log "" +printf "%-25s %10s %10s %10s\n" "Model" "Baseline" "Current" "Delta" | tee -a "$LOG_FILE" +printf "%-25s %10s %10s %10s\n" "-------------------------" "----------" "----------" "----------" | tee -a "$LOG_FILE" + +compare_wer() { + local label="$1" baseline="$2" current="$3" + if [[ "$current" == "N/A" ]]; then + printf "%-25s %9s%% %10s %10s\n" "$label" "$baseline" "N/A" "—" | tee -a "$LOG_FILE" + return + fi + local delta + delta=$(python3 -c "print(f'{$current - $baseline:+.2f}')" 2>/dev/null || echo "?") + local marker="" + local regression + regression=$(python3 -c "print('YES' if $current > $baseline + 0.3 else 'NO')" 2>/dev/null || echo "NO") + if [[ "$regression" == "YES" ]]; then + marker=" ← REGRESSION" + fi + printf "%-25s %9s%% %9s%% %9s%%%s\n" "$label" "$baseline" "$current" "$delta" "$marker" | tee -a "$LOG_FILE" +} + +compare_wer "Parakeet TDT v3 (0.6B)" "$BASELINE_V3_WER" "$V3_WER" +compare_wer "Parakeet TDT v2 (0.6B)" "$BASELINE_V2_WER" "$V2_WER" +compare_wer "CTC-TDT 110M" "$BASELINE_TDT_CTC_WER" "$TDT_CTC_WER" +compare_wer "CTC Earnings" "$BASELINE_EARNINGS_WER" "$EARNINGS_WER" +compare_wer "EOU 320ms (120M)" "$BASELINE_EOU_WER" "$EOU_WER" +compare_wer "Nemotron 1120ms (0.6B)" "$BASELINE_NEMOTRON_WER" "$NEMOTRON_WER" + +log "" + +# Check for any regressions (>0.3% WER increase) +ANY_REGRESSION=$(python3 -c " +baselines = [($BASELINE_V3_WER, '$V3_WER'), ($BASELINE_V2_WER, '$V2_WER'), ($BASELINE_TDT_CTC_WER, '$TDT_CTC_WER'), ($BASELINE_EARNINGS_WER, '$EARNINGS_WER'), ($BASELINE_EOU_WER, '$EOU_WER'), ($BASELINE_NEMOTRON_WER, '$NEMOTRON_WER')] +for b, c in baselines: + if c != 'N/A' and float(c) > b + 0.3: + print('YES'); exit() +print('NO') +" 2>/dev/null || echo "NO") + +if [[ "$ANY_REGRESSION" == "YES" ]]; then + log "⚠ WER REGRESSION DETECTED — investigate before merging" +else + log "✓ No WER regressions (all within 0.3% of baseline)" +fi + # caffeinate will exit automatically since the parent process ($$) exits diff --git a/Sources/FluidAudio/ModelNames.swift b/Sources/FluidAudio/ModelNames.swift index 0535d1a81..5227c7712 100644 --- a/Sources/FluidAudio/ModelNames.swift +++ b/Sources/FluidAudio/ModelNames.swift @@ -129,6 +129,10 @@ public enum Repo: String, CaseIterable { return "nemotron-streaming/560ms" case .sortformer: return "sortformer" + case .parakeetCtc110m: + return "parakeet-ctc-110m-coreml" + case .parakeetCtc06b: + return "parakeet-ctc-0.6b-coreml" case .parakeetTdtCtc110m: return "parakeet-tdt-ctc-110m" default: From bd5e4c132797c3caad447f3bab060da3d5dc63a5 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 23:11:26 -0400 Subject: [PATCH 13/16] Add diarizer benchmark script and AMI subset baseline results Adds Scripts/run_diarizer_benchmarks.sh that runs all 4 diarization systems (Offline VBx, Streaming 5s, Sortformer, LS-EEND) on a 4-meeting AMI SDM subset for regression testing. Adds Documentation/Diarization/BenchmarkAMISubset.md recording the baseline DER/RTFx for each system on EN2002a, ES2004a, IS1009a, TS3003a. --- .gitignore | 1 + .../Diarization/BenchmarkAMISubset.md | 147 ++++++ Scripts/run_diarizer_benchmarks.sh | 494 ++++++++++++++++++ 3 files changed, 642 insertions(+) create mode 100644 Documentation/Diarization/BenchmarkAMISubset.md create mode 100755 Scripts/run_diarizer_benchmarks.sh diff --git a/.gitignore b/.gitignore index 72d625aa5..5836d594d 100644 --- a/.gitignore +++ b/.gitignore @@ -103,6 +103,7 @@ Resources/ !Sources/FluidAudio/Resources/** scripts/ !Scripts/run_parakeet_benchmarks.sh +!Scripts/run_diarizer_benchmarks.sh Documentation/parakeet-tdt/ docs/parakeet-tdt/ diff --git a/Documentation/Diarization/BenchmarkAMISubset.md b/Documentation/Diarization/BenchmarkAMISubset.md new file mode 100644 index 000000000..19b99f040 --- /dev/null +++ b/Documentation/Diarization/BenchmarkAMISubset.md @@ -0,0 +1,147 @@ +# Diarization Benchmarks + +Hardware: 2024 MacBook Pro, 48GB RAM, M4 Pro, macOS Tahoe 26.0 + +Dataset: AMI SDM (Single Distant Microphone), 4-meeting subset — one session per speaker group for diversity. + +All results use collar=0.25s, ignoreOverlap=true. + +## Summary + +| System | Avg DER | Avg RTFx | Mode | +|---|---|---|---| +| LS-EEND (AMI) | 25.7% | 53.9x | Streaming | +| Offline VBx | 21.8% | 97.5x | Offline | +| Streaming 5s/0.8 | 29.9% | 96.2x | Streaming | +| Sortformer (high-lat) | 34.3% | 120.3x | Streaming | + +## Offline VBx + +Pyannote segmentation + WeSpeaker embeddings + PLDA scoring + VBx clustering. + +Default configuration: step ratio 0.2, minSegmentDurationSeconds 1.0, clustering threshold 0.7. + +```bash +Scripts/run_diarizer_benchmarks.sh +# or manually: +swift run -c release fluidaudiocli diarization-benchmark --mode offline \ + --dataset ami-sdm --auto-download +``` + +```text +---------------------------------------------------------------------- +Meeting DER % Miss % FA % SE % Speakers RTFx +---------------------------------------------------------------------- +ES2004a 14.5 7.6 1.7 5.2 5/4 98.2 +IS1009a 17.7 3.6 3.0 11.1 6/4 99.1 +TS3003a 21.2 11.7 1.4 8.1 2/4 98.4 +EN2002a 33.9 4.5 1.4 28.0 4/4 94.2 +---------------------------------------------------------------------- +AVERAGE 21.8 6.9 1.9 13.1 - 97.5 +====================================================================== +``` + +Full VoxConverse results (232 clips): 15.07% DER, 122x RTFx. See [Benchmarks.md](../Benchmarks.md) for details. + +## Streaming (5s chunks, 0.8 threshold) + +Pyannote segmentation + WeSpeaker embeddings + online SpeakerManager clustering. + +Best streaming configuration: 5s chunks, 0s overlap, 0.8 clustering threshold. + +```bash +Scripts/run_diarizer_benchmarks.sh +# or manually: +swift run -c release fluidaudiocli diarization-benchmark --mode streaming \ + --dataset ami-sdm --chunk-seconds 5.0 --overlap-seconds 0.0 \ + --threshold 0.8 --auto-download +``` + +```text +---------------------------------------------------------------------- +Meeting DER % Miss % FA % SE % Speakers RTFx +---------------------------------------------------------------------- +ES2004a 17.0 9.0 1.3 6.7 7/4 99.2 +IS1009a 18.1 4.7 2.7 10.8 4/4 101.0 +TS3003a 21.0 12.7 1.4 6.8 2/4 104.3 +EN2002a 63.4 9.2 1.1 53.0 7/4 80.1 +---------------------------------------------------------------------- +AVERAGE 29.9 8.9 1.6 19.3 - 96.2 +====================================================================== +``` + +Full 7-meeting results: 26.2% DER, 223x RTFx. See [Benchmarks.md](../Benchmarks.md) for details. + +EN2002a is a known difficult meeting for the streaming pipeline — aggressive speaker error (53%) due to over-fragmentation. + +## Sortformer (NVIDIA High-Latency) + +NVIDIA end-to-end Sortformer model, 30.4s chunk config. + +Model: [FluidInference/diar-streaming-sortformer-coreml](https://huggingface.co/FluidInference/diar-streaming-sortformer-coreml) + +```bash +Scripts/run_diarizer_benchmarks.sh +# or manually: +swift run -c release fluidaudiocli sortformer-benchmark \ + --nvidia-high-latency --hf --auto-download +``` + +```text +---------------------------------------------------------------------- +Meeting DER % Miss % FA % SE % Speakers RTFx +---------------------------------------------------------------------- +IS1009a 26.5 15.9 1.4 9.3 4/4 122.9 +ES2004a 33.4 24.5 0.1 8.8 4/4 117.9 +EN2002a 35.7 20.0 0.4 15.2 4/4 121.5 +TS3003a 41.8 36.8 0.7 4.3 4/4 119.0 +---------------------------------------------------------------------- +AVERAGE 34.3 24.3 0.7 9.4 - 120.3 +====================================================================== +``` + +Full 16-meeting results: 31.7% DER, 126.7x RTFx. See [Benchmarks.md](../Benchmarks.md) for details. + +## LS-EEND (AMI variant) + +Linear Streaming End-to-End Neural Diarization from Westlake University. + +Model: [GradientDescent2718/ls-eend-coreml](https://huggingface.co/GradientDescent2718/ls-eend-coreml) + +```bash +Scripts/run_diarizer_benchmarks.sh +# or manually: +swift run -c release fluidaudiocli lseend-benchmark \ + --variant ami --auto-download +``` + +```text +---------------------------------------------------------------------- +Meeting DER % Miss % FA % SE % Speakers RTFx +---------------------------------------------------------------------- +TS3003a 19.0 16.6 0.8 1.6 4/4 47.5 +IS1009a 23.4 8.0 2.6 12.8 4/4 57.7 +EN2002a 24.5 19.7 1.1 3.6 4/4 53.2 +ES2004a 35.8 13.3 19.2 3.2 4/4 57.2 +---------------------------------------------------------------------- +AVERAGE 25.7 14.4 5.9 5.3 - 53.9 +====================================================================== +``` + +Full 16-meeting results: 20.7% DER, 74.5x RTFx. See [Benchmarks.md](../Benchmarks.md) for details. + +## Reproducing + +Run all 4 systems on the default 4-meeting subset: + +```bash +./Scripts/run_diarizer_benchmarks.sh +``` + +Run on all 16 AMI meetings: + +```bash +./Scripts/run_diarizer_benchmarks.sh --all +``` + +Results are saved to `benchmark_results/` with timestamps. The script uses `caffeinate` to prevent sleep during long runs. diff --git a/Scripts/run_diarizer_benchmarks.sh b/Scripts/run_diarizer_benchmarks.sh new file mode 100755 index 000000000..ea582270c --- /dev/null +++ b/Scripts/run_diarizer_benchmarks.sh @@ -0,0 +1,494 @@ +#!/bin/bash +# Run all diarizer model benchmarks on AMI SDM with sleep prevention. +# +# Benchmarks: +# 1. Offline (VBx) — OfflineDiarizerManager, step=0.2, min-seg=1.0 +# 2. Streaming (5s) — DiarizerManager, 5s chunks, 0s overlap, threshold=0.8 +# 3. Sortformer — SortformerDiarizer, NVIDIA high-latency config +# 4. LS-EEND — LSEENDDiarizer, AMI variant +# +# Usage: +# ./Scripts/run_diarizer_benchmarks.sh # quick run (4 meetings) +# ./Scripts/run_diarizer_benchmarks.sh --all # full run (all 16 meetings) +# ./Scripts/run_diarizer_benchmarks.sh --max-files 8 # custom subset +# ./Scripts/run_diarizer_benchmarks.sh --download # download missing assets, then exit +# +# The script verifies all models and dataset files exist locally before running. +# If anything is missing it will tell you exactly what and exit (unless --download). +# Uses caffeinate to prevent sleep so you can close the lid. +# Results are saved to benchmark_results/ with timestamps. + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +RESULTS_DIR="$PROJECT_DIR/benchmark_results" +TIMESTAMP=$(date +"%Y%m%d_%H%M%S") +LOG_FILE="$RESULTS_DIR/diarizer_benchmark_${TIMESTAMP}.log" + +MODELS_DIR="$HOME/Library/Application Support/FluidAudio/Models" +DATASETS_DIR="$HOME/FluidAudioDatasets" +AMI_SDM_DIR="$DATASETS_DIR/ami_official/sdm" +AMI_RTTM_DIR="$DATASETS_DIR/ami_official/rttm" +MAX_FILES=4 # default: quick 4-meeting subset + +# AMI SDM has 16 meetings — this is the standard diarization test set. +# Ordered so the first N picks one from each speaker group for maximum diversity. +# Groups: EN2002 (4 speakers), ES2004 (4), IS1009 (4), TS3003 (4) +ALL_AMI_MEETINGS=( + EN2002a ES2004a IS1009a TS3003a + EN2002b ES2004b IS1009b TS3003b + EN2002c ES2004c IS1009c TS3003c + EN2002d ES2004d IS1009d TS3003d +) + +# Parse --all / --max-files from arguments +args=("$@") +for ((i=0; i<${#args[@]}; i++)); do + case "${args[$i]}" in + --all) MAX_FILES=${#ALL_AMI_MEETINGS[@]} ;; + --max-files) MAX_FILES="${args[$((i+1))]}" ; i=$((i+1)) ;; + esac +done + +# Select the subset of meetings to run +AMI_MEETINGS=("${ALL_AMI_MEETINGS[@]:0:$MAX_FILES}") + +mkdir -p "$RESULTS_DIR" + +log() { + echo "[$(date '+%H:%M:%S')] $*" | tee -a "$LOG_FILE" +} + +# --------------------------------------------------------------------------- +# Verify local assets +# --------------------------------------------------------------------------- +verify_assets() { + local missing=0 + + # --- AMI SDM audio files --- + local wav_count=0 + for meeting in "${AMI_MEETINGS[@]}"; do + if [[ -f "$AMI_SDM_DIR/${meeting}.Mix-Headset.wav" ]]; then + wav_count=$((wav_count + 1)) + else + log "MISSING AMI SDM: $AMI_SDM_DIR/${meeting}.Mix-Headset.wav" + missing=1 + fi + done + if [[ "$wav_count" -eq 0 ]]; then + log "MISSING AMI SDM: no wav files found in $AMI_SDM_DIR" + missing=1 + fi + + # --- AMI RTTM annotations (downloaded automatically by --auto-download) --- + local rttm_count=0 + for meeting in "${ALL_AMI_MEETINGS[@]}"; do + if [[ -f "$AMI_RTTM_DIR/${meeting}.rttm" ]]; then + rttm_count=$((rttm_count + 1)) + fi + done + if [[ "$rttm_count" -eq 0 ]]; then + log "NOTE AMI RTTM annotations not found — will be auto-downloaded by CLI" + fi + + # --- Offline diarizer models (pyannote segmentation + wespeaker embedding) --- + local diar_dir="$MODELS_DIR/speaker-diarization-coreml" + if [[ ! -d "$diar_dir" ]]; then + log "MISSING Diarizer models: $diar_dir" + missing=1 + fi + + # --- Sortformer models (folder may or may not have -coreml suffix) --- + if [[ ! -d "$MODELS_DIR/diar-streaming-sortformer-coreml" ]] && [[ ! -d "$MODELS_DIR/diar-streaming-sortformer" ]]; then + log "MISSING Sortformer models: $MODELS_DIR/diar-streaming-sortformer{,-coreml}" + missing=1 + fi + + # --- LS-EEND models (folder may or may not have -coreml suffix) --- + if [[ ! -d "$MODELS_DIR/ls-eend-coreml" ]] && [[ ! -d "$MODELS_DIR/ls-eend" ]]; then + log "MISSING LS-EEND models: $MODELS_DIR/ls-eend{,-coreml}" + missing=1 + fi + + return $missing +} + +# --------------------------------------------------------------------------- +# Phase 1: --download (verify first, download only what's missing) +# --------------------------------------------------------------------------- +if [[ "${1:-}" == "--download" ]]; then + log "=== Checking local assets ===" + + if verify_assets; then + log "All models and datasets already present locally. Nothing to download." + exit 0 + fi + + log "Some assets are missing — downloading..." + + log "Building release binary..." + cd "$PROJECT_DIR" && swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE" + CLI="$PROJECT_DIR/.build/release/fluidaudiocli" + + log "Downloading AMI SDM dataset + annotations..." + "$CLI" diarization-benchmark --mode offline --auto-download --max-files 1 \ + --output "$RESULTS_DIR/warmup_offline.json" 2>&1 | tee -a "$LOG_FILE" + + log "Pre-loading Sortformer models..." + "$CLI" sortformer-benchmark --nvidia-high-latency --hf --auto-download --max-files 1 \ + --output "$RESULTS_DIR/warmup_sortformer.json" 2>&1 | tee -a "$LOG_FILE" + + log "Pre-loading LS-EEND models..." + "$CLI" lseend-benchmark --variant ami --auto-download --max-files 1 \ + --output "$RESULTS_DIR/warmup_lseend.json" 2>&1 | tee -a "$LOG_FILE" + + rm -f "$RESULTS_DIR"/warmup_*.json + log "=== Downloads complete ===" + exit 0 +fi + +# --------------------------------------------------------------------------- +# Phase 2: Run benchmarks (offline-safe, sleep-prevented) +# --------------------------------------------------------------------------- +log "=== Verifying local assets before offline run ===" +if ! verify_assets; then + log "" + log "ERROR: Missing assets — cannot run offline." + log "Run with --download first while connected to the internet:" + log " ./Scripts/run_diarizer_benchmarks.sh --download" + exit 1 +fi +log "All assets verified locally." + +log "=== Diarizer benchmark suite: ${#AMI_MEETINGS[@]}/${#ALL_AMI_MEETINGS[@]} meetings x 4 systems ===" +log "Results directory: $RESULTS_DIR" + +cd "$PROJECT_DIR" + +# Build release if not already built +if [[ ! -x ".build/release/fluidaudiocli" ]]; then + log "Building release binary..." + swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE" +fi +CLI="$PROJECT_DIR/.build/release/fluidaudiocli" + +# caffeinate -s: prevent sleep even on AC power / lid closed +# caffeinate -i: prevent idle sleep +caffeinate -si -w $$ & +CAFFEINATE_PID=$! +log "caffeinate started (PID $CAFFEINATE_PID) — safe to close the lid" + +# --------------------------------------------------------------------------- +# Benchmark runners +# --------------------------------------------------------------------------- + +# Run a benchmark for each meeting via --single-file, then merge JSON results. +# This ensures we control exactly which meetings run (not the CLI's internal order). +merge_json_results() { + local output_file="$1" + shift + local tmp_files=("$@") + python3 -c " +import json, sys, glob +results = [] +for f in sys.argv[1:]: + try: + with open(f) as fh: + data = json.load(fh) + if isinstance(data, list): + results.extend(data) + else: + results.append(data) + except: pass +with open(sys.argv[1].rsplit('_tmp_', 1)[0] + '.json', 'w') as out: + json.dump(results, out, indent=2) +" "$@" 2>/dev/null + # Also write to the expected output path + python3 -c " +import json, sys +results = [] +for f in sys.argv[2:]: + try: + with open(f) as fh: + data = json.load(fh) + if isinstance(data, list): + results.extend(data) + else: + results.append(data) + except: pass +with open(sys.argv[1], 'w') as out: + json.dump(results, out, indent=2) +" "$output_file" "${tmp_files[@]}" 2>/dev/null + rm -f "${tmp_files[@]}" +} + +run_offline_benchmark() { + local label="offline_vbx" + local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" + local tmp_files=() + + log "--- $label: starting (${#AMI_MEETINGS[@]} meetings, AMI SDM, offline VBx) ---" + local start_time=$(date +%s) + + for meeting in "${AMI_MEETINGS[@]}"; do + local tmp="$RESULTS_DIR/${label}_tmp_${meeting}.json" + tmp_files+=("$tmp") + log " [$label] $meeting" + "$CLI" diarization-benchmark \ + --mode offline \ + --dataset ami-sdm \ + --single-file "$meeting" \ + --auto-download \ + --output "$tmp" \ + 2>&1 | tee -a "$LOG_FILE" + done + + merge_json_results "$output_file" "${tmp_files[@]}" + + local end_time=$(date +%s) + local elapsed=$(( end_time - start_time )) + log "--- $label: finished in ${elapsed}s — $output_file ---" +} + +run_streaming_benchmark() { + local label="streaming_5s" + local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" + local tmp_files=() + + log "--- $label: starting (${#AMI_MEETINGS[@]} meetings, AMI SDM, 5s chunks, threshold=0.8) ---" + local start_time=$(date +%s) + + for meeting in "${AMI_MEETINGS[@]}"; do + local tmp="$RESULTS_DIR/${label}_tmp_${meeting}.json" + tmp_files+=("$tmp") + log " [$label] $meeting" + "$CLI" diarization-benchmark \ + --mode streaming \ + --dataset ami-sdm \ + --single-file "$meeting" \ + --chunk-seconds 5.0 \ + --overlap-seconds 0.0 \ + --threshold 0.8 \ + --auto-download \ + --output "$tmp" \ + 2>&1 | tee -a "$LOG_FILE" + done + + merge_json_results "$output_file" "${tmp_files[@]}" + + local end_time=$(date +%s) + local elapsed=$(( end_time - start_time )) + log "--- $label: finished in ${elapsed}s — $output_file ---" +} + +run_sortformer_benchmark() { + local label="sortformer" + local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" + local tmp_files=() + + log "--- $label: starting (${#AMI_MEETINGS[@]} meetings, AMI SDM, NVIDIA high-latency) ---" + local start_time=$(date +%s) + + for meeting in "${AMI_MEETINGS[@]}"; do + local tmp="$RESULTS_DIR/${label}_tmp_${meeting}.json" + tmp_files+=("$tmp") + log " [$label] $meeting" + "$CLI" sortformer-benchmark \ + --nvidia-high-latency \ + --hf \ + --dataset ami \ + --single-file "$meeting" \ + --auto-download \ + --output "$tmp" \ + 2>&1 | tee -a "$LOG_FILE" + done + + merge_json_results "$output_file" "${tmp_files[@]}" + + local end_time=$(date +%s) + local elapsed=$(( end_time - start_time )) + log "--- $label: finished in ${elapsed}s — $output_file ---" +} + +run_lseend_benchmark() { + local label="lseend_ami" + local output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json" + local tmp_files=() + + log "--- $label: starting (${#AMI_MEETINGS[@]} meetings, AMI SDM, AMI variant) ---" + local start_time=$(date +%s) + + for meeting in "${AMI_MEETINGS[@]}"; do + local tmp="$RESULTS_DIR/${label}_tmp_${meeting}.json" + tmp_files+=("$tmp") + log " [$label] $meeting" + "$CLI" lseend-benchmark \ + --variant ami \ + --dataset ami \ + --single-file "$meeting" \ + --auto-download \ + --output "$tmp" \ + 2>&1 | tee -a "$LOG_FILE" + done + + merge_json_results "$output_file" "${tmp_files[@]}" + + local end_time=$(date +%s) + local elapsed=$(( end_time - start_time )) + log "--- $label: finished in ${elapsed}s — $output_file ---" +} + +# --------------------------------------------------------------------------- +# Run all 4 benchmarks +# --------------------------------------------------------------------------- +SUITE_START=$(date +%s) + +run_offline_benchmark +run_streaming_benchmark +run_sortformer_benchmark +run_lseend_benchmark + +SUITE_END=$(date +%s) +SUITE_ELAPSED=$(( SUITE_END - SUITE_START )) + +log "=== All benchmarks complete in ${SUITE_ELAPSED}s ===" +log "Results:" +ls -lh "$RESULTS_DIR"/*_${TIMESTAMP}.json 2>/dev/null | tee -a "$LOG_FILE" + +# --------------------------------------------------------------------------- +# Extract DER and RTFx from JSON results +# --------------------------------------------------------------------------- + +# Streaming diarization benchmark: JSON is array of per-meeting results with "der" and "rtfx" +extract_streaming_metrics() { + local json_file="$1" + if [[ -f "$json_file" ]]; then + python3 -c " +import json, sys +with open('$json_file') as f: + results = json.load(f) +if not results: + print('N/A N/A') + sys.exit() +avg_der = sum(r['der'] for r in results) / len(results) +avg_rtfx = sum(r['rtfx'] for r in results) / len(results) +print(f'{avg_der:.1f} {avg_rtfx:.1f}') +" 2>/dev/null || echo "N/A N/A" + else + echo "N/A N/A" + fi +} + +# Sortformer/LS-EEND: same JSON array format via DiarizationBenchmarkUtils +extract_shared_metrics() { + local json_file="$1" + if [[ -f "$json_file" ]]; then + python3 -c " +import json, sys +with open('$json_file') as f: + results = json.load(f) +if not results: + print('N/A N/A') + sys.exit() +avg_der = sum(r['der'] for r in results) / len(results) +avg_rtfx = sum(r['rtfx'] for r in results) / len(results) +print(f'{avg_der:.1f} {avg_rtfx:.1f}') +" 2>/dev/null || echo "N/A N/A" + else + echo "N/A N/A" + fi +} + +# --------------------------------------------------------------------------- +# Compare DER & RTFx against Benchmarks.md baselines +# --------------------------------------------------------------------------- + +# Baselines from Documentation/Benchmarks.md (AMI SDM, all 16 meetings) +# Note: when running a subset (--max-files <16), DER will differ from these baselines +# due to per-meeting variance. Baselines are for full 16-meeting runs only. +# Offline: no AMI SDM baseline yet — first --all run establishes it. +# Streaming: 5s/0s/0.8 on AMI SDM (7 meetings) = 26.2% DER, 223.1x RTFx +# Sortformer: NVIDIA high-latency on AMI SDM (16 meetings) = 31.7% DER, 126.7x RTFx +# LS-EEND: AMI variant on AMI SDM (16 meetings) = 20.7% DER, 74.5x RTFx +BASELINE_STREAMING_DER="26.2" +BASELINE_STREAMING_RTFX="223.1" +BASELINE_SORTFORMER_DER="31.7" +BASELINE_SORTFORMER_RTFX="126.7" +BASELINE_LSEEND_DER="20.7" +BASELINE_LSEEND_RTFX="74.5" + +OFFLINE_FILE="$RESULTS_DIR/offline_vbx_${TIMESTAMP}.json" +STREAMING_FILE="$RESULTS_DIR/streaming_5s_${TIMESTAMP}.json" +SORTFORMER_FILE="$RESULTS_DIR/sortformer_${TIMESTAMP}.json" +LSEEND_FILE="$RESULTS_DIR/lseend_ami_${TIMESTAMP}.json" + +read OFFLINE_DER OFFLINE_RTFX <<< $(extract_streaming_metrics "$OFFLINE_FILE") +read STREAMING_DER STREAMING_RTFX <<< $(extract_streaming_metrics "$STREAMING_FILE") +read SORTFORMER_DER SORTFORMER_RTFX <<< $(extract_shared_metrics "$SORTFORMER_FILE") +read LSEEND_DER LSEEND_RTFX <<< $(extract_shared_metrics "$LSEEND_FILE") + +log "" +log "=== DER & RTFx Comparison vs Benchmarks.md baselines (AMI SDM, ${#AMI_MEETINGS[@]} meetings) ===" +log "" +printf "%-25s %12s %12s %12s %12s %12s\n" \ + "System" "Base DER" "DER" "Delta" "Base RTFx" "RTFx" | tee -a "$LOG_FILE" +printf "%-25s %12s %12s %12s %12s %12s\n" \ + "-------------------------" "------------" "------------" "------------" "------------" "------------" | tee -a "$LOG_FILE" + +compare_der_rtfx() { + local label="$1" base_der="$2" current_der="$3" base_rtfx="$4" current_rtfx="$5" + + if [[ "$current_der" == "N/A" ]]; then + printf "%-25s %11s%% %12s %12s %11sx %12s\n" \ + "$label" "$base_der" "N/A" "—" "$base_rtfx" "N/A" | tee -a "$LOG_FILE" + return + fi + + local delta marker="" + delta=$(python3 -c "print(f'{$current_der - $base_der:+.1f}')" 2>/dev/null || echo "?") + local regression + regression=$(python3 -c "print('YES' if $current_der > $base_der + 2.0 else 'NO')" 2>/dev/null || echo "NO") + if [[ "$regression" == "YES" ]]; then + marker=" <- REGRESSION" + fi + + printf "%-25s %11s%% %11s%% %11s%% %11sx %11sx%s\n" \ + "$label" "$base_der" "$current_der" "$delta" "$base_rtfx" "$current_rtfx" "$marker" | tee -a "$LOG_FILE" +} + +# Offline has no AMI SDM baseline yet — show as "new" +if [[ "$OFFLINE_DER" != "N/A" ]]; then + printf "%-25s %12s %11s%% %12s %12s %11sx\n" \ + "Offline (VBx)" "—" "$OFFLINE_DER" "(new)" "—" "$OFFLINE_RTFX" | tee -a "$LOG_FILE" +else + printf "%-25s %12s %12s %12s %12s %12s\n" \ + "Offline (VBx)" "—" "N/A" "—" "—" "N/A" | tee -a "$LOG_FILE" +fi + +compare_der_rtfx "Streaming (5s/0.8)" "$BASELINE_STREAMING_DER" "$STREAMING_DER" "$BASELINE_STREAMING_RTFX" "$STREAMING_RTFX" +compare_der_rtfx "Sortformer (high-lat)" "$BASELINE_SORTFORMER_DER" "$SORTFORMER_DER" "$BASELINE_SORTFORMER_RTFX" "$SORTFORMER_RTFX" +compare_der_rtfx "LS-EEND (AMI)" "$BASELINE_LSEEND_DER" "$LSEEND_DER" "$BASELINE_LSEEND_RTFX" "$LSEEND_RTFX" + +log "" + +# Check for any DER regressions (>2.0% increase — diarization is noisier than ASR) +ANY_REGRESSION=$(python3 -c " +baselines = [ + ($BASELINE_STREAMING_DER, '$STREAMING_DER'), + ($BASELINE_SORTFORMER_DER, '$SORTFORMER_DER'), + ($BASELINE_LSEEND_DER, '$LSEEND_DER'), +] +for b, c in baselines: + if c != 'N/A' and float(c) > b + 2.0: + print('YES'); exit() +print('NO') +" 2>/dev/null || echo "NO") + +if [[ "$ANY_REGRESSION" == "YES" ]]; then + log "WARNING: DER REGRESSION DETECTED (>2.0% above baseline) — investigate before merging" +else + log "No DER regressions (all within 2.0% of baseline)" +fi + +# caffeinate will exit automatically since the parent process ($$) exits From ad70060880c39515b42549d68dc47fb044a88f6c Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 23:13:34 -0400 Subject: [PATCH 14/16] Rename benchmark scripts to subset to clarify scope MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_parakeet_benchmarks.sh → run_parakeet_subset.sh run_diarizer_benchmarks.sh → run_diarizer_subset.sh Updates all references in .gitignore, benchmarks100.md, and BenchmarkAMISubset.md. --- .gitignore | 4 ++-- Documentation/ASR/benchmarks100.md | 6 +++--- Documentation/Diarization/BenchmarkAMISubset.md | 12 ++++++------ ...diarizer_benchmarks.sh => run_diarizer_subset.sh} | 10 +++++----- ...parakeet_benchmarks.sh => run_parakeet_subset.sh} | 6 +++--- 5 files changed, 19 insertions(+), 19 deletions(-) rename Scripts/{run_diarizer_benchmarks.sh => run_diarizer_subset.sh} (97%) rename Scripts/{run_parakeet_benchmarks.sh => run_parakeet_subset.sh} (98%) diff --git a/.gitignore b/.gitignore index 5836d594d..9fd3f851e 100644 --- a/.gitignore +++ b/.gitignore @@ -102,8 +102,8 @@ Resources/ !Sources/FluidAudio/Resources/ !Sources/FluidAudio/Resources/** scripts/ -!Scripts/run_parakeet_benchmarks.sh -!Scripts/run_diarizer_benchmarks.sh +!Scripts/run_parakeet_subset.sh +!Scripts/run_diarizer_subset.sh Documentation/parakeet-tdt/ docs/parakeet-tdt/ diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index 9358752c8..08d1dcab9 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -4,14 +4,14 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru ## Reproduction -All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/run_parakeet_benchmarks.sh`](../../Scripts/run_parakeet_benchmarks.sh): +All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/run_parakeet_subset.sh`](../../Scripts/run_parakeet_subset.sh): ```bash # Download models and datasets (requires internet) -./Scripts/run_parakeet_benchmarks.sh --download +./Scripts/run_parakeet_subset.sh --download # Run all 4 benchmarks offline (100 files each, sleep-prevented) -./Scripts/run_parakeet_benchmarks.sh +./Scripts/run_parakeet_subset.sh ``` ## Environment diff --git a/Documentation/Diarization/BenchmarkAMISubset.md b/Documentation/Diarization/BenchmarkAMISubset.md index 19b99f040..ee022b185 100644 --- a/Documentation/Diarization/BenchmarkAMISubset.md +++ b/Documentation/Diarization/BenchmarkAMISubset.md @@ -22,7 +22,7 @@ Pyannote segmentation + WeSpeaker embeddings + PLDA scoring + VBx clustering. Default configuration: step ratio 0.2, minSegmentDurationSeconds 1.0, clustering threshold 0.7. ```bash -Scripts/run_diarizer_benchmarks.sh +Scripts/run_diarizer_subset.sh # or manually: swift run -c release fluidaudiocli diarization-benchmark --mode offline \ --dataset ami-sdm --auto-download @@ -50,7 +50,7 @@ Pyannote segmentation + WeSpeaker embeddings + online SpeakerManager clustering. Best streaming configuration: 5s chunks, 0s overlap, 0.8 clustering threshold. ```bash -Scripts/run_diarizer_benchmarks.sh +Scripts/run_diarizer_subset.sh # or manually: swift run -c release fluidaudiocli diarization-benchmark --mode streaming \ --dataset ami-sdm --chunk-seconds 5.0 --overlap-seconds 0.0 \ @@ -81,7 +81,7 @@ NVIDIA end-to-end Sortformer model, 30.4s chunk config. Model: [FluidInference/diar-streaming-sortformer-coreml](https://huggingface.co/FluidInference/diar-streaming-sortformer-coreml) ```bash -Scripts/run_diarizer_benchmarks.sh +Scripts/run_diarizer_subset.sh # or manually: swift run -c release fluidaudiocli sortformer-benchmark \ --nvidia-high-latency --hf --auto-download @@ -109,7 +109,7 @@ Linear Streaming End-to-End Neural Diarization from Westlake University. Model: [GradientDescent2718/ls-eend-coreml](https://huggingface.co/GradientDescent2718/ls-eend-coreml) ```bash -Scripts/run_diarizer_benchmarks.sh +Scripts/run_diarizer_subset.sh # or manually: swift run -c release fluidaudiocli lseend-benchmark \ --variant ami --auto-download @@ -135,13 +135,13 @@ Full 16-meeting results: 20.7% DER, 74.5x RTFx. See [Benchmarks.md](../Benchmark Run all 4 systems on the default 4-meeting subset: ```bash -./Scripts/run_diarizer_benchmarks.sh +./Scripts/run_diarizer_subset.sh ``` Run on all 16 AMI meetings: ```bash -./Scripts/run_diarizer_benchmarks.sh --all +./Scripts/run_diarizer_subset.sh --all ``` Results are saved to `benchmark_results/` with timestamps. The script uses `caffeinate` to prevent sleep during long runs. diff --git a/Scripts/run_diarizer_benchmarks.sh b/Scripts/run_diarizer_subset.sh similarity index 97% rename from Scripts/run_diarizer_benchmarks.sh rename to Scripts/run_diarizer_subset.sh index ea582270c..d86701952 100755 --- a/Scripts/run_diarizer_benchmarks.sh +++ b/Scripts/run_diarizer_subset.sh @@ -8,10 +8,10 @@ # 4. LS-EEND — LSEENDDiarizer, AMI variant # # Usage: -# ./Scripts/run_diarizer_benchmarks.sh # quick run (4 meetings) -# ./Scripts/run_diarizer_benchmarks.sh --all # full run (all 16 meetings) -# ./Scripts/run_diarizer_benchmarks.sh --max-files 8 # custom subset -# ./Scripts/run_diarizer_benchmarks.sh --download # download missing assets, then exit +# ./Scripts/run_diarizer_subset.sh # quick run (4 meetings) +# ./Scripts/run_diarizer_subset.sh --all # full run (all 16 meetings) +# ./Scripts/run_diarizer_subset.sh --max-files 8 # custom subset +# ./Scripts/run_diarizer_subset.sh --download # download missing assets, then exit # # The script verifies all models and dataset files exist locally before running. # If anything is missing it will tell you exactly what and exit (unless --download). @@ -156,7 +156,7 @@ if ! verify_assets; then log "" log "ERROR: Missing assets — cannot run offline." log "Run with --download first while connected to the internet:" - log " ./Scripts/run_diarizer_benchmarks.sh --download" + log " ./Scripts/run_diarizer_subset.sh --download" exit 1 fi log "All assets verified locally." diff --git a/Scripts/run_parakeet_benchmarks.sh b/Scripts/run_parakeet_subset.sh similarity index 98% rename from Scripts/run_parakeet_benchmarks.sh rename to Scripts/run_parakeet_subset.sh index a54b32fec..186e13e1d 100755 --- a/Scripts/run_parakeet_benchmarks.sh +++ b/Scripts/run_parakeet_subset.sh @@ -10,8 +10,8 @@ # 6. Nemotron streaming — nemotron 1120ms on LibriSpeech test-clean # # Usage: -# ./Scripts/run_parakeet_benchmarks.sh # verify + run -# ./Scripts/run_parakeet_benchmarks.sh --download # download missing assets, then exit +# ./Scripts/run_parakeet_subset.sh # verify + run +# ./Scripts/run_parakeet_subset.sh --download # download missing assets, then exit # # The script verifies all models and dataset files exist locally before running. # If anything is missing it will tell you exactly what and exit (unless --download). @@ -175,7 +175,7 @@ if ! verify_assets; then log "" log "ERROR: Missing assets — cannot run offline." log "Run with --download first while connected to the internet:" - log " ./Scripts/run_parakeet_benchmarks.sh --download" + log " ./Scripts/run_parakeet_subset.sh --download" exit 1 fi log "All assets verified locally." From 42a8c8b78d8e1f534a327a98f49b0d6b9b1b3d7a Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 23:14:43 -0400 Subject: [PATCH 15/16] Rename subset scripts: drop run_ prefix, add _benchmark suffix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit run_diarizer_subset.sh → diarizer_subset_benchmark.sh run_parakeet_subset.sh → parakeet_subset_benchmark.sh --- .gitignore | 4 ++-- Documentation/ASR/benchmarks100.md | 6 +++--- Documentation/Diarization/BenchmarkAMISubset.md | 12 ++++++------ ...arizer_subset.sh => diarizer_subset_benchmark.sh} | 10 +++++----- ...rakeet_subset.sh => parakeet_subset_benchmark.sh} | 6 +++--- 5 files changed, 19 insertions(+), 19 deletions(-) rename Scripts/{run_diarizer_subset.sh => diarizer_subset_benchmark.sh} (97%) rename Scripts/{run_parakeet_subset.sh => parakeet_subset_benchmark.sh} (98%) diff --git a/.gitignore b/.gitignore index 9fd3f851e..08103c740 100644 --- a/.gitignore +++ b/.gitignore @@ -102,8 +102,8 @@ Resources/ !Sources/FluidAudio/Resources/ !Sources/FluidAudio/Resources/** scripts/ -!Scripts/run_parakeet_subset.sh -!Scripts/run_diarizer_subset.sh +!Scripts/parakeet_subset_benchmark.sh +!Scripts/diarizer_subset_benchmark.sh Documentation/parakeet-tdt/ docs/parakeet-tdt/ diff --git a/Documentation/ASR/benchmarks100.md b/Documentation/ASR/benchmarks100.md index 08d1dcab9..1162d4a6a 100644 --- a/Documentation/ASR/benchmarks100.md +++ b/Documentation/ASR/benchmarks100.md @@ -4,14 +4,14 @@ Benchmark comparison between `main` and PR #440 (`standardize-asr-directory-stru ## Reproduction -All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/run_parakeet_subset.sh`](../../Scripts/run_parakeet_subset.sh): +All batch TDT and CTC earnings benchmarks can be reproduced with [`Scripts/parakeet_subset_benchmark.sh`](../../Scripts/parakeet_subset_benchmark.sh): ```bash # Download models and datasets (requires internet) -./Scripts/run_parakeet_subset.sh --download +./Scripts/parakeet_subset_benchmark.sh --download # Run all 4 benchmarks offline (100 files each, sleep-prevented) -./Scripts/run_parakeet_subset.sh +./Scripts/parakeet_subset_benchmark.sh ``` ## Environment diff --git a/Documentation/Diarization/BenchmarkAMISubset.md b/Documentation/Diarization/BenchmarkAMISubset.md index ee022b185..b8f698bee 100644 --- a/Documentation/Diarization/BenchmarkAMISubset.md +++ b/Documentation/Diarization/BenchmarkAMISubset.md @@ -22,7 +22,7 @@ Pyannote segmentation + WeSpeaker embeddings + PLDA scoring + VBx clustering. Default configuration: step ratio 0.2, minSegmentDurationSeconds 1.0, clustering threshold 0.7. ```bash -Scripts/run_diarizer_subset.sh +Scripts/diarizer_subset_benchmark.sh # or manually: swift run -c release fluidaudiocli diarization-benchmark --mode offline \ --dataset ami-sdm --auto-download @@ -50,7 +50,7 @@ Pyannote segmentation + WeSpeaker embeddings + online SpeakerManager clustering. Best streaming configuration: 5s chunks, 0s overlap, 0.8 clustering threshold. ```bash -Scripts/run_diarizer_subset.sh +Scripts/diarizer_subset_benchmark.sh # or manually: swift run -c release fluidaudiocli diarization-benchmark --mode streaming \ --dataset ami-sdm --chunk-seconds 5.0 --overlap-seconds 0.0 \ @@ -81,7 +81,7 @@ NVIDIA end-to-end Sortformer model, 30.4s chunk config. Model: [FluidInference/diar-streaming-sortformer-coreml](https://huggingface.co/FluidInference/diar-streaming-sortformer-coreml) ```bash -Scripts/run_diarizer_subset.sh +Scripts/diarizer_subset_benchmark.sh # or manually: swift run -c release fluidaudiocli sortformer-benchmark \ --nvidia-high-latency --hf --auto-download @@ -109,7 +109,7 @@ Linear Streaming End-to-End Neural Diarization from Westlake University. Model: [GradientDescent2718/ls-eend-coreml](https://huggingface.co/GradientDescent2718/ls-eend-coreml) ```bash -Scripts/run_diarizer_subset.sh +Scripts/diarizer_subset_benchmark.sh # or manually: swift run -c release fluidaudiocli lseend-benchmark \ --variant ami --auto-download @@ -135,13 +135,13 @@ Full 16-meeting results: 20.7% DER, 74.5x RTFx. See [Benchmarks.md](../Benchmark Run all 4 systems on the default 4-meeting subset: ```bash -./Scripts/run_diarizer_subset.sh +./Scripts/diarizer_subset_benchmark.sh ``` Run on all 16 AMI meetings: ```bash -./Scripts/run_diarizer_subset.sh --all +./Scripts/diarizer_subset_benchmark.sh --all ``` Results are saved to `benchmark_results/` with timestamps. The script uses `caffeinate` to prevent sleep during long runs. diff --git a/Scripts/run_diarizer_subset.sh b/Scripts/diarizer_subset_benchmark.sh similarity index 97% rename from Scripts/run_diarizer_subset.sh rename to Scripts/diarizer_subset_benchmark.sh index d86701952..a273cd94b 100755 --- a/Scripts/run_diarizer_subset.sh +++ b/Scripts/diarizer_subset_benchmark.sh @@ -8,10 +8,10 @@ # 4. LS-EEND — LSEENDDiarizer, AMI variant # # Usage: -# ./Scripts/run_diarizer_subset.sh # quick run (4 meetings) -# ./Scripts/run_diarizer_subset.sh --all # full run (all 16 meetings) -# ./Scripts/run_diarizer_subset.sh --max-files 8 # custom subset -# ./Scripts/run_diarizer_subset.sh --download # download missing assets, then exit +# ./Scripts/diarizer_subset_benchmark.sh # quick run (4 meetings) +# ./Scripts/diarizer_subset_benchmark.sh --all # full run (all 16 meetings) +# ./Scripts/diarizer_subset_benchmark.sh --max-files 8 # custom subset +# ./Scripts/diarizer_subset_benchmark.sh --download # download missing assets, then exit # # The script verifies all models and dataset files exist locally before running. # If anything is missing it will tell you exactly what and exit (unless --download). @@ -156,7 +156,7 @@ if ! verify_assets; then log "" log "ERROR: Missing assets — cannot run offline." log "Run with --download first while connected to the internet:" - log " ./Scripts/run_diarizer_subset.sh --download" + log " ./Scripts/diarizer_subset_benchmark.sh --download" exit 1 fi log "All assets verified locally." diff --git a/Scripts/run_parakeet_subset.sh b/Scripts/parakeet_subset_benchmark.sh similarity index 98% rename from Scripts/run_parakeet_subset.sh rename to Scripts/parakeet_subset_benchmark.sh index 186e13e1d..504b333d0 100755 --- a/Scripts/run_parakeet_subset.sh +++ b/Scripts/parakeet_subset_benchmark.sh @@ -10,8 +10,8 @@ # 6. Nemotron streaming — nemotron 1120ms on LibriSpeech test-clean # # Usage: -# ./Scripts/run_parakeet_subset.sh # verify + run -# ./Scripts/run_parakeet_subset.sh --download # download missing assets, then exit +# ./Scripts/parakeet_subset_benchmark.sh # verify + run +# ./Scripts/parakeet_subset_benchmark.sh --download # download missing assets, then exit # # The script verifies all models and dataset files exist locally before running. # If anything is missing it will tell you exactly what and exit (unless --download). @@ -175,7 +175,7 @@ if ! verify_assets; then log "" log "ERROR: Missing assets — cannot run offline." log "Run with --download first while connected to the internet:" - log " ./Scripts/run_parakeet_subset.sh --download" + log " ./Scripts/parakeet_subset_benchmark.sh --download" exit 1 fi log "All assets verified locally." From a2eaf0e2ddbf30235562cabe5ab1ba3089bbef14 Mon Sep 17 00:00:00 2001 From: Alex-Wengg Date: Sat, 28 Mar 2026 23:26:07 -0400 Subject: [PATCH 16/16] Remove dead python invocation in merge_json_results The first python3 call wrote to an unintended timestamp-less path and imported unused glob. Keep only the second invocation that correctly writes to the output_file parameter. --- Scripts/diarizer_subset_benchmark.sh | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/Scripts/diarizer_subset_benchmark.sh b/Scripts/diarizer_subset_benchmark.sh index a273cd94b..cbb4fae39 100755 --- a/Scripts/diarizer_subset_benchmark.sh +++ b/Scripts/diarizer_subset_benchmark.sh @@ -190,22 +190,6 @@ merge_json_results() { shift local tmp_files=("$@") python3 -c " -import json, sys, glob -results = [] -for f in sys.argv[1:]: - try: - with open(f) as fh: - data = json.load(fh) - if isinstance(data, list): - results.extend(data) - else: - results.append(data) - except: pass -with open(sys.argv[1].rsplit('_tmp_', 1)[0] + '.json', 'w') as out: - json.dump(results, out, indent=2) -" "$@" 2>/dev/null - # Also write to the expected output path - python3 -c " import json, sys results = [] for f in sys.argv[2:]: