From 3b7fb7600c6ef466d5b8ee02b99dc9bf271d432f Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:39:55 -0800 Subject: [PATCH 01/24] Vectorized embedding quality calculation --- Sources/FluidAudio/Diarizer/Core/DiarizerManager.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/FluidAudio/Diarizer/Core/DiarizerManager.swift b/Sources/FluidAudio/Diarizer/Core/DiarizerManager.swift index 00d27c3ae..a671358d5 100644 --- a/Sources/FluidAudio/Diarizer/Core/DiarizerManager.swift +++ b/Sources/FluidAudio/Diarizer/Core/DiarizerManager.swift @@ -474,7 +474,7 @@ public final class DiarizerManager { } private func calculateEmbeddingQuality(_ embedding: [Float]) -> Float { - let magnitude = sqrt(embedding.map { $0 * $0 }.reduce(0, +)) + let magnitude = sqrt(vDSP.sumOfSquares(embedding)) return min(1.0, magnitude / 10.0) } From 80aa250a6bbc95273619b39ce940df88f0e51585 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:40:39 -0800 Subject: [PATCH 02/24] Added SpeakerInitializationMode enum to dictate behavior when initializing known speakers --- .../Diarizer/Clustering/SpeakerTypes.swift | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift index 77562d117..9145814d2 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift @@ -136,6 +136,8 @@ public final class Speaker: Identifiable, Codable, Equatable, Hashable { } /// Merge another speaker into this one + /// - Parameters: + /// - other: other Speaker to merge public func mergeWith(_ other: Speaker, keepName: String? = nil) { // Merge raw embeddings var allEmbeddings = rawEmbeddings + other.rawEmbeddings @@ -239,3 +241,14 @@ public struct SendableSpeaker: Sendable, Identifiable, Hashable { return lhs.id == rhs.id && lhs.name == rhs.name } } + +public enum SpeakerInitializationMode { + /// reset the speaker database and add the new speakers + case reset + /// merge new speakers whose IDs match with existing ones + case merge + /// overwrite existing speakers with the same IDs as the new ones + case overwrite + /// skip speakers whose IDs match existing ones + case skip +} From 8978171410062566f3f777defbea4ca079e19a4a Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Sun, 2 Nov 2025 19:52:30 -0800 Subject: [PATCH 03/24] Implemented a lot more features for finer control --- .../Diarizer/Clustering/SpeakerManager.swift | 193 +++++++++++++++++- 1 file changed, 183 insertions(+), 10 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index fccad9bae..97b3f48c3 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -4,7 +4,7 @@ import OSLog /// In-memory speaker database for streaming diarization /// Tracks speakers across chunks and maintains consistent IDs -public class SpeakerManager { +public class SpeakerManager { internal let logger = AppLogger(category: "SpeakerManager") // Constants @@ -34,8 +34,16 @@ public class SpeakerManager { self.minSpeechDuration = minSpeechDuration self.minEmbeddingUpdateDuration = minEmbeddingUpdateDuration } - - public func initializeKnownSpeakers(_ speakers: [Speaker]) { + + /// Add known speakers to the database + /// - Parameters: + /// - speakers: list of `Speaker`s to add + /// - mode: mode for handling overlapping ID conflicts. `.reset` will reset the speaker database before initializing new speakers. `.overwrite` will overwrite the old speakers and replace them with the new ones. `.merge` will merge the new speakers with the old ones, keeping the new name. `.skip` will skip new speakers if their ID matches an existing one. (Default: `.skip`) + public func initializeKnownSpeakers(_ speakers: [Speaker], mode: SpeakerInitializationMode = .skip) { + if mode == .reset { + self.reset() + } + queue.sync(flags: .barrier) { var maxNumericId = 0 @@ -45,8 +53,26 @@ public class SpeakerManager { "Skipping speaker \(speaker.id) - invalid embedding size: \(speaker.currentEmbedding.count)") continue } - - speakerDatabase[speaker.id] = speaker + + // Check if the speaker ID is uninitialized + if self.speakerDatabase[speaker.id] == nil { + speakerDatabase[speaker.id] = speaker + } else { + // Handle duplicate speaker + switch mode { + case .reset: + fallthrough + case .overwrite: + logger.warning("Speaker \(speaker.id) is already initialized. Overwriting old speaker.") + speakerDatabase[speaker.id] = speaker + case .merge: + logger.warning("Speaker \(speaker.id) is already initialized. Merging with old speaker.") + speakerDatabase[speaker.id]?.mergeWith(speaker, keepName: speaker.name) + case .skip: + logger.warning("Speaker \(speaker.id) is already initialized. Skipping new speaker.") + continue + } + } // Try to extract numeric ID if it's a pure number if let numericId = Int(speaker.id) { @@ -67,10 +93,20 @@ public class SpeakerManager { } } + /// Match the embedding to the closest existing speaker if sufficiently similar or create a new one if not. + /// - Parameters: + /// - embedding: 256D speaker embedding vector + /// - speechDuration: duration of the speech segment during which this speaker was active + /// - confidence: confidence in the embedding vector being correct + /// - speakerThreshold: the maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) + /// - newName: name to assign the speaker if a new one is created (default: `Speaker $id`) + /// - Returns: a `Speaker` object if a match was found or a new one was created. Returns `nil` if an error occured. public func assignSpeaker( _ embedding: [Float], speechDuration: Float, - confidence: Float = 1.0 + confidence: Float = 1.0, + speakerThreshold: Float? = nil, + newName: String? = nil ) -> Speaker? { guard !embedding.isEmpty && embedding.count == Self.embeddingSize else { logger.error("Invalid embedding size: \(embedding.count)") @@ -78,7 +114,8 @@ public class SpeakerManager { } let normalizedEmbedding = VDSPOperations.l2Normalize(embedding) - + let speakerThreshold = speakerThreshold ?? self.speakerThreshold + return queue.sync(flags: .barrier) { let (closestSpeaker, distance) = findClosestSpeaker(to: normalizedEmbedding) @@ -116,7 +153,134 @@ public class SpeakerManager { return nil } } - + + /// Find the closest existing speaker to an embedding, up to a maximum cosine distance of `speakerThreshold`. + /// - Parameters: + /// - embedding: 256D speaker embedding vector + /// - speakerThreshold: the maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) + /// - Returns: the ID of the match (if found) and the distance to that match. + public func findSpeaker(with embedding: [Float], speakerThreshold: Float? = nil) -> (speaker: String?, distance: Float) { + let (closestSpeakerId, minDistance) = findClosestSpeaker(to: embedding) + let speakerThreshold = speakerThreshold ?? self.speakerThreshold + if let closestSpeakerId, minDistance <= speakerThreshold { + return (closestSpeakerId, minDistance) + } + return (nil, .infinity) + } + + /// Find the closest existing speaker to an embedding, up to a maximum cosine distance of `speakerThreshold`. + /// - Parameters: + /// - embedding: 256D speaker embedding vector + /// - speakerThreshold: the maximum cosine distance between `embedding` and another speaker for them to be a match (default: `self.speakerThreshold`) + /// - Returns: a list of the `maxCount` nearest speakers and the distances to them from `embedding`, sorted by descending cosine distances. + public func findMatchingSpeakers(with embedding: [Float], speakerThreshold: Float? = nil) -> [(speaker: String, distance: Float)] { + var matches: [(speaker: String, distance: Float)] = [] + let speakerThreshold = speakerThreshold ?? self.speakerThreshold + + for (speakerId, speaker) in speakerDatabase { + let distance = cosineDistance(embedding, speaker.currentEmbedding) + if distance <= speakerThreshold { + matches.append( (speakerId, distance) ) + } + } + matches.sort { $0.distance < $1.distance } + return matches + } + + /// Creates a new speaker + /// - Parameters: + /// - embedding: 256D speaker embedding vector + /// - duration: the duration for which this speaker has been speaking + /// - name: the name to assign the speaker (default: `Speaker $id`) + /// - Returns: the ID of the new speaker, if successful. Returns `nil` if issues were encountered. + public func createNewSpeaker(embedding: [Float], + duration: Float, + name: String? = nil) -> String? { + guard embedding.count == 256 else { + logger.error("Invalid embedding length: \(embedding.count)") + return nil + } + + let minDistance = findDistanceToClosestSpeaker(to: embedding) + return self.createNewSpeaker( + embedding: embedding, + duration: duration, + distanceToClosest: minDistance, + name: name + ) + } + + /// Merge two speakers in the database. + /// - Parameters: + /// - sourceId: ID of the `Speaker` being merged + /// - destinationId: ID of the `Speaker` that absorbs the other one + /// - mergedName: new name for the merged speaker (uses `destination`'s name if not provided) + /// - Returns: `true` if merge was successful, `false` if not. + public func merge(speaker sourceId: String, into destinationId: String, mergedName: String? = nil) -> Bool { + + // don't merge a speaker into itself + guard sourceId != destinationId else { + return false + } + + // ensure both speakers exist + guard let speakerToMerge = speakerDatabase[sourceId], + let destinationSpeaker = speakerDatabase[destinationId] else { + return false + } + + // merge source into destination + destinationSpeaker.mergeWith(speakerToMerge, keepName: mergedName) + + // remove source speaker + speakerDatabase.removeValue(forKey: sourceId) + return true + } + + /// Remove a speaker from the database + /// - Parameters: + /// - speakerID: ID of the speaker being removed + public func remove(speaker speakerID: String) { + if let speaker = speakerDatabase.removeValue(forKey: speakerID) { + logger.info("Removing speaker: \(speakerID)") + } else { + logger.warning("Failed to remove speaker: \(speakerID) (Speaker not found)") + } + } + + /// Remove all speakers that were inactive since a given date + public func removeSpeakersInactive(since date: Date) { + for (speakerId, speaker) in speakerDatabase where speaker.updatedAt < date { + speakerDatabase.removeValue(forKey: speakerId) + logger.info("Removing speaker \(speakerId) due to inactivity") + } + } + + /// remove speakers that have been inactive for a given duration + public func removeSpeakersInactive(for durationInactive: TimeInterval) { + let date = Date().addingTimeInterval(-durationInactive) + self.removeSpeakersInactive(since: date) + } + + /// remove speakers that meet a certain predicate + public func removeAllSpeakers(where predicate: (Speaker) -> Bool) { + for (speakerId, speaker) in speakerDatabase where predicate(speaker) { + speakerDatabase.removeValue(forKey: speakerId) + logger.info("Removing speaker \(speakerId) based on predicate") + } + } + + /// find all speakers that meet a certain predicate + public func findAllSpeakers(where predicate: (Speaker) -> Bool) -> [String] { + return speakerDatabase.filter { predicate($0.value) }.map(\.key) + } + + private func findDistanceToClosestSpeaker(to embedding: [Float]) -> Float { + return speakerDatabase.values.reduce(Float.infinity) { + min($0, cosineDistance(embedding, $1.currentEmbedding)) + } + } + private func findClosestSpeaker(to embedding: [Float]) -> (speakerId: String?, distance: Float) { var minDistance: Float = Float.infinity var closestSpeakerId: String? @@ -167,17 +331,19 @@ public class SpeakerManager { private func createNewSpeaker( embedding: [Float], duration: Float, - distanceToClosest: Float + distanceToClosest: Float, + name: String? = nil ) -> String { let normalizedEmbedding = VDSPOperations.l2Normalize(embedding) let newSpeakerId = String(nextSpeakerId) + let newSpeakerName = name ?? "Speaker \(newSpeakerId)" // Default name with number if not provided nextSpeakerId += 1 highestSpeakerId = max(highestSpeakerId, nextSpeakerId - 1) // Create new Speaker object let newSpeaker = Speaker( id: newSpeakerId, - name: "Speaker \(newSpeakerId)", // Default name with number + name: newSpeakerName, currentEmbedding: normalizedEmbedding, duration: duration ) @@ -212,6 +378,13 @@ public class SpeakerManager { return speakerDatabase } } + + /// Get list of all speakers. + public func getSpeakerList() -> [Speaker] { + queue.sync { + return [Speaker](speakerDatabase.values) + } + } public func getSpeaker(for speakerId: String) -> Speaker? { queue.sync { speakerDatabase[speakerId] } From aa0f95e0e159c7f977bc8153e0d17422d09cecbe Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Wed, 5 Nov 2025 13:23:40 -0800 Subject: [PATCH 04/24] Added speaker permanence and merge detection --- .../Diarizer/Clustering/SpeakerManager.swift | 169 ++++++++++++++---- .../Diarizer/Clustering/SpeakerTypes.swift | 29 ++- 2 files changed, 166 insertions(+), 32 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index 97b3f48c3..4066d3e03 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -4,7 +4,7 @@ import OSLog /// In-memory speaker database for streaming diarization /// Tracks speakers across chunks and maintains consistent IDs -public class SpeakerManager { +public class SpeakerManager { internal let logger = AppLogger(category: "SpeakerManager") // Constants @@ -39,7 +39,8 @@ public class SpeakerManager { /// - Parameters: /// - speakers: list of `Speaker`s to add /// - mode: mode for handling overlapping ID conflicts. `.reset` will reset the speaker database before initializing new speakers. `.overwrite` will overwrite the old speakers and replace them with the new ones. `.merge` will merge the new speakers with the old ones, keeping the new name. `.skip` will skip new speakers if their ID matches an existing one. (Default: `.skip`) - public func initializeKnownSpeakers(_ speakers: [Speaker], mode: SpeakerInitializationMode = .skip) { + /// - preservePermanent: whether to avoid overwriting/merging pre-existing permanent speakers + public func initializeKnownSpeakers(_ speakers: [Speaker], mode: SpeakerInitializationMode = .skip, preservePermanent: Bool = true) { if mode == .reset { self.reset() } @@ -54,24 +55,28 @@ public class SpeakerManager { continue } - // Check if the speaker ID is uninitialized - if self.speakerDatabase[speaker.id] == nil { - speakerDatabase[speaker.id] = speaker - } else { + // Check if the speaker ID is already initialized + if let oldSpeaker = self.speakerDatabase[speaker.id] { // Handle duplicate speaker switch mode { case .reset: fallthrough case .overwrite: - logger.warning("Speaker \(speaker.id) is already initialized. Overwriting old speaker.") - speakerDatabase[speaker.id] = speaker + if !(oldSpeaker.isPermanent && preservePermanent) { + logger.warning("Speaker \(speaker.id) is already initialized. Overwriting old speaker.") + speakerDatabase[speaker.id] = speaker + } else { + logger.warning("Failed to overwrite Speaker \(speaker.id) because it is permanent. Skipping") + } case .merge: logger.warning("Speaker \(speaker.id) is already initialized. Merging with old speaker.") - speakerDatabase[speaker.id]?.mergeWith(speaker, keepName: speaker.name) + oldSpeaker.mergeWith(speaker, keepName: speaker.name) case .skip: logger.warning("Speaker \(speaker.id) is already initialized. Skipping new speaker.") continue } + } else { + speakerDatabase[speaker.id] = speaker } // Try to extract numeric ID if it's a pure number @@ -195,7 +200,8 @@ public class SpeakerManager { /// - Returns: the ID of the new speaker, if successful. Returns `nil` if issues were encountered. public func createNewSpeaker(embedding: [Float], duration: Float, - name: String? = nil) -> String? { + name: String? = nil, + isPermanent: Bool = false) -> String? { guard embedding.count == 256 else { logger.error("Invalid embedding length: \(embedding.count)") return nil @@ -206,7 +212,8 @@ public class SpeakerManager { embedding: embedding, duration: duration, distanceToClosest: minDistance, - name: name + name: name, + isPermanent: isPermanent ) } @@ -215,8 +222,9 @@ public class SpeakerManager { /// - sourceId: ID of the `Speaker` being merged /// - destinationId: ID of the `Speaker` that absorbs the other one /// - mergedName: new name for the merged speaker (uses `destination`'s name if not provided) + /// - stopIfPermanent: whether to stop merging if the source speaker is permanent /// - Returns: `true` if merge was successful, `false` if not. - public func merge(speaker sourceId: String, into destinationId: String, mergedName: String? = nil) -> Bool { + public func merge(speaker sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) -> Bool { // don't merge a speaker into itself guard sourceId != destinationId else { @@ -229,6 +237,11 @@ public class SpeakerManager { return false } + // don't merge permanent speakers into another one + guard !(stopIfPermanent && speakerToMerge.isPermanent) else { + return false + } + // merge source into destination destinationSpeaker.mergeWith(speakerToMerge, keepName: mergedName) @@ -240,7 +253,15 @@ public class SpeakerManager { /// Remove a speaker from the database /// - Parameters: /// - speakerID: ID of the speaker being removed - public func remove(speaker speakerID: String) { + /// - keepIfPermanent: whether to stop the removal if the speaker is marked as permanent + public func remove(speaker speakerID: String, keepIfPermanent: Bool = true) { + // determine if we should skip the removal due to permanence + if keepIfPermanent, let speaker = self.speakerDatabase[speakerID], speaker.isPermanent { + logger.warning("Failed to remove speaker: \(speakerID) (Speaker is permanent)") + return + } + + // attempt to remove the speaker if let speaker = speakerDatabase.removeValue(forKey: speakerID) { logger.info("Removing speaker: \(speakerID)") } else { @@ -248,33 +269,107 @@ public class SpeakerManager { } } - /// Remove all speakers that were inactive since a given date - public func removeSpeakersInactive(since date: Date) { - for (speakerId, speaker) in speakerDatabase where speaker.updatedAt < date { - speakerDatabase.removeValue(forKey: speakerId) - logger.info("Removing speaker \(speakerId) due to inactivity") + /// Remove all speakers that were inactive since a given `date` + /// - Parameters: + /// - data: Speakers who have not been active after this date will be removed. + /// - keepIfPermanent: whether to stop the removal if the speaker is marked as permanent + public func removeSpeakersInactive(since date: Date, keepIfPermanent: Bool = true) { + if keepIfPermanent { + // don't remove permanent speakers + for (speakerId, speaker) in speakerDatabase where speaker.updatedAt < date && !speaker.isPermanent { + speakerDatabase.removeValue(forKey: speakerId) + logger.info("Removing speaker \(speakerId) due to inactivity") + } + } else { + // remove all inactive speakers + for (speakerId, speaker) in speakerDatabase where speaker.updatedAt < date { + speakerDatabase.removeValue(forKey: speakerId) + logger.info("Removing speaker \(speakerId) due to inactivity") + } } } /// remove speakers that have been inactive for a given duration - public func removeSpeakersInactive(for durationInactive: TimeInterval) { + /// - Parameters: + /// - durationInactive: Minimum duration for which a speaker needs to be inactive to be removed + /// - keepIfPermanent: whether to stop the removal if the speaker is marked as permanent + public func removeSpeakersInactive(for durationInactive: TimeInterval, keepIfPermanent: Bool = true) { let date = Date().addingTimeInterval(-durationInactive) - self.removeSpeakersInactive(since: date) + self.removeSpeakersInactive(since: date, keepIfPermanent: keepIfPermanent) } /// remove speakers that meet a certain predicate - public func removeAllSpeakers(where predicate: (Speaker) -> Bool) { - for (speakerId, speaker) in speakerDatabase where predicate(speaker) { - speakerDatabase.removeValue(forKey: speakerId) - logger.info("Removing speaker \(speakerId) based on predicate") + /// - Parameters: + /// - predicate: the predicate to determine whether the speaker should be removed + /// - keepIfPermanent: whether to stop the removal if the speaker is marked as permanent + public func removeAllSpeakers(where predicate: (Speaker) -> Bool, keepIfPermanent: Bool = true) { + if keepIfPermanent { + // don't remove permanent speakers + for (speakerId, speaker) in speakerDatabase where predicate(speaker) && !speaker.isPermanent { + speakerDatabase.removeValue(forKey: speakerId) + logger.info("Removing speaker \(speakerId) based on predicate") + } + } else { + for (speakerId, speaker) in speakerDatabase where predicate(speaker) { + speakerDatabase.removeValue(forKey: speakerId) + logger.info("Removing speaker \(speakerId) based on predicate") + } } } /// find all speakers that meet a certain predicate + /// - Parameter predicate: the condition that the speakers must meet to be returned + /// - Returns: a list of all Speaker IDs corresponding to Speakers that meet the predicate public func findAllSpeakers(where predicate: (Speaker) -> Bool) -> [String] { return speakerDatabase.filter { predicate($0.value) }.map(\.key) } + /// Find all pairs of speakers that can be merged + /// - Parameters: + /// - speakerThreshold: the max cosine distance between speakers to let them be considered mergeable + /// - excludeIfBothPermanent: whether to exclude speaker pairs where both speakers are permanent + /// - Returns: a list of speaker ID pairs that belong to speakers that are similar enough to be merged + public func findMergeablePairs(speakerThreshold: Float? = nil, excludeIfBothPermanent: Bool = true) -> [(speakerToMerge: String, destination: String)] { + let speakerThreshold = speakerThreshold ?? self.speakerThreshold + var pairs: [(speakerToMerge: String, destination: String)] = [] + + for i in (0.. Float { return speakerDatabase.values.reduce(Float.infinity) { min($0, cosineDistance(embedding, $1.currentEmbedding)) @@ -332,7 +427,8 @@ public class SpeakerManager { embedding: [Float], duration: Float, distanceToClosest: Float, - name: String? = nil + name: String? = nil, + isPermanent: Bool = false ) -> String { let normalizedEmbedding = VDSPOperations.l2Normalize(embedding) let newSpeakerId = String(nextSpeakerId) @@ -345,7 +441,8 @@ public class SpeakerManager { id: newSpeakerId, name: newSpeakerName, currentEmbedding: normalizedEmbedding, - duration: duration + duration: duration, + isPermanent: isPermanent ) // Add initial raw embedding @@ -403,7 +500,7 @@ public class SpeakerManager { ) } - /// Upsert a speaker - update if exists, insert if new + /// Upsert a speaker - update if ID exists, insert if new /// /// - Parameters: /// - id: The speaker ID @@ -413,6 +510,7 @@ public class SpeakerManager { /// - updateCount: Number of updates to this speaker /// - createdAt: Creation timestamp /// - updatedAt: Last update timestamp + /// - isPermanent: whether the speaker should be protected from merges and removals by default public func upsertSpeaker( id: String, currentEmbedding: [Float], @@ -420,7 +518,8 @@ public class SpeakerManager { rawEmbeddings: [RawEmbedding] = [], updateCount: Int = 1, createdAt: Date? = nil, - updatedAt: Date? = nil + updatedAt: Date? = nil, + isPermanent: Bool = false ) { queue.sync(flags: .barrier) { let now = Date() @@ -432,6 +531,7 @@ public class SpeakerManager { existingSpeaker.rawEmbeddings = rawEmbeddings existingSpeaker.updateCount = updateCount existingSpeaker.updatedAt = updatedAt ?? now + existingSpeaker.isPermanent = isPermanent // Keep original createdAt and name speakerDatabase[id] = existingSpeaker @@ -444,7 +544,8 @@ public class SpeakerManager { currentEmbedding: currentEmbedding, duration: duration, createdAt: createdAt ?? now, - updatedAt: updatedAt ?? now + updatedAt: updatedAt ?? now, + isPermanent: isPermanent ) newSpeaker.rawEmbeddings = rawEmbeddings @@ -463,9 +564,15 @@ public class SpeakerManager { } } - public func reset() { + /// Reset the speaker database + /// - Parameter keepPermanent: whether to keep permanent speakers + public func reset(keepPermanent: Bool = false) { queue.sync(flags: .barrier) { - speakerDatabase.removeAll() + if !keepPermanent { + speakerDatabase.removeAll() + } else { + speakerDatabase = speakerDatabase.filter(\.value.isPermanent) + } nextSpeakerId = 1 highestSpeakerId = 0 logger.info("Speaker database reset") diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift index 9145814d2..36521d28f 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift @@ -4,22 +4,43 @@ import Foundation /// Speaker profile representation for tracking speakers across audio /// This represents a speaker's identity, not a specific segment public final class Speaker: Identifiable, Codable, Equatable, Hashable { + /// Speaker ID public let id: String + /// Speaker name public var name: String + /// Main embedding vector for this speaker's voice public var currentEmbedding: [Float] + /// Total speech duration for this speaker public var duration: Float = 0 + /// Date that this speaker object was created public var createdAt: Date + /// Date that this speaker object was last updated public var updatedAt: Date + /// Number of times the embedding vector was updated public var updateCount: Int = 1 + /// Array of raw embedding vectors public var rawEmbeddings: [RawEmbedding] = [] + /// Whether this speaker can be deleted due to inactivity or merging + public var isPermanent: Bool = false + /// - Parameters: + /// - id: Speaker ID + /// - name: Speaker name + /// - currentEmbedding: Main embedding vector for this speaker's voice + /// - duration: Total speech duration for this speaker + /// - createdAt: Date that this speaker object was last updated + /// - updatedAt: Number of times the embedding vector was updated + /// - updateCount: Array of raw embedding vectors + /// - rawEmbeddings: Array of raw embedding vectors + /// - isPermanent: Whether this speaker can be deleted due to inactivity or merging public init( id: String? = nil, name: String? = nil, currentEmbedding: [Float], duration: Float = 0, createdAt: Date? = nil, - updatedAt: Date? = nil + updatedAt: Date? = nil, + isPermanent: Bool = false ) { let now = Date() self.id = id ?? UUID().uuidString @@ -30,6 +51,7 @@ public final class Speaker: Identifiable, Codable, Equatable, Hashable { self.updatedAt = updatedAt ?? now self.updateCount = 1 self.rawEmbeddings = [] + self.isPermanent = isPermanent } /// Convert to SendableSpeaker format for cross-boundary usage. @@ -38,6 +60,9 @@ public final class Speaker: Identifiable, Codable, Equatable, Hashable { } /// Update main embedding with new segment data using exponential moving average + /// - Parameters: + /// - duration: segment duration + /// - public func updateMainEmbedding( duration: Float, embedding: [Float], @@ -138,6 +163,7 @@ public final class Speaker: Identifiable, Codable, Equatable, Hashable { /// Merge another speaker into this one /// - Parameters: /// - other: other Speaker to merge + /// - keepName: the resulting name after the merge public func mergeWith(_ other: Speaker, keepName: String? = nil) { // Merge raw embeddings var allEmbeddings = rawEmbeddings + other.rawEmbeddings @@ -252,3 +278,4 @@ public enum SpeakerInitializationMode { /// skip speakers whose IDs match existing ones case skip } + From ac05a6360d354d9dcf3ed65cb60b81db80a25939 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Wed, 5 Nov 2025 18:16:50 -0800 Subject: [PATCH 05/24] Updated Documentation --- Documentation/SpeakerManager.md | 162 ++++++++++++++++- .../Diarizer/Clustering/SpeakerManager.swift | 164 +++++++++--------- 2 files changed, 247 insertions(+), 79 deletions(-) diff --git a/Documentation/SpeakerManager.md b/Documentation/SpeakerManager.md index 8e8a9850b..2bb72a21f 100644 --- a/Documentation/SpeakerManager.md +++ b/Documentation/SpeakerManager.md @@ -1,3 +1,4 @@ + # SpeakerManager API Tracks and manages speaker identities across audio chunks for streaming diarization. @@ -73,6 +74,21 @@ let bob = Speaker(id: "bob", name: "Bob", currentEmbedding: bobEmbedding) speakerManager.initializeKnownSpeakers([alice, bob]) ``` +Sometimes, there are already speakers in the database that may have the same ID. +```swift +let alice = Speaker(id: "alice", name: "Alice", currentEmbedding: aliceEmbedding) +let bob = Speaker(id: "bob", name: "Bob", currentEmbedding: bobEmbedding) +speakerManager.initializeKnownSpeakers([alice, bob], mode: .overwrite, preservePermanent: false) // replace any speakers with ID "alice" or "bob" with the new speakers, even if the old ones were marked as permanent. +``` + +> The `mode` argument dictates how to handle redundant speakers. It is of type `SpeakerInitializationMode`, and can take on one of four values: +> - `.reset`: reset the speaker database and add the new speakers +> - `.merge`: merge new speakers whose IDs match with existing ones +> - `.overwrite`: overwrite existing speakers with the same IDs as the new ones +> - `.skip`: skip adding speakers whose IDs match existing ones +> +> The `preservePermanent` argument determines whether existing speakers marked as permanent should be preserved (i.e., not overwritten or merged). It is `true` by default. + **Use case:** When you have pre-recorded voice samples of known speakers and want to recognize them by name instead of numeric IDs. #### upsertSpeaker @@ -91,6 +107,7 @@ speakerManager.upsertSpeaker( updateCount: 5, // optional createdAt: Date(), // optional updatedAt: Date() // optional + isPermanent: false // optional ) ``` @@ -98,9 +115,130 @@ speakerManager.upsertSpeaker( - If speaker ID exists: updates the existing speaker's data - If speaker ID is new: inserts as a new speaker - Maintains ID uniqueness and tracks numeric IDs for auto-increment +- If `isPermanent` is true, then the new speaker or the existing speaker will become permanent. This means that the speaker will not be merged or removed without an override. + +#### mergeSpeaker +```swift +// merge speaker 1 into "alice" +speakerManager.mergeSpeaker("1", into: "alice") + +// merge speaker 2 into speaker 3 under the name "bob", regardless of whether speaker 2 is permanent. +speakerManager.mergeSpeaker("2", into: "3", mergedName: "Bob", stopIfPermanent: false) +``` + +**Behavior:** +- Unless `stopIfPermanent` is `false`, the merge will be stopped if the first speaker is permanent. +- Otherwise: Merges the first speaker into the destination speaker and removes the first speaker from the known speaker database. +- If `mergedName` is provided, the destination speaker will be renamed. Otherwise, its name will be preserved. + +> Note: the `mergedName` argument is optional. +> Note: `stopIfPermanent` is `true` by default. + +#### removeSpeaker +Remove a speaker from the database. + +```swift +// remove speaker 1 +speakerManager.removeSpeaker("1") + +// remove "alice" from the known speaker database, even if they are marked as permanent +speakerManager.removeSpeaker("alice", keepIfPermanent: false) +``` +> Note: `keepIfPermanent` is `true` by default. + +#### removeSpeakersInactive +Remove speaker that have been inactive since a certain date or for a certain duration. + +```swift +// remove speakers that have been inactive since `date` +speakerManager.removeSpeakersInactive(since: date) + +// remove speakers that have been active for 10 seconds, even if they were marked as permanent +speakerManager.removeSpeakersInactive(for: 10.0, keepIfPermanent: false) +``` + +> Note: Both versions of the method have an optional `keepIfPermanent` argument that defaults to `true`. + +#### removeAllSpeakers +Remove all speakers that match a given predicate. + +```swift +// remove all speakers with less than 5 seconds of speaking time +speakerManager.removeSpeakers( + where: { $0.duration < 5.0 }, + keepIfPermanent: false // also remove permanent speakers (optional) +) + +// Alternate syntax (does NOT remove permanent speakers) +speakerManager.removeSpeakers { + $0.duration < 5.0 +} +``` + +> Note: the predicate should take in a `Speaker` object and return a `Bool`. + +#### makeSpeakerPermanent +Make the speaker permanent. + +```swift +speakerManager.makeSpeakerPermanent("alice") // mark "alice" as permanent +``` + +#### revokePermanence +Make the speaker not permanent. + +```swift +speakerManager.revokePermanence(from: "alice") // mark "alice" as not permanent +``` ### Speaker Retrieval +#### findSpeaker +Find the best matching speaker to an embedding vector and the cosine distance to them, unless no match is found. + +```swift +let (id, distance) = speakerManager.findSpeaker(with: embedding) +``` +> Note: there is an optional `speakerThreshold` argument to use a threshold other than the default. + +#### findMatchingSpeakers +Find all speakers within the maximum `speakerThreshold` to an embedding vector. + +```swift +for speaker in speakerManager.findMatchingSpeakers(with: embedding) { + print("ID: \(speaker.id), Distance: \(speaker.distance)") +} +``` + +> Note: there is an optional `speakerThreshold` argument to use a threshold other than the default. + +#### findSpeakers +Find all speakers that meet a certain predicate. +```swift +// two ways to find all speakers with > 5.0s of speaking time. +speakerManager.findSpeakers(where: { $0.duration > 5.0 }) +speakerManager.findSpeakers{ $0.duration > 5.0 } +// Returns an array of IDs corresponding to speakers that meet the predicate. +``` + +> Note: the predicate should take in a `Speaker` object and return a `Bool`. + +#### findMergeablePairs +Find all pairs of speakers that might be the same person. Specifically, find the pairs of speakers such that the cosine distance between them is less than the `speakerThreshold`. + +Returns a list of pairs of speaker IDs. + +```swift +let pairs = speakerManager.findMergeablePairs( + speakerThreshold = 0.6, // optional + excludeIfBothPermanent = true // optional +) + +for pair in pairs { + print("Merge Speaker \(pair.speakerToMerge) into Speaker \(pair.destination)") +} +``` + #### getSpeaker Get a specific speaker by ID. @@ -118,6 +256,13 @@ let allSpeakers = speakerManager.getAllSpeakers() // Returns: [String: Speaker] - dictionary keyed by speaker ID ``` +#### getSpeakerList +Get all speakers in the database as an array of speakers (for testing/debugging) +```swift +let allSpeakers = speakerManager.getSpeakerList() +// Returns: [Speaker] - Array of speakers +``` + #### speakerCount Get the total number of tracked speakers. @@ -140,6 +285,7 @@ Clear all speakers from the database. ```swift speakerManager.reset() +speakerManager.reset(keepPermanent: true) // remove all non-permanent speakers from the database ``` Useful for: @@ -147,6 +293,8 @@ Useful for: - Freeing memory between recordings - Resetting speaker tracking + + ## Speaker Enrollment The `Speaker` class includes a `name` field for speaker enrollment workflows: @@ -237,6 +385,7 @@ public final class Speaker: Identifiable, Codable { public var updatedAt: Date // Last update timestamp public var updateCount: Int // Number of updates public var rawEmbeddings: [RawEmbedding] // Historical embeddings (max 50) + public var isPermanent: Bool // Permanence flag } ``` @@ -547,7 +696,17 @@ class RealtimeDiarizer { | Method | Returns | Description | |--------|---------|-------------| | `assignSpeaker(_:speechDuration:confidence:)` | `Speaker?` | Assign/create speaker from embedding | -| `initializeKnownSpeakers(_:)` | `Void` | Pre-load known speaker profiles | +| `initializeKnownSpeakers(_:mode:preservePermanent:)` | `Void` | Pre-load known speaker profiles | +| `findSpeaker(with:speakerThreshold:)` | `(id: String?, distance: Float)` | Find speaker that matches an embedding | +| `findMatchingSpeaker(with:speakerThreshold:)` | `[(id: String, distance: Float)]` | Find all speakers that match an embedding | +| `findSpeakers(where:)` | [String] | Find all speakers that meet a certain predicate +| findMergeablePairs(speakerThreshold:excludeIfBothPermanent:) | [(speakerToMerge: String, destination: String)] | Find all pairs of very similar speakers | +| `removeSpeaker(_:keepIfPermanent:)` | `Bool` | Remove a speaker from the database | +| `removeSpeakersInactive(since:keepIfPermanent:)` | `Bool` | Remove speakers inactive since a given date | +| `removeSpeakersInactive(for:keepIfPermanent:)` | `Bool` | Remove speakers inactive for a given duration | +| `removeSpeakers(where:)` | `Bool` | Remove speakers that satisfy a given predicate | +| `removeSpeakers(where:keepIfPermanent:)` | `Bool` | Remove speakers that satisfy a given predicate | +| `mergeSpeaker(_:into:mergedName:stopIfPermanent:)` | `Bool` | Merge a speaker into another one | | `upsertSpeaker(_:)` | `Void` | Update or insert speaker (from object) | | `upsertSpeaker(id:currentEmbedding:duration:...)` | `Void` | Update or insert speaker (from params) | | `getSpeaker(for:)` | `Speaker?` | Get speaker by ID | @@ -580,6 +739,7 @@ class RealtimeDiarizer { | `updatedAt` | `Date` | Last update timestamp | | `updateCount` | `Int` | Number of embedding updates | | `rawEmbeddings` | `[RawEmbedding]` | Historical embeddings (max 50) | +| `isPermanent` | `Bool` | Permanence flag | ### Speaker Methods diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index 57668caed..0fb169001 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -164,7 +164,7 @@ public class SpeakerManager { /// - embedding: 256D speaker embedding vector /// - speakerThreshold: the maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) /// - Returns: the ID of the match (if found) and the distance to that match. - public func findSpeaker(with embedding: [Float], speakerThreshold: Float? = nil) -> (speaker: String?, distance: Float) { + public func findSpeaker(with embedding: [Float], speakerThreshold: Float? = nil) -> (id: String?, distance: Float) { let (closestSpeakerId, minDistance) = findClosestSpeaker(to: embedding) let speakerThreshold = speakerThreshold ?? self.speakerThreshold if let closestSpeakerId, minDistance <= speakerThreshold { @@ -178,8 +178,8 @@ public class SpeakerManager { /// - embedding: 256D speaker embedding vector /// - speakerThreshold: the maximum cosine distance between `embedding` and another speaker for them to be a match (default: `self.speakerThreshold`) /// - Returns: a list of the `maxCount` nearest speakers and the distances to them from `embedding`, sorted by descending cosine distances. - public func findMatchingSpeakers(with embedding: [Float], speakerThreshold: Float? = nil) -> [(speaker: String, distance: Float)] { - var matches: [(speaker: String, distance: Float)] = [] + public func findMatchingSpeakers(with embedding: [Float], speakerThreshold: Float? = nil) -> [(id: String, distance: Float)] { + var matches: [(id: String, distance: Float)] = [] let speakerThreshold = speakerThreshold ?? self.speakerThreshold for (speakerId, speaker) in speakerDatabase { @@ -192,29 +192,37 @@ public class SpeakerManager { return matches } - /// Creates a new speaker - /// - Parameters: - /// - embedding: 256D speaker embedding vector - /// - duration: the duration for which this speaker has been speaking - /// - name: the name to assign the speaker (default: `Speaker $id`) - /// - Returns: the ID of the new speaker, if successful. Returns `nil` if issues were encountered. - public func createNewSpeaker(embedding: [Float], - duration: Float, - name: String? = nil, - isPermanent: Bool = false) -> String? { - guard embedding.count == 256 else { - logger.error("Invalid embedding length: \(embedding.count)") - return nil + /// find all speakers that meet a certain predicate + /// - Parameter predicate: the condition that the speakers must meet to be returned + /// - Returns: a list of all Speaker IDs corresponding to Speakers that meet the predicate + public func findSpeakers(where predicate: (Speaker) -> Bool) -> [String] { + return speakerDatabase.filter { predicate($0.value) }.map(\.key) + } + + /// Mark a speaker as permanent + /// - Parameter speakerId: the ID of the speaker to mark as permanent + /// - returns: `true` if the speaker is now permanent, `false` if not found. + public func makeSpeakerPermanent(_ speakerId: String) -> Bool { + if let speaker = speakerDatabase[speakerId] { + logger.info("Marking speaker \(speakerId) as permanent.") + speaker.isPermanent = true + return true } - - let minDistance = findDistanceToClosestSpeaker(to: embedding) - return self.createNewSpeaker( - embedding: embedding, - duration: duration, - distanceToClosest: minDistance, - name: name, - isPermanent: isPermanent - ) + logger.warning("Failed to mark speaker \(speakerId) as permanent (speaker not found).") + return false + } + + /// Remove a speaker's permanent marker + /// - Parameter speakerId: the ID of the speaker to mark as permanent + /// - returns: `true` if the speaker is no longer permanent, `false` if not found. + public func revokePermanence(from speakerId: String) -> Bool { + if let speaker = speakerDatabase[speakerId] { + logger.info("Revoking permanence from speaker \(speakerId).") + speaker.isPermanent = false + return true + } + logger.warning("Failed to revoke permanence from speaker \(speakerId) (speaker not found).") + return false } /// Merge two speakers in the database. @@ -224,7 +232,7 @@ public class SpeakerManager { /// - mergedName: new name for the merged speaker (uses `destination`'s name if not provided) /// - stopIfPermanent: whether to stop merging if the source speaker is permanent /// - Returns: `true` if merge was successful, `false` if not. - public func merge(speaker sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) -> Bool { + public func mergeSpeaker(_ sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) -> Bool { // don't merge a speaker into itself guard sourceId != destinationId else { @@ -250,11 +258,57 @@ public class SpeakerManager { return true } + /// Find all pairs of speakers that can be merged + /// - Parameters: + /// - speakerThreshold: the max cosine distance between speakers to let them be considered mergeable + /// - excludeIfBothPermanent: whether to exclude speaker pairs where both speakers are permanent + /// - Returns: a list of speaker ID pairs that belong to speakers that are similar enough to be merged + public func findMergeablePairs(speakerThreshold: Float? = nil, excludeIfBothPermanent: Bool = true) -> [(speakerToMerge: String, destination: String)] { + let speakerThreshold = speakerThreshold ?? self.speakerThreshold + var pairs: [(speakerToMerge: String, destination: String)] = [] + + for i in (0.. Bool, keepIfPermanent: Bool = true) { + public func removeSpeakers(where predicate: (Speaker) -> Bool, keepIfPermanent: Bool = true) { if keepIfPermanent { // don't remove permanent speakers for (speakerId, speaker) in speakerDatabase where predicate(speaker) && !speaker.isPermanent { @@ -317,57 +371,11 @@ public class SpeakerManager { } } - /// find all speakers that meet a certain predicate - /// - Parameter predicate: the condition that the speakers must meet to be returned - /// - Returns: a list of all Speaker IDs corresponding to Speakers that meet the predicate - public func findAllSpeakers(where predicate: (Speaker) -> Bool) -> [String] { - return speakerDatabase.filter { predicate($0.value) }.map(\.key) - } - - /// Find all pairs of speakers that can be merged + /// remove non-permanent speakers that meet a certain predicate /// - Parameters: - /// - speakerThreshold: the max cosine distance between speakers to let them be considered mergeable - /// - excludeIfBothPermanent: whether to exclude speaker pairs where both speakers are permanent - /// - Returns: a list of speaker ID pairs that belong to speakers that are similar enough to be merged - public func findMergeablePairs(speakerThreshold: Float? = nil, excludeIfBothPermanent: Bool = true) -> [(speakerToMerge: String, destination: String)] { - let speakerThreshold = speakerThreshold ?? self.speakerThreshold - var pairs: [(speakerToMerge: String, destination: String)] = [] - - for i in (0.. Bool) { + removeSpeakers(where: predicate, keepIfPermanent: true) } private func findDistanceToClosestSpeaker(to embedding: [Float]) -> Float { From a976c9cf2334a0630375327e9d02859bce2eb04c Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Wed, 5 Nov 2025 21:20:32 -0800 Subject: [PATCH 06/24] Improved Concurrency Safety & fixed typos in documentation --- Documentation/SpeakerManager.md | 50 +++- .../Diarizer/Clustering/SpeakerManager.swift | 264 ++++++++++-------- 2 files changed, 186 insertions(+), 128 deletions(-) diff --git a/Documentation/SpeakerManager.md b/Documentation/SpeakerManager.md index 2bb72a21f..9bee0379d 100644 --- a/Documentation/SpeakerManager.md +++ b/Documentation/SpeakerManager.md @@ -78,7 +78,7 @@ Sometimes, there are already speakers in the database that may have the same ID. ```swift let alice = Speaker(id: "alice", name: "Alice", currentEmbedding: aliceEmbedding) let bob = Speaker(id: "bob", name: "Bob", currentEmbedding: bobEmbedding) -speakerManager.initializeKnownSpeakers([alice, bob], mode: .overwrite, preservePermanent: false) // replace any speakers with ID "alice" or "bob" with the new speakers, even if the old ones were marked as permanent. +speakerManager.initializeKnownSpeakers([alice, bob], mode: .overwrite, preserveIfPermanent: false) // replace any speakers with ID "alice" or "bob" with the new speakers, even if the old ones were marked as permanent. ``` > The `mode` argument dictates how to handle redundant speakers. It is of type `SpeakerInitializationMode`, and can take on one of four values: @@ -87,7 +87,7 @@ speakerManager.initializeKnownSpeakers([alice, bob], mode: .overwrite, preserveP > - `.overwrite`: overwrite existing speakers with the same IDs as the new ones > - `.skip`: skip adding speakers whose IDs match existing ones > -> The `preservePermanent` argument determines whether existing speakers marked as permanent should be preserved (i.e., not overwritten or merged). It is `true` by default. +> The `preserveIfPermanent` argument determines whether existing speakers marked as permanent should be preserved (i.e., not overwritten or merged). It is `true` by default. **Use case:** When you have pre-recorded voice samples of known speakers and want to recognize them by name instead of numeric IDs. @@ -191,6 +191,13 @@ Make the speaker not permanent. speakerManager.revokePermanence(from: "alice") // mark "alice" as not permanent ``` +#### resetPermanentFlags +Mark all speakers as not permanent. + +```swift +speakerManager.resetPermanentFlags() +``` + ### Speaker Retrieval #### findSpeaker @@ -263,6 +270,15 @@ let allSpeakers = speakerManager.getSpeakerList() // Returns: [Speaker] - Array of speakers ``` +#### hasSpeaker +Check if the speaker database has a speaker with a given ID. + +```swift +if speakerManager.hasSpeaker("alice") { + print("Alice was found in the database") +} +``` + #### speakerCount Get the total number of tracked speakers. @@ -285,7 +301,7 @@ Clear all speakers from the database. ```swift speakerManager.reset() -speakerManager.reset(keepPermanent: true) // remove all non-permanent speakers from the database +speakerManager.reset(keepIfPermanent: true) // remove all non-permanent speakers from the database ``` Useful for: @@ -696,23 +712,25 @@ class RealtimeDiarizer { | Method | Returns | Description | |--------|---------|-------------| | `assignSpeaker(_:speechDuration:confidence:)` | `Speaker?` | Assign/create speaker from embedding | -| `initializeKnownSpeakers(_:mode:preservePermanent:)` | `Void` | Pre-load known speaker profiles | +| `initializeKnownSpeakers(_:mode:preserveIfPermanent:)` | `Void` | Pre-load known speaker profiles | | `findSpeaker(with:speakerThreshold:)` | `(id: String?, distance: Float)` | Find speaker that matches an embedding | -| `findMatchingSpeaker(with:speakerThreshold:)` | `[(id: String, distance: Float)]` | Find all speakers that match an embedding | -| `findSpeakers(where:)` | [String] | Find all speakers that meet a certain predicate -| findMergeablePairs(speakerThreshold:excludeIfBothPermanent:) | [(speakerToMerge: String, destination: String)] | Find all pairs of very similar speakers | -| `removeSpeaker(_:keepIfPermanent:)` | `Bool` | Remove a speaker from the database | -| `removeSpeakersInactive(since:keepIfPermanent:)` | `Bool` | Remove speakers inactive since a given date | -| `removeSpeakersInactive(for:keepIfPermanent:)` | `Bool` | Remove speakers inactive for a given duration | -| `removeSpeakers(where:)` | `Bool` | Remove speakers that satisfy a given predicate | -| `removeSpeakers(where:keepIfPermanent:)` | `Bool` | Remove speakers that satisfy a given predicate | -| `mergeSpeaker(_:into:mergedName:stopIfPermanent:)` | `Bool` | Merge a speaker into another one | +| `findMatchingSpeakers(with:speakerThreshold:)` | `[(id: String, distance: Float)]` | Find all speakers that match an embedding | +| `findSpeakers(where:)` | `[String]` | Find all speakers that meet a certain predicate +| `findMergeablePairs(speakerThreshold:excludeIfBothPermanent:)` | `[(speakerToMerge: String, destination: String)]` | Find all pairs of very similar speakers | +| `removeSpeaker(_:keepIfPermanent:)` | `Void` | Remove a speaker from the database | +| `removeSpeakersInactive(since:keepIfPermanent:)` | `Void` | Remove speakers inactive since a given date | +| `removeSpeakersInactive(for:keepIfPermanent:)` | `Void` | Remove speakers inactive for a given duration | +| `removeSpeakers(where:)` | `Void` | Remove speakers that satisfy a given predicate | +| `removeSpeakers(where:keepIfPermanent:)` | `Void` | Remove speakers that satisfy a given predicate | +| `mergeSpeaker(_:into:mergedName:stopIfPermanent:)` | `Void` | Merge a speaker into another one | | `upsertSpeaker(_:)` | `Void` | Update or insert speaker (from object) | | `upsertSpeaker(id:currentEmbedding:duration:...)` | `Void` | Update or insert speaker (from params) | | `getSpeaker(for:)` | `Speaker?` | Get speaker by ID | | `getAllSpeakers()` | `[String: Speaker]` | Get all speakers (debugging) | -| `reset()` | `Void` | Clear speaker database | -| `reassignSegment(segmentId:from:to:)` | `Bool` | Move segment between speakers | +| `getSpeakerList()` | `[Speaker]` | Get array of all speakers (debugging) | +| `hasSpeaker(_:)` | `Bool` | Check if database has a speaker with a given ID | +| `reset(keepIfPermanent:)` | `Void` | Clear speaker database | +| `resetPermanentFlag()` | `Void` | Mark all speakers as not permanent | | `getCurrentSpeakerNames()` | `[String]` | Get sorted speaker IDs | | `getGlobalSpeakerStats()` | `(Int, Float, Float, Int)` | Aggregate statistics | @@ -726,6 +744,7 @@ class RealtimeDiarizer { | `minEmbeddingUpdateDuration` | `Float` | Min duration to update embeddings (seconds) | | `speakerCount` | `Int` | Number of tracked speakers | | `speakerIds` | `[String]` | Sorted array of speaker IDs | +| `permanentSpeakerIds` | `[String]` | Sorted array of speaker IDs of permanent speakers | ### Speaker Properties @@ -762,6 +781,7 @@ class RealtimeDiarizer { | `averageEmbeddings(_:)` | `[Float]?` | Average multiple embeddings | | `createSpeaker(id:name:duration:embedding:config:)` | `Speaker?` | Create validated speaker | | `updateEmbedding(current:new:alpha:)` | `[Float]?` | EMA update (pure function) | +| `reassignSegment(segmentId:from:to:)` | `Bool` | Move segment between speakers | ## See Also diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index 0fb169001..09d3b82a1 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -40,9 +40,9 @@ public class SpeakerManager { /// - speakers: list of `Speaker`s to add /// - mode: mode for handling overlapping ID conflicts. `.reset` will reset the speaker database before initializing new speakers. `.overwrite` will overwrite the old speakers and replace them with the new ones. `.merge` will merge the new speakers with the old ones, keeping the new name. `.skip` will skip new speakers if their ID matches an existing one. (Default: `.skip`) /// - preservePermanent: whether to avoid overwriting/merging pre-existing permanent speakers - public func initializeKnownSpeakers(_ speakers: [Speaker], mode: SpeakerInitializationMode = .skip, preservePermanent: Bool = true) { + public func initializeKnownSpeakers(_ speakers: [Speaker], mode: SpeakerInitializationMode = .skip, preserveIfPermanent: Bool = true) { if mode == .reset { - self.reset(keepPermanent: preservePermanent) + self.reset(keepIfPermanent: preserveIfPermanent) } queue.sync(flags: .barrier) { @@ -165,12 +165,14 @@ public class SpeakerManager { /// - speakerThreshold: the maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) /// - Returns: the ID of the match (if found) and the distance to that match. public func findSpeaker(with embedding: [Float], speakerThreshold: Float? = nil) -> (id: String?, distance: Float) { - let (closestSpeakerId, minDistance) = findClosestSpeaker(to: embedding) - let speakerThreshold = speakerThreshold ?? self.speakerThreshold - if let closestSpeakerId, minDistance <= speakerThreshold { - return (closestSpeakerId, minDistance) + queue.sync { + let (closestSpeakerId, minDistance) = findClosestSpeaker(to: embedding) + let speakerThreshold = speakerThreshold ?? self.speakerThreshold + if let closestSpeakerId, minDistance <= speakerThreshold { + return (closestSpeakerId, minDistance) + } + return (nil, .infinity) } - return (nil, .infinity) } /// Find the closest existing speaker to an embedding, up to a maximum cosine distance of `speakerThreshold`. @@ -179,50 +181,55 @@ public class SpeakerManager { /// - speakerThreshold: the maximum cosine distance between `embedding` and another speaker for them to be a match (default: `self.speakerThreshold`) /// - Returns: a list of the `maxCount` nearest speakers and the distances to them from `embedding`, sorted by descending cosine distances. public func findMatchingSpeakers(with embedding: [Float], speakerThreshold: Float? = nil) -> [(id: String, distance: Float)] { - var matches: [(id: String, distance: Float)] = [] - let speakerThreshold = speakerThreshold ?? self.speakerThreshold - - for (speakerId, speaker) in speakerDatabase { - let distance = cosineDistance(embedding, speaker.currentEmbedding) - if distance <= speakerThreshold { - matches.append( (speakerId, distance) ) + queue.sync { + var matches: [(id: String, distance: Float)] = [] + let speakerThreshold = speakerThreshold ?? self.speakerThreshold + + for (speakerId, speaker) in speakerDatabase { + let distance = cosineDistance(embedding, speaker.currentEmbedding) + if distance <= speakerThreshold { + matches.append( (speakerId, distance) ) + } } + matches.sort { $0.distance < $1.distance } + return matches } - matches.sort { $0.distance < $1.distance } - return matches } /// find all speakers that meet a certain predicate /// - Parameter predicate: the condition that the speakers must meet to be returned /// - Returns: a list of all Speaker IDs corresponding to Speakers that meet the predicate public func findSpeakers(where predicate: (Speaker) -> Bool) -> [String] { - return speakerDatabase.filter { predicate($0.value) }.map(\.key) + queue.sync { + return speakerDatabase.filter { predicate($0.value) }.map(\.key) + } } /// Mark a speaker as permanent /// - Parameter speakerId: the ID of the speaker to mark as permanent - /// - returns: `true` if the speaker is now permanent, `false` if not found. - public func makeSpeakerPermanent(_ speakerId: String) -> Bool { - if let speaker = speakerDatabase[speakerId] { + public func makeSpeakerPermanent(_ speakerId: String) { + queue.sync(flags: .barrier) { + guard let speaker = speakerDatabase[speakerId] else { + logger.warning("Failed to mark speaker \(speakerId) as permanent (speaker not found).") + return + } logger.info("Marking speaker \(speakerId) as permanent.") speaker.isPermanent = true - return true } - logger.warning("Failed to mark speaker \(speakerId) as permanent (speaker not found).") - return false } /// Remove a speaker's permanent marker /// - Parameter speakerId: the ID of the speaker to mark as permanent - /// - returns: `true` if the speaker is no longer permanent, `false` if not found. - public func revokePermanence(from speakerId: String) -> Bool { - if let speaker = speakerDatabase[speakerId] { + public func revokePermanence(from speakerId: String) { + queue.sync(flags: .barrier) { + guard let speaker = speakerDatabase[speakerId] else { + logger.warning("Failed to revoke permanence from speaker \(speakerId) (speaker not found).") + return + } + logger.info("Revoking permanence from speaker \(speakerId).") speaker.isPermanent = false - return true } - logger.warning("Failed to revoke permanence from speaker \(speakerId) (speaker not found).") - return false } /// Merge two speakers in the database. @@ -232,30 +239,30 @@ public class SpeakerManager { /// - mergedName: new name for the merged speaker (uses `destination`'s name if not provided) /// - stopIfPermanent: whether to stop merging if the source speaker is permanent /// - Returns: `true` if merge was successful, `false` if not. - public func mergeSpeaker(_ sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) -> Bool { - + public func mergeSpeaker(_ sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) -> Void { // don't merge a speaker into itself guard sourceId != destinationId else { - return false - } - - // ensure both speakers exist - guard let speakerToMerge = speakerDatabase[sourceId], - let destinationSpeaker = speakerDatabase[destinationId] else { - return false + return } - // don't merge permanent speakers into another one - guard !(stopIfPermanent && speakerToMerge.isPermanent) else { - return false + queue.sync(flags: .barrier) { + // ensure both speakers exist + guard let speakerToMerge = speakerDatabase[sourceId], + let destinationSpeaker = speakerDatabase[destinationId] else { + return + } + + // don't merge permanent speakers into another one + guard !(stopIfPermanent && speakerToMerge.isPermanent) else { + return + } + + // merge source into destination + destinationSpeaker.mergeWith(speakerToMerge, keepName: mergedName) + + // remove source speaker + speakerDatabase.removeValue(forKey: sourceId) } - - // merge source into destination - destinationSpeaker.mergeWith(speakerToMerge, keepName: mergedName) - - // remove source speaker - speakerDatabase.removeValue(forKey: sourceId) - return true } /// Find all pairs of speakers that can be merged @@ -264,44 +271,47 @@ public class SpeakerManager { /// - excludeIfBothPermanent: whether to exclude speaker pairs where both speakers are permanent /// - Returns: a list of speaker ID pairs that belong to speakers that are similar enough to be merged public func findMergeablePairs(speakerThreshold: Float? = nil, excludeIfBothPermanent: Bool = true) -> [(speakerToMerge: String, destination: String)] { - let speakerThreshold = speakerThreshold ?? self.speakerThreshold - var pairs: [(speakerToMerge: String, destination: String)] = [] - - for i in (0.. Bool, keepIfPermanent: Bool = true) { - if keepIfPermanent { - // don't remove permanent speakers - for (speakerId, speaker) in speakerDatabase where predicate(speaker) && !speaker.isPermanent { - speakerDatabase.removeValue(forKey: speakerId) - logger.info("Removing speaker \(speakerId) based on predicate") - } - } else { - for (speakerId, speaker) in speakerDatabase where predicate(speaker) { - speakerDatabase.removeValue(forKey: speakerId) - logger.info("Removing speaker \(speakerId) based on predicate") + queue.sync(flags: .barrier) { + if keepIfPermanent { + // don't remove permanent speakers + for (speakerId, speaker) in speakerDatabase where predicate(speaker) && !speaker.isPermanent { + speakerDatabase.removeValue(forKey: speakerId) + logger.info("Removing speaker \(speakerId) based on predicate") + } + } else { + for (speakerId, speaker) in speakerDatabase where predicate(speaker) { + speakerDatabase.removeValue(forKey: speakerId) + logger.info("Removing speaker \(speakerId) based on predicate") + } } } } @@ -378,6 +394,15 @@ public class SpeakerManager { removeSpeakers(where: predicate, keepIfPermanent: true) } + /// Check if the speaker database has a speaker with a given ID. + /// - Parameter speakerId: the ID to check + /// - Returns: `true` if a speaker is found, `false` if not + public func hasSpeaker(_ speakerId: String) -> Bool { + queue.sync { + return speakerDatabase.keys.contains(speakerId) + } + } + private func findDistanceToClosestSpeaker(to embedding: [Float]) -> Float { return speakerDatabase.values.reduce(Float.infinity) { min($0, cosineDistance(embedding, $1.currentEmbedding)) @@ -476,6 +501,10 @@ public class SpeakerManager { public var speakerIds: [String] { queue.sync { Array(speakerDatabase.keys).sorted() } } + + public var permanentSpeakerIds: [String] { + queue.sync { Array(speakerDatabase.filter(\.value.isPermanent).keys).sorted() } + } /// Get all speakers (for testing/debugging). public func getAllSpeakers() -> [String: Speaker] { @@ -574,10 +603,10 @@ public class SpeakerManager { } /// Reset the speaker database - /// - Parameter keepPermanent: whether to keep permanent speakers - public func reset(keepPermanent: Bool = false) { + /// - Parameter keepIfPermanent: whether to keep permanent speakers + public func reset(keepIfPermanent: Bool = false) { queue.sync(flags: .barrier) { - if !keepPermanent { + if !keepIfPermanent { speakerDatabase.removeAll() } else { speakerDatabase = speakerDatabase.filter(\.value.isPermanent) @@ -587,4 +616,13 @@ public class SpeakerManager { logger.info("Speaker database reset") } } + + /// Mark all speakers as not permanent + public func resetPermanentFlags() { + queue.sync(flags: .barrier) { + speakerDatabase.forEach { + $0.value.isPermanent = false + } + } + } } From c00d1e92ddbf85e145256ab4514c0cef61c46729 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Wed, 5 Nov 2025 21:37:42 -0800 Subject: [PATCH 07/24] Fixed capitalization in docstrings --- .../Diarizer/Clustering/SpeakerManager.swift | 77 ++++++++++--------- .../Diarizer/Clustering/SpeakerTypes.swift | 21 ++--- 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index 09d3b82a1..508c26b3f 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -37,9 +37,9 @@ public class SpeakerManager { /// Add known speakers to the database /// - Parameters: - /// - speakers: list of `Speaker`s to add - /// - mode: mode for handling overlapping ID conflicts. `.reset` will reset the speaker database before initializing new speakers. `.overwrite` will overwrite the old speakers and replace them with the new ones. `.merge` will merge the new speakers with the old ones, keeping the new name. `.skip` will skip new speakers if their ID matches an existing one. (Default: `.skip`) - /// - preservePermanent: whether to avoid overwriting/merging pre-existing permanent speakers + /// - speakers: Array of `Speaker`s to add + /// - mode: Mode for handling overlapping ID conflicts. `.reset` will reset the speaker database before initializing new speakers. `.overwrite` will overwrite the old speakers and replace them with the new ones. `.merge` Will merge the new speakers with the old ones, keeping the new name. `.skip` will skip new speakers if their ID matches an existing one. (Default: `.skip`) + /// - preservePermanent: Whether to avoid overwriting/merging pre-existing permanent speakers public func initializeKnownSpeakers(_ speakers: [Speaker], mode: SpeakerInitializationMode = .skip, preserveIfPermanent: Bool = true) { if mode == .reset { self.reset(keepIfPermanent: preserveIfPermanent) @@ -62,7 +62,7 @@ public class SpeakerManager { case .reset: fallthrough case .overwrite: - if !(oldSpeaker.isPermanent && preservePermanent) { + if !(oldSpeaker.isPermanent && preserveIfPermanent) { logger.warning("Speaker \(speaker.id) is already initialized. Overwriting old speaker.") speakerDatabase[speaker.id] = speaker } else { @@ -101,11 +101,11 @@ public class SpeakerManager { /// Match the embedding to the closest existing speaker if sufficiently similar or create a new one if not. /// - Parameters: /// - embedding: 256D speaker embedding vector - /// - speechDuration: duration of the speech segment during which this speaker was active - /// - confidence: confidence in the embedding vector being correct - /// - speakerThreshold: the maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) - /// - newName: name to assign the speaker if a new one is created (default: `Speaker $id`) - /// - Returns: a `Speaker` object if a match was found or a new one was created. Returns `nil` if an error occured. + /// - speechDuration: Duration of the speech segment during which this speaker was active + /// - confidence: Confidence in the embedding vector being correct + /// - speakerThreshold: The maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) + /// - newName: Name to assign the speaker if a new one is created (default: `Speaker $id`) + /// - Returns: A `Speaker` object if a match was found or a new one was created. Returns `nil` if an error occured. public func assignSpeaker( _ embedding: [Float], speechDuration: Float, @@ -162,8 +162,8 @@ public class SpeakerManager { /// Find the closest existing speaker to an embedding, up to a maximum cosine distance of `speakerThreshold`. /// - Parameters: /// - embedding: 256D speaker embedding vector - /// - speakerThreshold: the maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) - /// - Returns: the ID of the match (if found) and the distance to that match. + /// - speakerThreshold: Maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) + /// - Returns: ID of the match (if found) and the distance to that match. public func findSpeaker(with embedding: [Float], speakerThreshold: Float? = nil) -> (id: String?, distance: Float) { queue.sync { let (closestSpeakerId, minDistance) = findClosestSpeaker(to: embedding) @@ -178,8 +178,8 @@ public class SpeakerManager { /// Find the closest existing speaker to an embedding, up to a maximum cosine distance of `speakerThreshold`. /// - Parameters: /// - embedding: 256D speaker embedding vector - /// - speakerThreshold: the maximum cosine distance between `embedding` and another speaker for them to be a match (default: `self.speakerThreshold`) - /// - Returns: a list of the `maxCount` nearest speakers and the distances to them from `embedding`, sorted by descending cosine distances. + /// - speakerThreshold: Maximum cosine distance between `embedding` and another speaker for them to be a match (default: `self.speakerThreshold`) + /// - Returns: Array of the `maxCount` nearest speakers and the distances to them from `embedding`, sorted by descending cosine distances. public func findMatchingSpeakers(with embedding: [Float], speakerThreshold: Float? = nil) -> [(id: String, distance: Float)] { queue.sync { var matches: [(id: String, distance: Float)] = [] @@ -196,9 +196,9 @@ public class SpeakerManager { } } - /// find all speakers that meet a certain predicate - /// - Parameter predicate: the condition that the speakers must meet to be returned - /// - Returns: a list of all Speaker IDs corresponding to Speakers that meet the predicate + /// Find all speakers that meet a certain predicate + /// - Parameter predicate: Condition the speakers must meet to be returned + /// - Returns: A list of all Speaker IDs corresponding to Speakers that meet the predicate public func findSpeakers(where predicate: (Speaker) -> Bool) -> [String] { queue.sync { return speakerDatabase.filter { predicate($0.value) }.map(\.key) @@ -206,7 +206,7 @@ public class SpeakerManager { } /// Mark a speaker as permanent - /// - Parameter speakerId: the ID of the speaker to mark as permanent + /// - Parameter speakerId: ID of the speaker to mark as permanent public func makeSpeakerPermanent(_ speakerId: String) { queue.sync(flags: .barrier) { guard let speaker = speakerDatabase[speakerId] else { @@ -219,10 +219,11 @@ public class SpeakerManager { } /// Remove a speaker's permanent marker - /// - Parameter speakerId: the ID of the speaker to mark as permanent + /// - Parameter speakerId: ID of the speaker to mark as permanent public func revokePermanence(from speakerId: String) { queue.sync(flags: .barrier) { - guard let speaker = speakerDatabase[speakerId] else { + guard let speaker = speaker + base[speakerId] else { logger.warning("Failed to revoke permanence from speaker \(speakerId) (speaker not found).") return } @@ -236,9 +237,9 @@ public class SpeakerManager { /// - Parameters: /// - sourceId: ID of the `Speaker` being merged /// - destinationId: ID of the `Speaker` that absorbs the other one - /// - mergedName: new name for the merged speaker (uses `destination`'s name if not provided) - /// - stopIfPermanent: whether to stop merging if the source speaker is permanent - /// - Returns: `true` if merge was successful, `false` if not. + /// - mergedName: New name for the merged speaker (uses `destination`'s name if not provided) + /// - stopIfPermanent: Whether to stop merging if the source speaker is permanent + /// - Returns: `true` If merge was successful, `false` if not. public func mergeSpeaker(_ sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) -> Void { // don't merge a speaker into itself guard sourceId != destinationId else { @@ -267,9 +268,9 @@ public class SpeakerManager { /// Find all pairs of speakers that can be merged /// - Parameters: - /// - speakerThreshold: the max cosine distance between speakers to let them be considered mergeable - /// - excludeIfBothPermanent: whether to exclude speaker pairs where both speakers are permanent - /// - Returns: a list of speaker ID pairs that belong to speakers that are similar enough to be merged + /// - speakerThreshold: Max cosine distance between speakers to let them be considered mergeable + /// - excludeIfBothPermanent: Whether to exclude speaker pairs where both speakers are permanent + /// - Returns: Array of speaker ID pairs that belong to speakers that are similar enough to be merged public func findMergeablePairs(speakerThreshold: Float? = nil, excludeIfBothPermanent: Bool = true) -> [(speakerToMerge: String, destination: String)] { queue.sync { let speakerThreshold = speakerThreshold ?? self.speakerThreshold @@ -317,7 +318,7 @@ public class SpeakerManager { /// Remove a speaker from the database /// - Parameters: /// - speakerID: ID of the speaker being removed - /// - keepIfPermanent: whether to stop the removal if the speaker is marked as permanent + /// - keepIfPermanent: Whether to stop the removal if the speaker is marked as permanent public func removeSpeaker(_ speakerID: String, keepIfPermanent: Bool = true) { queue.sync(flags: .barrier) { // determine if we should skip the removal due to permanence @@ -337,8 +338,8 @@ public class SpeakerManager { /// Remove all speakers that were inactive since a given `date` /// - Parameters: - /// - data: Speakers who have not been active after this date will be removed. - /// - keepIfPermanent: whether to stop the removal if the speaker is marked as permanent + /// - date: Speakers who have not been active after this date will be removed. + /// - keepIfPermanent: Whether to stop the removal if the speaker is marked as permanent public func removeSpeakersInactive(since date: Date, keepIfPermanent: Bool = true) { queue.sync(flags: .barrier) { if keepIfPermanent { @@ -357,19 +358,19 @@ public class SpeakerManager { } } - /// remove speakers that have been inactive for a given duration + /// Remove speakers that have been inactive for a given duration /// - Parameters: /// - durationInactive: Minimum duration for which a speaker needs to be inactive to be removed - /// - keepIfPermanent: whether to stop the removal if the speaker is marked as permanent + /// - keepIfPermanent: Whether to stop the removal if the speaker is marked as permanent public func removeSpeakersInactive(for durationInactive: TimeInterval, keepIfPermanent: Bool = true) { let date = Date().addingTimeInterval(-durationInactive) self.removeSpeakersInactive(since: date, keepIfPermanent: keepIfPermanent) } - /// remove speakers that meet a certain predicate + /// Remove speakers that meet a certain predicate /// - Parameters: - /// - predicate: the predicate to determine whether the speaker should be removed - /// - keepIfPermanent: whether to stop the removal if the speaker is marked as permanent + /// - predicate: The predicate to determine whether the speaker should be removed + /// - keepIfPermanent: Whether to stop the removal if the speaker is marked as permanent public func removeSpeakers(where predicate: (Speaker) -> Bool, keepIfPermanent: Bool = true) { queue.sync(flags: .barrier) { if keepIfPermanent { @@ -387,15 +388,15 @@ public class SpeakerManager { } } - /// remove non-permanent speakers that meet a certain predicate + /// Remove non-permanent speakers that meet a certain predicate /// - Parameters: - /// - predicate: the predicate to determine whether the speaker should be removed + /// - predicate: Predicate to determine whether the speaker should be removed public func removeSpeakers(where predicate: (Speaker) -> Bool) { removeSpeakers(where: predicate, keepIfPermanent: true) } /// Check if the speaker database has a speaker with a given ID. - /// - Parameter speakerId: the ID to check + /// - Parameter speakerId: ID to check /// - Returns: `true` if a speaker is found, `false` if not public func hasSpeaker(_ speakerId: String) -> Bool { queue.sync { @@ -548,7 +549,7 @@ public class SpeakerManager { /// - updateCount: Number of updates to this speaker /// - createdAt: Creation timestamp /// - updatedAt: Last update timestamp - /// - isPermanent: whether the speaker should be protected from merges and removals by default + /// - isPermanent: Whether the speaker should be protected from merges and removals by default public func upsertSpeaker( id: String, currentEmbedding: [Float], @@ -603,7 +604,7 @@ public class SpeakerManager { } /// Reset the speaker database - /// - Parameter keepIfPermanent: whether to keep permanent speakers + /// - Parameter keepIfPermanent: Whether to keep permanent speakers public func reset(keepIfPermanent: Bool = false) { queue.sync(flags: .barrier) { if !keepIfPermanent { diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift index 36521d28f..09aee12cc 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift @@ -59,17 +59,18 @@ public final class Speaker: Identifiable, Codable, Equatable, Hashable { return SendableSpeaker(from: self) } - /// Update main embedding with new segment data using exponential moving average + /// Update main embedding with new segment data using exponential moving average (EMA) /// - Parameters: - /// - duration: segment duration - /// - + /// - duration: Segment duration + /// - embedding: 256D speaker embedding vector + /// - segmentId: The ID of the segment + /// - alpha: EMA blending parameter public func updateMainEmbedding( duration: Float, embedding: [Float], segmentId: UUID, alpha: Float = 0.9 ) { - // Validate embedding quality var sumSquares: Float = 0 vDSP_svesq(embedding, 1, &sumSquares, vDSP_Length(embedding.count)) @@ -162,8 +163,8 @@ public final class Speaker: Identifiable, Codable, Equatable, Hashable { /// Merge another speaker into this one /// - Parameters: - /// - other: other Speaker to merge - /// - keepName: the resulting name after the merge + /// - other: Other Speaker to merge + /// - keepName: The resulting name after the merge public func mergeWith(_ other: Speaker, keepName: String? = nil) { // Merge raw embeddings var allEmbeddings = rawEmbeddings + other.rawEmbeddings @@ -269,13 +270,13 @@ public struct SendableSpeaker: Sendable, Identifiable, Hashable { } public enum SpeakerInitializationMode { - /// reset the speaker database and add the new speakers + /// Reset the speaker database and add the new speakers case reset - /// merge new speakers whose IDs match with existing ones + /// Merge new speakers whose IDs match with existing ones case merge - /// overwrite existing speakers with the same IDs as the new ones + /// Overwrite existing speakers with the same IDs as the new ones case overwrite - /// skip speakers whose IDs match existing ones + /// Skip speakers whose IDs match existing ones case skip } From 44d5b550bea385d5f973ac65033259ab6b9bacce Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Wed, 5 Nov 2025 21:59:08 -0800 Subject: [PATCH 08/24] Fixed issue where merging did not respect permanent speakers in initialization --- .../FluidAudio/Diarizer/Clustering/SpeakerManager.swift | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index 508c26b3f..22c57836c 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -69,8 +69,12 @@ public class SpeakerManager { logger.warning("Failed to overwrite Speaker \(speaker.id) because it is permanent. Skipping") } case .merge: - logger.warning("Speaker \(speaker.id) is already initialized. Merging with old speaker.") - oldSpeaker.mergeWith(speaker, keepName: speaker.name) + if !(oldSpeaker.isPermanent && preserveIfPermanent) { + logger.warning("Speaker \(speaker.id) is already initialized. Merging with old speaker.") + oldSpeaker.mergeWith(speaker, keepName: speaker.name) + } else { + logger.warning("Failed to merge Speaker \(speaker.id) into Speaker \(oldSpeaker.id) because the existing speaker is permanent. Skipping") + } case .skip: logger.warning("Speaker \(speaker.id) is already initialized. Skipping new speaker.") continue From f9a48a0cf0ca148ee0c4d88a53f1307c611c7354 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Wed, 5 Nov 2025 22:03:33 -0800 Subject: [PATCH 09/24] fixed typos --- .../Diarizer/Clustering/SpeakerManager.swift | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index 22c57836c..cf4e9903a 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -109,7 +109,7 @@ public class SpeakerManager { /// - confidence: Confidence in the embedding vector being correct /// - speakerThreshold: The maximum cosine distance to an existing speaker to create a new one (uses the default threshold for this `SpeakerManager` object if none is provided) /// - newName: Name to assign the speaker if a new one is created (default: `Speaker $id`) - /// - Returns: A `Speaker` object if a match was found or a new one was created. Returns `nil` if an error occured. + /// - Returns: A `Speaker` object if a match was found or a new one was created. Returns `nil` if an error occurred. public func assignSpeaker( _ embedding: [Float], speechDuration: Float, @@ -223,11 +223,10 @@ public class SpeakerManager { } /// Remove a speaker's permanent marker - /// - Parameter speakerId: ID of the speaker to mark as permanent + /// - Parameter speakerId: ID of the speaker from which to remove the permanent marker public func revokePermanence(from speakerId: String) { queue.sync(flags: .barrier) { - guard let speaker = speaker - base[speakerId] else { + guard let speaker = speakerDatabase[speakerId] else { logger.warning("Failed to revoke permanence from speaker \(speakerId) (speaker not found).") return } @@ -243,8 +242,7 @@ public class SpeakerManager { /// - destinationId: ID of the `Speaker` that absorbs the other one /// - mergedName: New name for the merged speaker (uses `destination`'s name if not provided) /// - stopIfPermanent: Whether to stop merging if the source speaker is permanent - /// - Returns: `true` If merge was successful, `false` if not. - public func mergeSpeaker(_ sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) -> Void { + public func mergeSpeaker(_ sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) { // don't merge a speaker into itself guard sourceId != destinationId else { return From bacd0b17145a7da8ee21887e9b2d9ce33fd9ced1 Mon Sep 17 00:00:00 2001 From: Benjamin Lee <48599511+SGD2718@users.noreply.github.com> Date: Wed, 5 Nov 2025 22:13:08 -0800 Subject: [PATCH 10/24] Update Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../Diarizer/Clustering/SpeakerManager.swift | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index cf4e9903a..f39ed045a 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -611,11 +611,20 @@ public class SpeakerManager { queue.sync(flags: .barrier) { if !keepIfPermanent { speakerDatabase.removeAll() + nextSpeakerId = 1 + highestSpeakerId = 0 } else { speakerDatabase = speakerDatabase.filter(\.value.isPermanent) + // Recalculate nextSpeakerId and highestSpeakerId based on remaining permanent speakers + var maxNumericId = 0 + for id in speakerDatabase.keys { + if let numericId = Int(id) { + maxNumericId = max(maxNumericId, numericId) + } + } + highestSpeakerId = maxNumericId + nextSpeakerId = maxNumericId + 1 } - nextSpeakerId = 1 - highestSpeakerId = 0 logger.info("Speaker database reset") } } From 545d561a0d863b68a8b605e8d6ef990b621665d0 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:27:33 -0800 Subject: [PATCH 11/24] fixed a typo --- Documentation/SpeakerManager.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/SpeakerManager.md b/Documentation/SpeakerManager.md index 9bee0379d..e25fb7951 100644 --- a/Documentation/SpeakerManager.md +++ b/Documentation/SpeakerManager.md @@ -730,7 +730,7 @@ class RealtimeDiarizer { | `getSpeakerList()` | `[Speaker]` | Get array of all speakers (debugging) | | `hasSpeaker(_:)` | `Bool` | Check if database has a speaker with a given ID | | `reset(keepIfPermanent:)` | `Void` | Clear speaker database | -| `resetPermanentFlag()` | `Void` | Mark all speakers as not permanent | +| `resetPermanentFlags()` | `Void` | Mark all speakers as not permanent | | `getCurrentSpeakerNames()` | `[String]` | Get sorted speaker IDs | | `getGlobalSpeakerStats()` | `(Int, Float, Float, Int)` | Aggregate statistics | From afdb6d8a6c93e8da8b5639c9c70cd62fee20f052 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:31:52 -0800 Subject: [PATCH 12/24] fixed formatting for a logger warning --- Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index f39ed045a..ebb74d271 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -50,8 +50,7 @@ public class SpeakerManager { for speaker in speakers { guard speaker.currentEmbedding.count == Self.embeddingSize else { - logger.warning( - "Skipping speaker \(speaker.id) - invalid embedding size: \(speaker.currentEmbedding.count)") + logger.warning("Skipping speaker \(speaker.id) - invalid embedding size: \(speaker.currentEmbedding.count)") continue } From 189e38b0e78c05afeaf523539864c06be5d3608f Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Thu, 6 Nov 2025 10:33:45 -0800 Subject: [PATCH 13/24] removed extra newline from end of file --- Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift index 09aee12cc..741285ebb 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerTypes.swift @@ -269,6 +269,7 @@ public struct SendableSpeaker: Sendable, Identifiable, Hashable { } } +/// Configuration for handling initializing known speakers public enum SpeakerInitializationMode { /// Reset the speaker database and add the new speakers case reset @@ -279,4 +280,3 @@ public enum SpeakerInitializationMode { /// Skip speakers whose IDs match existing ones case skip } - From f57b86d0227848723ba0fc151b82ecc09030e7b0 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Thu, 6 Nov 2025 12:16:07 -0800 Subject: [PATCH 14/24] I think i fixed the formatting issue --- .../Diarizer/Clustering/SpeakerManager.swift | 128 ++++++++++-------- 1 file changed, 71 insertions(+), 57 deletions(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index ebb74d271..deb5c1095 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -34,45 +34,49 @@ public class SpeakerManager { self.minSpeechDuration = minSpeechDuration self.minEmbeddingUpdateDuration = minEmbeddingUpdateDuration } - + /// Add known speakers to the database /// - Parameters: /// - speakers: Array of `Speaker`s to add - /// - mode: Mode for handling overlapping ID conflicts. `.reset` will reset the speaker database before initializing new speakers. `.overwrite` will overwrite the old speakers and replace them with the new ones. `.merge` Will merge the new speakers with the old ones, keeping the new name. `.skip` will skip new speakers if their ID matches an existing one. (Default: `.skip`) + /// - mode: Mode for handling overlapping ID conflicts. /// - preservePermanent: Whether to avoid overwriting/merging pre-existing permanent speakers - public func initializeKnownSpeakers(_ speakers: [Speaker], mode: SpeakerInitializationMode = .skip, preserveIfPermanent: Bool = true) { + public func initializeKnownSpeakers( + _ speakers: [Speaker], mode: SpeakerInitializationMode = .skip, preserveIfPermanent: Bool = true + ) { if mode == .reset { self.reset(keepIfPermanent: preserveIfPermanent) } - + queue.sync(flags: .barrier) { var maxNumericId = 0 for speaker in speakers { guard speaker.currentEmbedding.count == Self.embeddingSize else { - logger.warning("Skipping speaker \(speaker.id) - invalid embedding size: \(speaker.currentEmbedding.count)") + logger.warning( + "Skipping speaker \(speaker.id) - invalid embedding size: \(speaker.currentEmbedding.count)") continue } - + // Check if the speaker ID is already initialized if let oldSpeaker = self.speakerDatabase[speaker.id] { // Handle duplicate speaker switch mode { - case .reset: - fallthrough - case .overwrite: + case .reset, .overwrite: if !(oldSpeaker.isPermanent && preserveIfPermanent) { logger.warning("Speaker \(speaker.id) is already initialized. Overwriting old speaker.") speakerDatabase[speaker.id] = speaker } else { - logger.warning("Failed to overwrite Speaker \(speaker.id) because it is permanent. Skipping") + logger.warning( + "Failed to overwrite Speaker \(speaker.id) because it is permanent. Skipping") } case .merge: if !(oldSpeaker.isPermanent && preserveIfPermanent) { logger.warning("Speaker \(speaker.id) is already initialized. Merging with old speaker.") oldSpeaker.mergeWith(speaker, keepName: speaker.name) } else { - logger.warning("Failed to merge Speaker \(speaker.id) into Speaker \(oldSpeaker.id) because the existing speaker is permanent. Skipping") + logger.warning( + "Failed to merge Speaker \(speaker.id) into Speaker \(oldSpeaker.id) because the existing speaker is permanent. Skipping" + ) } case .skip: logger.warning("Speaker \(speaker.id) is already initialized. Skipping new speaker.") @@ -123,7 +127,7 @@ public class SpeakerManager { let normalizedEmbedding = VDSPOperations.l2Normalize(embedding) let speakerThreshold = speakerThreshold ?? self.speakerThreshold - + return queue.sync(flags: .barrier) { let (closestSpeaker, distance) = findClosestSpeaker(to: normalizedEmbedding) @@ -161,7 +165,7 @@ public class SpeakerManager { return nil } } - + /// Find the closest existing speaker to an embedding, up to a maximum cosine distance of `speakerThreshold`. /// - Parameters: /// - embedding: 256D speaker embedding vector @@ -177,28 +181,30 @@ public class SpeakerManager { return (nil, .infinity) } } - + /// Find the closest existing speaker to an embedding, up to a maximum cosine distance of `speakerThreshold`. /// - Parameters: /// - embedding: 256D speaker embedding vector /// - speakerThreshold: Maximum cosine distance between `embedding` and another speaker for them to be a match (default: `self.speakerThreshold`) /// - Returns: Array of the `maxCount` nearest speakers and the distances to them from `embedding`, sorted by descending cosine distances. - public func findMatchingSpeakers(with embedding: [Float], speakerThreshold: Float? = nil) -> [(id: String, distance: Float)] { + public func findMatchingSpeakers( + with embedding: [Float], speakerThreshold: Float? = nil + ) -> [(id: String, distance: Float)] { queue.sync { var matches: [(id: String, distance: Float)] = [] let speakerThreshold = speakerThreshold ?? self.speakerThreshold - + for (speakerId, speaker) in speakerDatabase { let distance = cosineDistance(embedding, speaker.currentEmbedding) if distance <= speakerThreshold { - matches.append( (speakerId, distance) ) + matches.append((speakerId, distance)) } } matches.sort { $0.distance < $1.distance } return matches } } - + /// Find all speakers that meet a certain predicate /// - Parameter predicate: Condition the speakers must meet to be returned /// - Returns: A list of all Speaker IDs corresponding to Speakers that meet the predicate @@ -207,7 +213,7 @@ public class SpeakerManager { return speakerDatabase.filter { predicate($0.value) }.map(\.key) } } - + /// Mark a speaker as permanent /// - Parameter speakerId: ID of the speaker to mark as permanent public func makeSpeakerPermanent(_ speakerId: String) { @@ -220,7 +226,7 @@ public class SpeakerManager { speaker.isPermanent = true } } - + /// Remove a speaker's permanent marker /// - Parameter speakerId: ID of the speaker from which to remove the permanent marker public func revokePermanence(from speakerId: String) { @@ -229,93 +235,101 @@ public class SpeakerManager { logger.warning("Failed to revoke permanence from speaker \(speakerId) (speaker not found).") return } - + logger.info("Revoking permanence from speaker \(speakerId).") speaker.isPermanent = false } } - + /// Merge two speakers in the database. /// - Parameters: /// - sourceId: ID of the `Speaker` being merged /// - destinationId: ID of the `Speaker` that absorbs the other one /// - mergedName: New name for the merged speaker (uses `destination`'s name if not provided) /// - stopIfPermanent: Whether to stop merging if the source speaker is permanent - public func mergeSpeaker(_ sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true) { + public func mergeSpeaker( + _ sourceId: String, into destinationId: String, mergedName: String? = nil, stopIfPermanent: Bool = true + ) { // don't merge a speaker into itself guard sourceId != destinationId else { return } - + queue.sync(flags: .barrier) { // ensure both speakers exist guard let speakerToMerge = speakerDatabase[sourceId], - let destinationSpeaker = speakerDatabase[destinationId] else { + let destinationSpeaker = speakerDatabase[destinationId] + else { return } - + // don't merge permanent speakers into another one guard !(stopIfPermanent && speakerToMerge.isPermanent) else { return } - + // merge source into destination destinationSpeaker.mergeWith(speakerToMerge, keepName: mergedName) - + // remove source speaker speakerDatabase.removeValue(forKey: sourceId) } } - + /// Find all pairs of speakers that can be merged /// - Parameters: /// - speakerThreshold: Max cosine distance between speakers to let them be considered mergeable /// - excludeIfBothPermanent: Whether to exclude speaker pairs where both speakers are permanent /// - Returns: Array of speaker ID pairs that belong to speakers that are similar enough to be merged - public func findMergeablePairs(speakerThreshold: Float? = nil, excludeIfBothPermanent: Bool = true) -> [(speakerToMerge: String, destination: String)] { + public func findMergeablePairs( + speakerThreshold: Float? = nil, excludeIfBothPermanent: Bool = true + ) -> [(speakerToMerge: String, destination: String)] { queue.sync { let speakerThreshold = speakerThreshold ?? self.speakerThreshold var pairs: [(speakerToMerge: String, destination: String)] = [] let ids = Array(speakerDatabase.keys) - + for i in (0.. Bool) { removeSpeakers(where: predicate, keepIfPermanent: true) } - + /// Check if the speaker database has a speaker with a given ID. /// - Parameter speakerId: ID to check /// - Returns: `true` if a speaker is found, `false` if not @@ -404,13 +418,13 @@ public class SpeakerManager { return speakerDatabase.keys.contains(speakerId) } } - + private func findDistanceToClosestSpeaker(to embedding: [Float]) -> Float { return speakerDatabase.values.reduce(Float.infinity) { min($0, cosineDistance(embedding, $1.currentEmbedding)) } } - + private func findClosestSpeaker(to embedding: [Float]) -> (speakerId: String?, distance: Float) { var minDistance: Float = Float.infinity var closestSpeakerId: String? @@ -503,7 +517,7 @@ public class SpeakerManager { public var speakerIds: [String] { queue.sync { Array(speakerDatabase.keys).sorted() } } - + public var permanentSpeakerIds: [String] { queue.sync { Array(speakerDatabase.filter(\.value.isPermanent).keys).sorted() } } @@ -514,7 +528,7 @@ public class SpeakerManager { return speakerDatabase } } - + /// Get list of all speakers. public func getSpeakerList() -> [Speaker] { queue.sync { @@ -627,7 +641,7 @@ public class SpeakerManager { logger.info("Speaker database reset") } } - + /// Mark all speakers as not permanent public func resetPermanentFlags() { queue.sync(flags: .barrier) { From a492c82df24f69fb91aca58223bd4dfde992b951 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 18:05:42 -0800 Subject: [PATCH 15/24] Added Test Cases with Codex --- .../FluidAudioTests/SpeakerManagerTests.swift | 230 ++++++++++++++++++ 1 file changed, 230 insertions(+) diff --git a/Tests/FluidAudioTests/SpeakerManagerTests.swift b/Tests/FluidAudioTests/SpeakerManagerTests.swift index b8c0ab350..12b0e8605 100644 --- a/Tests/FluidAudioTests/SpeakerManagerTests.swift +++ b/Tests/FluidAudioTests/SpeakerManagerTests.swift @@ -15,6 +15,10 @@ final class SpeakerManagerTests: XCTestCase { return embedding } + private func normalizedEmbedding(pattern: Int) -> [Float] { + VDSPOperations.l2Normalize(createDistinctEmbedding(pattern: pattern)) + } + // MARK: - Basic Operations func testInitialization() { @@ -119,6 +123,80 @@ final class SpeakerManagerTests: XCTestCase { XCTAssertEqual(assignedSpeaker?.id, "Alice") } + func testInitializeKnownSpeakersPreservesPermanentByDefault() { + let manager = SpeakerManager() + + let original = Speaker( + id: "Alice", + name: "Original", + currentEmbedding: createDistinctEmbedding(pattern: 10), + duration: 4.0 + ) + manager.initializeKnownSpeakers([original]) + manager.makeSpeakerPermanent("Alice") + + let replacement = Speaker( + id: "Alice", + name: "Replacement", + currentEmbedding: createDistinctEmbedding(pattern: 20), + duration: 8.0 + ) + + manager.initializeKnownSpeakers([replacement], mode: .overwrite, preserveIfPermanent: true) + + let stored = manager.getSpeaker(for: "Alice") + XCTAssertEqual(stored?.name, "Original") + XCTAssertEqual(stored?.duration, 4.0) + } + + func testInitializeKnownSpeakersOverwriteCanReplacePermanentWhenAllowed() { + let manager = SpeakerManager() + + let original = Speaker( + id: "Alice", + name: "Original", + currentEmbedding: createDistinctEmbedding(pattern: 10), + duration: 4.0, + isPermanent: true + ) + manager.initializeKnownSpeakers([original]) + + let replacement = Speaker( + id: "Alice", + name: "Replacement", + currentEmbedding: createDistinctEmbedding(pattern: 20), + duration: 10.0 + ) + + manager.initializeKnownSpeakers([replacement], mode: .overwrite, preserveIfPermanent: false) + + let stored = manager.getSpeaker(for: "Alice") + XCTAssertEqual(stored?.name, "Replacement") + XCTAssertEqual(stored?.duration, 10.0) + } + + func testInitializeKnownSpeakersMergeCombinesDurations() { + let manager = SpeakerManager() + + let base = Speaker( + id: "Alice", + name: "Alice", + currentEmbedding: createDistinctEmbedding(pattern: 10), + duration: 2.0 + ) + let incoming = Speaker( + id: "Alice", + name: "Alice", + currentEmbedding: createDistinctEmbedding(pattern: 11), + duration: 3.0 + ) + + manager.initializeKnownSpeakers([base]) + manager.initializeKnownSpeakers([incoming], mode: .merge) + + XCTAssertEqual(manager.getSpeaker(for: "Alice")?.duration, 5.0) + } + func testInvalidEmbeddingSize() { let manager = SpeakerManager() @@ -206,6 +284,46 @@ final class SpeakerManagerTests: XCTestCase { } } + // MARK: - Lookup Helpers + + func testFindSpeakerAndMatchingSpeakers() { + let manager = SpeakerManager(speakerThreshold: 0.8) + + manager.upsertSpeaker(id: "A", currentEmbedding: normalizedEmbedding(pattern: 1), duration: 5.0) + manager.upsertSpeaker(id: "B", currentEmbedding: normalizedEmbedding(pattern: 2), duration: 5.0) + + let (matchId, distance) = manager.findSpeaker(with: normalizedEmbedding(pattern: 1)) + XCTAssertEqual(matchId, "A") + XCTAssertEqual(distance, 0.0, accuracy: 0.0001) + + let (missingId, missingDistance) = manager.findSpeaker( + with: normalizedEmbedding(pattern: 90), + speakerThreshold: 0.1 + ) + XCTAssertNil(missingId) + XCTAssertEqual(missingDistance, .infinity) + + let combined = zip(normalizedEmbedding(pattern: 1), normalizedEmbedding(pattern: 2)).map { ($0 + $1) / 2 } + let matches = manager.findMatchingSpeakers( + with: VDSPOperations.l2Normalize(combined), + speakerThreshold: 2.0 + ) + + XCTAssertEqual(matches.count, 2) + XCTAssertLessThanOrEqual(matches[0].distance, matches[1].distance) + XCTAssertEqual(Set(matches.map(\.id)), Set(["A", "B"])) + } + + func testFindSpeakersWhereFiltersByPredicate() { + let manager = SpeakerManager() + manager.upsertSpeaker(id: "short", currentEmbedding: normalizedEmbedding(pattern: 10), duration: 1.0) + manager.upsertSpeaker(id: "long", currentEmbedding: normalizedEmbedding(pattern: 20), duration: 8.0) + + let filtered = manager.findSpeakers { $0.duration > 5.0 } + XCTAssertEqual(filtered.count, 1) + XCTAssertEqual(filtered.first, "long") + } + // MARK: - Clear Operations func testResetSpeakers() { @@ -426,6 +544,118 @@ final class SpeakerManagerTests: XCTestCase { XCTAssertEqual(info?.rawEmbeddings.count, 1) } + // MARK: - Permanence & Merge Operations + + func testMakeAndRevokePermanentSpeakers() throws { + let manager = SpeakerManager() + let speaker = manager.assignSpeaker(createDistinctEmbedding(pattern: 1), speechDuration: 2.5) + let id = try XCTUnwrap(speaker?.id) + + manager.makeSpeakerPermanent(id) + XCTAssertTrue(manager.permanentSpeakerIds.contains(id)) + + manager.removeSpeaker(id) + XCTAssertTrue(manager.hasSpeaker(id)) + + manager.revokePermanence(from: id) + manager.removeSpeaker(id) + XCTAssertFalse(manager.hasSpeaker(id)) + } + + func testMergeSpeakerRespectsPermanentFlag() throws { + let manager = SpeakerManager() + let speaker1 = manager.assignSpeaker(createDistinctEmbedding(pattern: 1), speechDuration: 3.0) + let speaker2 = manager.assignSpeaker(createDistinctEmbedding(pattern: 2), speechDuration: 4.0) + + let id1 = try XCTUnwrap(speaker1?.id) + let id2 = try XCTUnwrap(speaker2?.id) + + manager.makeSpeakerPermanent(id1) + manager.mergeSpeaker(id1, into: id2) + XCTAssertTrue(manager.hasSpeaker(id1)) + XCTAssertTrue(manager.hasSpeaker(id2)) + + manager.mergeSpeaker(id1, into: id2, mergedName: "Merged Speaker", stopIfPermanent: false) + XCTAssertFalse(manager.hasSpeaker(id1)) + let merged = try XCTUnwrap(manager.getSpeaker(for: id2)) + XCTAssertEqual(merged.name, "Merged Speaker") + XCTAssertEqual(manager.speakerCount, 1) + XCTAssertGreaterThan(merged.duration, 4.0) + } + + func testFindMergeablePairsRespectsPermanentExclusion() { + let manager = SpeakerManager(speakerThreshold: 0.3) + let base = normalizedEmbedding(pattern: 1) + var close = base + close[0] += 0.001 + close = VDSPOperations.l2Normalize(close) + let far = normalizedEmbedding(pattern: 80) + + manager.upsertSpeaker(id: "A", currentEmbedding: base, duration: 5.0) + manager.upsertSpeaker(id: "B", currentEmbedding: close, duration: 5.0) + manager.upsertSpeaker(id: "C", currentEmbedding: far, duration: 5.0) + + let pairs = manager.findMergeablePairs(speakerThreshold: 0.2) + XCTAssertEqual(pairs.count, 1) + XCTAssertEqual(Set([pairs[0].speakerToMerge, pairs[0].destination]), Set(["A", "B"])) + + manager.makeSpeakerPermanent("A") + manager.makeSpeakerPermanent("B") + + let filtered = manager.findMergeablePairs(speakerThreshold: 0.2, excludeIfBothPermanent: true) + XCTAssertTrue(filtered.isEmpty) + + let unfiltered = manager.findMergeablePairs(speakerThreshold: 0.2, excludeIfBothPermanent: false) + XCTAssertEqual(unfiltered.count, 1) + XCTAssertEqual(Set([unfiltered[0].speakerToMerge, unfiltered[0].destination]), Set(["A", "B"])) + } + + // MARK: - Removal & Reset + + func testRemoveSpeakersInactiveAndPredicateVariants() { + let manager = SpeakerManager() + let now = Date() + manager.upsertSpeaker( + id: "old", + currentEmbedding: normalizedEmbedding(pattern: 3), + duration: 2.0, + updatedAt: now.addingTimeInterval(-120) + ) + manager.upsertSpeaker( + id: "recent", + currentEmbedding: normalizedEmbedding(pattern: 4), + duration: 2.0, + updatedAt: now + ) + + manager.removeSpeakersInactive(since: now.addingTimeInterval(-60)) + XCTAssertFalse(manager.hasSpeaker("old")) + XCTAssertTrue(manager.hasSpeaker("recent")) + + manager.makeSpeakerPermanent("recent") + manager.removeSpeakers { $0.duration <= 2.0 } + XCTAssertTrue(manager.hasSpeaker("recent")) + + manager.removeSpeakers(where: { $0.duration <= 2.0 }, keepIfPermanent: false) + XCTAssertFalse(manager.hasSpeaker("recent")) + } + + func testResetKeepsPermanentSpeakers() throws { + let manager = SpeakerManager() + let speaker1 = manager.assignSpeaker(createDistinctEmbedding(pattern: 1), speechDuration: 2.0) + let speaker2 = manager.assignSpeaker(createDistinctEmbedding(pattern: 2), speechDuration: 2.0) + + let id1 = try XCTUnwrap(speaker1?.id) + let id2 = try XCTUnwrap(speaker2?.id) + + manager.makeSpeakerPermanent(id1) + manager.reset(keepIfPermanent: true) + + XCTAssertTrue(manager.hasSpeaker(id1)) + XCTAssertFalse(manager.hasSpeaker(id2)) + XCTAssertEqual(manager.speakerIds, [id1]) + } + // MARK: - Embedding Update Tests func testEmbeddingUpdateWithinAssignSpeaker() { From ba6803070cf67c173bbf0afbbe9bc4cb3f17140d Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 19:24:32 -0800 Subject: [PATCH 16/24] fixed the test that failed --- Tests/FluidAudioTests/SpeakerManagerTests.swift | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/Tests/FluidAudioTests/SpeakerManagerTests.swift b/Tests/FluidAudioTests/SpeakerManagerTests.swift index 12b0e8605..ad1010687 100644 --- a/Tests/FluidAudioTests/SpeakerManagerTests.swift +++ b/Tests/FluidAudioTests/SpeakerManagerTests.swift @@ -291,14 +291,20 @@ final class SpeakerManagerTests: XCTestCase { manager.upsertSpeaker(id: "A", currentEmbedding: normalizedEmbedding(pattern: 1), duration: 5.0) manager.upsertSpeaker(id: "B", currentEmbedding: normalizedEmbedding(pattern: 2), duration: 5.0) + let (matchId, distance) = manager.findSpeaker(with: normalizedEmbedding(pattern: 1)) XCTAssertEqual(matchId, "A") XCTAssertEqual(distance, 0.0, accuracy: 0.0001) + var orthogonalEmbedding0 = [Float](repeating: 0, count: 256) + var orthogonalEmbedding1 = [Float](repeating: 0, count: 256) + orthogonalEmbedding0[0] = 1 + orthogonalEmbedding1[1] = 1 + manager.upsertSpeaker(id: "C", currentEmbedding: orthogonalEmbedding0, duration: 5.0) let (missingId, missingDistance) = manager.findSpeaker( - with: normalizedEmbedding(pattern: 90), - speakerThreshold: 0.1 + with: orthogonalEmbedding1, + speakerThreshold: 0.5 ) XCTAssertNil(missingId) XCTAssertEqual(missingDistance, .infinity) @@ -309,9 +315,9 @@ final class SpeakerManagerTests: XCTestCase { speakerThreshold: 2.0 ) - XCTAssertEqual(matches.count, 2) + XCTAssertEqual(matches.count, 3) XCTAssertLessThanOrEqual(matches[0].distance, matches[1].distance) - XCTAssertEqual(Set(matches.map(\.id)), Set(["A", "B"])) + XCTAssertEqual(Set(matches.map(\.id)), Set(["A", "B", "C"])) } func testFindSpeakersWhereFiltersByPredicate() { From 432ee7c7d72f1c64a11f3d0160f750c7a827e709 Mon Sep 17 00:00:00 2001 From: Benjamin Lee <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 21:14:57 -0800 Subject: [PATCH 17/24] Update Documentation/SpeakerManager.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- Documentation/SpeakerManager.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/SpeakerManager.md b/Documentation/SpeakerManager.md index e25fb7951..6f91fab92 100644 --- a/Documentation/SpeakerManager.md +++ b/Documentation/SpeakerManager.md @@ -147,7 +147,7 @@ speakerManager.removeSpeaker("alice", keepIfPermanent: false) > Note: `keepIfPermanent` is `true` by default. #### removeSpeakersInactive -Remove speaker that have been inactive since a certain date or for a certain duration. +Remove speakers that have been inactive since a certain date or for a certain duration. ```swift // remove speakers that have been inactive since `date` From de77a389ec511c3d8a108a7805620177c1e8b3c0 Mon Sep 17 00:00:00 2001 From: Benjamin Lee <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 21:35:19 -0800 Subject: [PATCH 18/24] Update Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index deb5c1095..e1dbe93e4 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -186,7 +186,7 @@ public class SpeakerManager { /// - Parameters: /// - embedding: 256D speaker embedding vector /// - speakerThreshold: Maximum cosine distance between `embedding` and another speaker for them to be a match (default: `self.speakerThreshold`) - /// - Returns: Array of the `maxCount` nearest speakers and the distances to them from `embedding`, sorted by descending cosine distances. + /// - Returns: Array of the `maxCount` nearest speakers and the distances to them from `embedding`, sorted by ascending cosine distances (from closest to farthest). public func findMatchingSpeakers( with embedding: [Float], speakerThreshold: Float? = nil ) -> [(id: String, distance: Float)] { From 7204b4d71d29cb286d27ce928f273e8eb43a020b Mon Sep 17 00:00:00 2001 From: Benjamin Lee <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 21:36:11 -0800 Subject: [PATCH 19/24] Update Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index e1dbe93e4..87caa8a33 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -77,6 +77,7 @@ public class SpeakerManager { logger.warning( "Failed to merge Speaker \(speaker.id) into Speaker \(oldSpeaker.id) because the existing speaker is permanent. Skipping" ) + continue } case .skip: logger.warning("Speaker \(speaker.id) is already initialized. Skipping new speaker.") From e9ea4d206a746d340592c62861069fb79c0130da Mon Sep 17 00:00:00 2001 From: Benjamin Lee <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 21:36:23 -0800 Subject: [PATCH 20/24] Update Documentation/SpeakerManager.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- Documentation/SpeakerManager.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/SpeakerManager.md b/Documentation/SpeakerManager.md index 6f91fab92..860b8ddf0 100644 --- a/Documentation/SpeakerManager.md +++ b/Documentation/SpeakerManager.md @@ -237,7 +237,7 @@ Returns a list of pairs of speaker IDs. ```swift let pairs = speakerManager.findMergeablePairs( - speakerThreshold = 0.6, // optional + speakerThreshold: 0.6, // optional excludeIfBothPermanent = true // optional ) From 4b60a44fce6094b12c3be7a7bf94e92fa6bd89e2 Mon Sep 17 00:00:00 2001 From: Benjamin Lee <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 21:36:40 -0800 Subject: [PATCH 21/24] Update Documentation/SpeakerManager.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- Documentation/SpeakerManager.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/SpeakerManager.md b/Documentation/SpeakerManager.md index 860b8ddf0..2002c96be 100644 --- a/Documentation/SpeakerManager.md +++ b/Documentation/SpeakerManager.md @@ -238,7 +238,7 @@ Returns a list of pairs of speaker IDs. ```swift let pairs = speakerManager.findMergeablePairs( speakerThreshold: 0.6, // optional - excludeIfBothPermanent = true // optional + excludeIfBothPermanent: true // optional ) for pair in pairs { From 125c9afd683b4a443f88cbafeb4d2e691b3594f4 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 21:38:57 -0800 Subject: [PATCH 22/24] Fixed Minor bugs --- Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift index deb5c1095..60836f4a0 100644 --- a/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift +++ b/Sources/FluidAudio/Diarizer/Clustering/SpeakerManager.swift @@ -68,6 +68,7 @@ public class SpeakerManager { } else { logger.warning( "Failed to overwrite Speaker \(speaker.id) because it is permanent. Skipping") + continue } case .merge: if !(oldSpeaker.isPermanent && preserveIfPermanent) { @@ -77,6 +78,7 @@ public class SpeakerManager { logger.warning( "Failed to merge Speaker \(speaker.id) into Speaker \(oldSpeaker.id) because the existing speaker is permanent. Skipping" ) + continue } case .skip: logger.warning("Speaker \(speaker.id) is already initialized. Skipping new speaker.") @@ -585,7 +587,7 @@ public class SpeakerManager { existingSpeaker.rawEmbeddings = rawEmbeddings existingSpeaker.updateCount = updateCount existingSpeaker.updatedAt = updatedAt ?? now - existingSpeaker.isPermanent = isPermanent + existingSpeaker.isPermanent = existingSpeaker.isPermanent || isPermanent // Keep original createdAt and name speakerDatabase[id] = existingSpeaker From 4a07829491e9c76ff76e0b0ec6fd0d790ae0a11e Mon Sep 17 00:00:00 2001 From: Benjamin Lee <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 21:53:07 -0800 Subject: [PATCH 23/24] Update Documentation/SpeakerManager.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- Documentation/SpeakerManager.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/SpeakerManager.md b/Documentation/SpeakerManager.md index 2002c96be..026ff7ea1 100644 --- a/Documentation/SpeakerManager.md +++ b/Documentation/SpeakerManager.md @@ -153,7 +153,7 @@ Remove speakers that have been inactive since a certain date or for a certain du // remove speakers that have been inactive since `date` speakerManager.removeSpeakersInactive(since: date) -// remove speakers that have been active for 10 seconds, even if they were marked as permanent +// remove speakers that have been inactive for 10 seconds, even if they were marked as permanent speakerManager.removeSpeakersInactive(for: 10.0, keepIfPermanent: false) ``` From a6b99fde31bdd2d90ed6ae9f5f322f92394a66a1 Mon Sep 17 00:00:00 2001 From: Gradient Descent <48599511+SGD2718@users.noreply.github.com> Date: Fri, 7 Nov 2025 22:14:39 -0800 Subject: [PATCH 24/24] Fixed Formatting Issues (I think) --- Tests/FluidAudioTests/SpeakerManagerTests.swift | 1 - 1 file changed, 1 deletion(-) diff --git a/Tests/FluidAudioTests/SpeakerManagerTests.swift b/Tests/FluidAudioTests/SpeakerManagerTests.swift index ad1010687..84986f620 100644 --- a/Tests/FluidAudioTests/SpeakerManagerTests.swift +++ b/Tests/FluidAudioTests/SpeakerManagerTests.swift @@ -291,7 +291,6 @@ final class SpeakerManagerTests: XCTestCase { manager.upsertSpeaker(id: "A", currentEmbedding: normalizedEmbedding(pattern: 1), duration: 5.0) manager.upsertSpeaker(id: "B", currentEmbedding: normalizedEmbedding(pattern: 2), duration: 5.0) - let (matchId, distance) = manager.findSpeaker(with: normalizedEmbedding(pattern: 1)) XCTAssertEqual(matchId, "A")