
Commit 1e1761b

Streaming VAD and Speech Segmentation (#110)
### Why is this change needed?

Taking inspiration from the Silero reference implementation
(https://github.com/snakers4/silero-vad/blob/master/src/silero_vad/utils_vad.py),
this change updates our segmentation implementation and adds support for streaming VAD.

```bash
% swift run fluidaudio vad-analyze voiceink-issue-279.wav --seconds --mode streaming
Building for debugging...
[1/1] Write swift-version--58304C5D6DBC2206.txt
Build of product 'fluidaudio' complete! (0.07s)
[00:08:02.789] [INFO] [DownloadUtils] Found silero-vad-coreml locally, no download needed
[00:08:02.812] [INFO] [DownloadUtils] Loaded model: silero-vad-unified-256ms-v6.0.0.mlmodelc
[00:08:02.812] [INFO] [VadManager] VAD model loaded successfully
[00:08:02.812] [INFO] [VadManager] VAD system initialized in 0.02s
[00:08:02.812] [INFO] [VadAnalyze] 📶 Running streaming simulation...
[00:08:02.820] [INFO] [VadAnalyze] • Speech Start at 1.200s
[00:08:02.821] [INFO] [VadAnalyze] • Speech End at 2.700s
[00:08:02.822] [INFO] [VadAnalyze] • Speech Start at 4.300s
[00:08:02.825] [INFO] [VadAnalyze] • Speech End at 7.800s
[00:08:02.828] [INFO] [VadAnalyze] • Speech Start at 13.700s
[00:08:02.830] [INFO] [VadAnalyze] • Speech End at 16.200s
[00:08:02.830] [INFO] [VadAnalyze] • Speech Start at 17.300s
[00:08:02.832] [INFO] [VadAnalyze] • Speech End at 19.000s
[00:08:02.839] [INFO] [VadAnalyze] • Speech Start at 29.600s
[00:08:02.840] [INFO] [VadAnalyze] • Speech End at 30.600s
[00:08:02.849] [INFO] [VadAnalyze] • Speech Start at 45.000s
[00:08:02.849] [INFO] [VadAnalyze] Flushing trailing silence to close open segments...
[00:08:02.850] [INFO] [VadAnalyze] • Speech End at 45.500s
[00:08:02.850] [INFO] [VadAnalyze] Streaming simulation produced 12 events

% swift run fluidaudio vad-analyze voiceink-issue-279.wav --seconds
Building for debugging...
[1/1] Write swift-version--58304C5D6DBC2206.txt
Build of product 'fluidaudio' complete! (0.07s)
[00:08:08.289] [INFO] [DownloadUtils] Found silero-vad-coreml locally, no download needed
[00:08:08.309] [INFO] [DownloadUtils] Loaded model: silero-vad-unified-256ms-v6.0.0.mlmodelc
[00:08:08.309] [INFO] [VadManager] VAD model loaded successfully
[00:08:08.309] [INFO] [VadManager] VAD system initialized in 0.02s
[00:08:08.309] [INFO] [VadAnalyze] 📍 Running offline speech segmentation...
[00:08:08.344] [INFO] [VadAnalyze] Detected 6 speech segments in 0.03s
[00:08:08.344] [INFO] [VadAnalyze] RTFx: 1369.21x (audio: 45.66s, inference: 0.03s)
[00:08:08.344] [INFO] [VadAnalyze] Segment #1: samples 18880-42560 (1.18s-2.66s)
[00:08:08.344] [INFO] [VadAnalyze] Segment #2: samples 68032-124480 (4.25s-7.78s)
[00:08:08.344] [INFO] [VadAnalyze] Segment #3: samples 219584-259648 (13.72s-16.23s)
[00:08:08.344] [INFO] [VadAnalyze] Segment #4: samples 276928-304704 (17.31s-19.04s)
[00:08:08.344] [INFO] [VadAnalyze] Segment #5: samples 473536-489024 (29.60s-30.56s)
[00:08:08.344] [INFO] [VadAnalyze] Segment #6: samples 719296-730616 (44.96s-45.66s)

% ffmpeg -i voiceink-issue-279.wav -af silencedetect=noise=-30dB:d=0.5 -f null -
ffmpeg version 8.0 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 17.0.0 (clang-1700.0.13.3)
  ...
  libavutil      60.  8.100 / 60.  8.100
  libavcodec     62. 11.100 / 62. 11.100
  libavformat    62.  3.100 / 62.  3.100
  libavdevice    62.  1.100 / 62.  1.100
  libavfilter    11.  4.100 / 11.  4.100
  libswscale      9.  1.100 /  9.  1.100
  libswresample   6.  1.100 /  6.  1.100
[aist#0:0/pcm_s16le @ 0xb22c38180] Guessed Channel Layout: mono
Input #0, wav, from 'voiceink-issue-279.wav':
  Duration: 00:00:45.66, bitrate: 256 kb/s
  Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 16000 Hz, mono, s16, 256 kb/s
Stream mapping:
  Stream #0:0 -> #0:0 (pcm_s16le (native) -> pcm_s16le (native))
Press [q] to stop, [?] for help
Output #0, null, to 'pipe:':
  Metadata:
    encoder         : Lavf62.3.100
  Stream #0:0: Audio: pcm_s16le, 16000 Hz, mono, s16, 256 kb/s
    Metadata:
      encoder         : Lavc62.11.100 pcm_s16le
[silencedetect @ 0xb22c6c420] silence_start: 0
[silencedetect @ 0xb22c6c420] silence_end: 1.364 | silence_duration: 1.364
[silencedetect @ 0xb22c6c420] silence_start: 2.305687
[silencedetect @ 0xb22c6c420] silence_end: 4.394813 | silence_duration: 2.089125
[silencedetect @ 0xb22c6c420] silence_start: 7.579813
[silencedetect @ 0xb22c6c420] silence_end: 14.003938 | silence_duration: 6.424125
[silencedetect @ 0xb22c6c420] silence_start: 15.845063
[silencedetect @ 0xb22c6c420] silence_end: 17.45075 | silence_duration: 1.605687
[silencedetect @ 0xb22c6c420] silence_start: 18.692625
[silencedetect @ 0xb22c6c420] silence_end: 29.667438 | silence_duration: 10.974813
[silencedetect @ 0xb22c6c420] silence_start: 30.367563
[silencedetect @ 0xb22c6c420] silence_end: 41.412062 | silence_duration: 11.0445
[silencedetect @ 0xb22c6c420] silence_start: 41.454687
[silencedetect @ 0xb22c6c420] silence_end: 45.000813 | silence_duration: 3.546125
[out#0/null @ 0xb2300c780] video:0KiB audio:1427KiB subtitle:0KiB other streams:0KiB global headers:0KiB muxing overhead: unknown
size=N/A time=00:00:45.66 bitrate=N/A speed=8.51e+03x elapsed=0:00:00.00
```
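The sample indices and second timestamps in the offline log above are linked by the file's fixed 16 kHz sample rate (`seconds = sample / 16000`). As a quick sanity check, here is a minimal sketch of the conversion; the helper is illustrative and not part of the library:

```swift
// Sketch: map vad-analyze sample indices to seconds at a fixed 16 kHz rate.
let sampleRate = 16_000.0

func seconds(fromSample index: Int) -> Double {
    Double(index) / sampleRate
}

// Segment #1 from the offline run: samples 18880-42560
print(String(format: "%.2fs-%.2fs",
             seconds(fromSample: 18_880),
             seconds(fromSample: 42_560)))
// 1.18s-2.66s
```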
1 parent 86522ab commit 1e1761b

15 files changed

Lines changed: 1286 additions & 312 deletions

Documentation/CLI.md

Lines changed: 10 additions & 2 deletions
@@ -42,13 +42,22 @@ swift run fluidaudio diarization-benchmark --dataset ami-sdm \
 ## VAD
 
 ```bash
+# Offline segmentation with seconds output (default mode)
+swift run fluidaudio vad-analyze path/to/audio.wav
+
+# Streaming only with 128 ms chunks and a custom threshold (timestamps emitted in seconds)
+swift run fluidaudio vad-analyze path/to/audio.wav --streaming --threshold 0.65 --min-silence-ms 400
+
 # Run VAD benchmark (mini50 dataset by default)
 swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3
 
-# Save results and enable debug output
+# Save benchmark results and enable debug output
 swift run fluidaudio vad-benchmark --all-files --output vad_results.json --debug
 ```
 
+`swift run fluidaudio vad-analyze --help` lists every tuning option (padding,
+negative threshold overrides, max-duration splitting, etc.).
+
 ## Datasets
 
 ```bash
@@ -58,4 +67,3 @@ swift run fluidaudio download --dataset librispeech-test-other
 swift run fluidaudio download --dataset ami-sdm
 swift run fluidaudio download --dataset vad
 ```
-
Documentation/Guides/MCP.md

Lines changed: 0 additions & 24 deletions
This file was deleted.
Lines changed: 153 additions & 34 deletions
@@ -1,59 +1,178 @@
 # Voice Activity Detection (VAD)
 
-The current VAD APIs require careful tuning for your specific use case. If you need help integrating VAD, reach out in our Discord channel.
+Fluid Audio ships the Silero VAD converted for Core ML together with Silero-style
+timestamp extraction and streaming hysteresis. If you need help tuning the
+parameters for your use case, reach out on Discord.
 
-Our goal is to provide a streamlined API similar to Apple's upcoming SpeechDetector in [OS26](https://developer.apple.com/documentation/speech/speechdetector).
+## Quick Start
 
-## Quick Start (Code)
+Need chunk-level probabilities or state for custom pipelines? Call `process(_:)`
+to inspect every 256 ms hop:
+
+```swift
+let results = try await manager.process(samples)
+for (index, chunk) in results.enumerated() {
+    print(
+        String(
+            format: "Chunk %02d: prob=%.3f, inference=%.4fs",
+            index,
+            chunk.probability,
+            chunk.processingTime
+        )
+    )
+}
+```
+
+## Offline Segmentation (Code)
+
+`VadManager` can now emit ready-to-use speech intervals directly from PCM
+samples. The segmentation logic mirrors the Silero reference implementation,
+including minimum speech duration, silence padding, and max-duration splitting.
 
 ```swift
 import FluidAudio
 
-// Programmatic VAD over an audio file
 Task {
-    // 1) Initialize VAD (async loads Silero model)
-    let vad = try await VadManager(
-        config: VadConfig(threshold: 0.85, debugMode: false)
+    let manager = try await VadManager(
+        config: VadConfig(threshold: 0.75)
+    )
+
+    // Convert any supported file to 16 kHz mono Float32
+    let audioURL = URL(fileURLWithPath: "path/to/audio.wav")
+    let samples = try AudioConverter().resampleAudioFile(audioURL)
+
+    // Tune segmentation behavior with VadSegmentationConfig
+    var segmentation = VadSegmentationConfig.default
+    segmentation.minSpeechDuration = 0.25
+    segmentation.minSilenceDuration = 0.4
+    segmentation.speechPadding = 0.12
+
+    let segments = try await manager.segmentSpeech(samples, config: segmentation)
+    for (index, segment) in segments.enumerated() {
+        print(String(
+            format: "Segment %02d: %.2f–%.2fs",
+            index + 1,
+            segment.startTime,
+            segment.endTime
+        ))
+    }
+
+    // Need audio chunks instead of timestamps?
+    let clips = try await manager.segmentSpeechAudio(samples, config: segmentation)
+    print("Extracted \(clips.count) buffered segments ready for ASR")
+}
+```
+
+Need chunk-level probabilities for each 256 ms hop? Use `process(_:)` and inspect
+`VadResult` directly:
+
+```swift
+let results = try await manager.process(samples)
+for (index, chunk) in results.enumerated() {
+    print(
+        String(
+            format: "Chunk %02d: prob=%.3f, inference=%.4fs",
+            index,
+            chunk.probability,
+            chunk.processingTime
+        )
     )
+}
+```
+
+Key knobs in `VadSegmentationConfig`:
+- `minSpeechDuration`: discard very short bursts.
+- `minSilenceDuration`: silence length required to close a segment.
+- `maxSpeechDuration`: automatically split long spans using the last detected silence (default 14 s).
+- `speechPadding`: context added on both sides of each returned segment.
+- `negativeThreshold`/`negativeThresholdOffset`: control hysteresis the same way as Silero's `threshold`/`neg_threshold`.
+
+### Measuring Offline RTF
 
-    // 2) Process any supported file; conversion to 16 kHz mono is automatic
-    let url = URL(fileURLWithPath: "path/to/audio.wav")
-    let results = try await vad.process(url)
-
-    // 3) Convert per-frame decisions into segments (512-sample frames @ 16 kHz)
-    let sampleRate = 16000.0
-    let frame = 512.0
-
-    var startIndex: Int? = nil
-    for (i, r) in results.enumerated() {
-        if r.isVoiceActive {
-            startIndex = startIndex ?? i
-        } else if let s = startIndex {
-            let startSec = (Double(s) * frame) / sampleRate
-            let endSec = (Double(i + 1) * frame) / sampleRate
-            print(String(format: "Speech: %.2f–%.2fs", startSec, endSec))
-            startIndex = nil
+If you prefer to keep the per-chunk `VadResult` output, you can measure the
+real-time factor (RTFx) of non-streaming runs by comparing total inference time
+with the audio duration:
+
+```swift
+let results = try await manager.process(samples)
+let totalInference = results.reduce(0.0) { $0 + $1.processingTime }
+let audioSeconds = Double(samples.count) / Double(VadManager.sampleRate)
+let rtf = audioSeconds / totalInference
+print(String(format: "VAD RTFx: %.1f", rtf))
+```
+
+`VadResult.processingTime` is reported per 4096-sample chunk, so summing across
+the array yields the full pass latency.
+
+## Streaming API
+
+For streaming workloads you control the chunk size and maintain a
+`VadStreamState`. Each call emits at most one `VadStreamEvent` describing a
+speech start or end boundary.
+
+```swift
+import FluidAudio
+
+Task {
+    let manager = try await VadManager()
+    var state = await manager.makeStreamState()
+
+    for chunk in microphoneChunks { // chunk length ~256 ms at 16 kHz
+        let result = try await manager.processStreamingChunk(
+            chunk,
+            state: state,
+            config: .default,
+            returnSeconds: true,
+            timeResolution: 2
+        )
+
+        state = result.state
+        if let event = result.event {
+            switch event.kind {
+            case .speechStart:
+                print("Speech began at \(event.time ?? 0) s")
+            case .speechEnd:
+                print("Speech ended at \(event.time ?? 0) s")
+            }
         }
     }
 }
 ```
 
 Notes:
-- You can also call `process(_ buffer: AVAudioPCMBuffer)` or `process(_ samples: [Float])`.
-- Frame size is `512` samples (32 ms at 16 kHz). Threshold defaults to `0.85`.
+- Stream chunks do not need to be exactly 4096 samples; choose what matches your input cadence.
+- Call `makeStreamState()` whenever you reset your audio stream (equivalent to Silero's `reset_states`).
+- When requesting seconds (`returnSeconds: true`), timestamps are rounded using `timeResolution` decimal places.
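The notes above leave the streaming chunk size up to the caller. As a sketch of how a `microphoneChunks`-style sequence could be produced (the helper below is illustrative, not a library API), a sample buffer can be sliced into fixed-size pieces, where 4096 samples correspond to 256 ms at 16 kHz:

```swift
// Sketch: slice a 16 kHz Float buffer into fixed-size streaming chunks.
// 4096 samples ≈ 256 ms at 16 kHz; the final chunk may be shorter.
func makeChunks(_ samples: [Float], chunkSize: Int = 4096) -> [[Float]] {
    stride(from: 0, to: samples.count, by: chunkSize).map { start in
        Array(samples[start..<min(start + chunkSize, samples.count)])
    }
}

let chunks = makeChunks([Float](repeating: 0, count: 10_000))
print(chunks.map(\.count))
// [4096, 4096, 1808]
```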
 
 ## CLI
 
+Start with the general-purpose `process` command, which runs the diarization
+pipeline (and therefore VAD) end-to-end on a single file:
+
 ```bash
-# Run VAD benchmark (mini50 dataset by default)
-swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3
+swift run fluidaudio process path/to/audio.wav
+```
 
-# Save results and enable debug output
-swift run fluidaudio vad-benchmark --all-files --output vad_results.json --debug
+Once you need to experiment with the VAD-specific heuristics directly, use the
+CLI commands below:
 
-# VOiCES subset mixed-condition benchmark (high-precision setting)
-swift run fluidaudio vad-benchmark --dataset voices-subset --all-files --threshold 0.85
+```bash
+# Inspect offline segments (default mode is offline only)
+swift run fluidaudio vad-analyze path/to/audio.wav
 
-# Download VAD dataset if needed
-swift run fluidaudio download --dataset vad
+# Streaming only, 128 ms chunks, tighter silence rules (timestamps are emitted in seconds)
+swift run fluidaudio vad-analyze path/to/audio.wav --streaming --min-silence-ms 300
+
+# Run both offline + streaming in one pass
+swift run fluidaudio vad-analyze path/to/audio.wav --mode both
+
+# Classic benchmark tooling remains available
+swift run fluidaudio vad-benchmark --num-files 50 --threshold 0.3
 ```
+
+`swift run fluidaudio vad-analyze --help` prints the full list of tuning
+options, including negative-threshold overrides and max-duration splitting.
+Offline runs emit an RTFx summary calculated from per-chunk inference time. Use
+`--mode both` if you also want to see streaming start/end events in the same run.
+
+Datasets for benchmarking can be fetched with `swift run fluidaudio download --dataset vad`.

Documentation/VAD/Segmentation.md

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+# Configuration fields
+
+Configuration for turning raw VAD probabilities into stable speech segments.
+
+This struct applies rules for minimum durations, thresholds, and hysteresis to avoid jittery cuts and to produce clean, ASR-ready segments.
+
+```swift
+public struct VadSegmentationConfig: Sendable {
+    /// Minimum length of detected speech to keep as a segment (default: 0.15s).
+    /// Prevents clicks or coughs from being treated as speech.
+    public var minSpeechDuration: TimeInterval
+
+    /// Minimum silence required to end a segment (default: 0.75s).
+    /// Prevents early cut-offs when a speaker pauses briefly.
+    public var minSilenceDuration: TimeInterval
+
+    /// Maximum length of a single speech segment (default: 14s).
+    /// Segments longer than this will be forcibly split to match ASR model limits.
+    public var maxSpeechDuration: TimeInterval
+
+    /// Extra padding added before and after each detected speech segment (default: 0.1s).
+    /// Keeps context around words so they aren’t clipped.
+    public var speechPadding: TimeInterval
+
+    /// Probability threshold below which audio is treated as silence (default: 0.3).
+    /// Lower = stricter silence detection, higher = more tolerant.
+    public var silenceThresholdForSplit: Float
+
+    /// Explicit override for the *exit* hysteresis threshold (default: nil).
+    /// If not set, the system computes it automatically from the base threshold minus `negativeThresholdOffset`.
+    public var negativeThreshold: Float?
+
+    /// How far below the base threshold the *exit* threshold should be (default: 0.15).
+    /// Example: if entry = 0.5, exit = 0.35. Prevents rapid flipping on noisy inputs.
+    public var negativeThresholdOffset: Float
+
+    /// Minimum silence enforced when splitting a max-length segment (default: 0.098s).
+    /// Ensures forced splits don’t land mid-phoneme.
+    public var minSilenceAtMaxSpeech: TimeInterval
+
+    /// If true, try to split at the longest silence near the max duration cutoff.
+    /// Produces cleaner segment boundaries compared to a hard cut.
+    public var useMaxPossibleSilenceAtMaxSpeech: Bool
+
+    public static let `default` = VadSegmentationConfig()
+
+    public init(
+        minSpeechDuration: TimeInterval = 0.15,
+        minSilenceDuration: TimeInterval = 0.75,
+        maxSpeechDuration: TimeInterval = 14.0,
+        speechPadding: TimeInterval = 0.1,
+        silenceThresholdForSplit: Float = 0.3,
+        negativeThreshold: Float? = nil,
+        negativeThresholdOffset: Float = 0.15,
+        minSilenceAtMaxSpeech: TimeInterval = 0.098,
+        useMaxPossibleSilenceAtMaxSpeech: Bool = true
+    ) {
+        self.minSpeechDuration = minSpeechDuration
+        self.minSilenceDuration = minSilenceDuration
+        self.maxSpeechDuration = maxSpeechDuration
+        self.speechPadding = speechPadding
+        self.silenceThresholdForSplit = silenceThresholdForSplit
+        self.negativeThreshold = negativeThreshold
+        self.negativeThresholdOffset = negativeThresholdOffset
+        self.minSilenceAtMaxSpeech = minSilenceAtMaxSpeech
+        self.useMaxPossibleSilenceAtMaxSpeech = useMaxPossibleSilenceAtMaxSpeech
+    }
+
+    /// Computes the working negative threshold for hysteresis:
+    /// - If `negativeThreshold` is set, that value is used.
+    /// - Otherwise, it is computed as (baseThreshold – negativeThresholdOffset).
+    /// - This creates a "sticky zone" between thresholds:
+    ///   - Enter speech when prob > baseThreshold
+    ///   - Exit speech when prob < negativeThreshold
+    ///   - Stay in current state in between
+    public func effectiveNegativeThreshold(baseThreshold: Float) -> Float {
+        if let override = negativeThreshold {
+            return override
+        }
+        return max(baseThreshold - negativeThresholdOffset, 0.01)
+    }
+}
+```
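To make the "sticky zone" concrete, here is a small self-contained sketch that mirrors (rather than calls) the hysteresis rule documented above; the `State` enum and `step` function are illustrative only, not part of the library:

```swift
// Sketch of the hysteresis rule: enter speech above the base threshold,
// exit below the negative threshold, and hold the current state in between.
func negativeThreshold(base: Float, explicitOverride: Float? = nil, offset: Float = 0.15) -> Float {
    explicitOverride ?? max(base - offset, 0.01)
}

enum State { case silence, speech }

func step(_ state: State, prob: Float, base: Float, exit: Float) -> State {
    switch state {
    case .silence: return prob > base ? .speech : .silence
    case .speech: return prob < exit ? .silence : .speech
    }
}

let base: Float = 0.5
let exit = negativeThreshold(base: base)  // ~0.35

var state = State.silence
state = step(state, prob: 0.60, base: base, exit: exit)  // enters speech
state = step(state, prob: 0.40, base: base, exit: exit)  // holds: 0.40 is inside the sticky zone
state = step(state, prob: 0.30, base: base, exit: exit)  // exits: 0.30 < 0.35
```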
