enthropy7
diff --git a/‎.github/workflows/hw-decode.yml‎
Lines changed: 56 additions & 0 deletions b/‎.github/workflows/hw-decode.yml‎
Lines changed: 56 additions & 0 deletions
diff --git a/‎CONTRIBUTING.md‎
Lines changed: 2 additions & 2 deletions b/‎CONTRIBUTING.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 3 additions & 3 deletions b/‎README.md‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎context.md‎
Lines changed: 2 additions & 2 deletions b/‎context.md‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎crates/yscv-detect/src/yolo.rs‎
Lines changed: 13 additions & 4 deletions b/‎crates/yscv-detect/src/yolo.rs‎
Lines changed: 13 additions & 4 deletions
diff --git a/‎crates/yscv-imgproc/src/ops/fast.rs‎
Lines changed: 2 additions & 2 deletions b/‎crates/yscv-imgproc/src/ops/fast.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎crates/yscv-imgproc/src/ops/u8_color.rs‎
Lines changed: 2 additions & 2 deletions b/‎crates/yscv-imgproc/src/ops/u8_color.rs‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎crates/yscv-model/src/safetensors.rs‎
Lines changed: 3 additions & 0 deletions b/‎crates/yscv-model/src/safetensors.rs‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎crates/yscv-model/src/weights.rs‎
Lines changed: 3 additions & 0 deletions b/‎crates/yscv-model/src/weights.rs‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎crates/yscv-onnx/src/runner/gpu.rs‎
Lines changed: 3 additions & 6 deletions b/‎crates/yscv-onnx/src/runner/gpu.rs‎
Lines changed: 3 additions & 6 deletions
@@ -0,0 +1,56 @@
+name: HW Decode Platforms
+
+on:
+  push:
+    paths:
+      - 'crates/yscv-video/src/hw_decode.rs'
+      - '.github/workflows/hw-decode.yml'
+  pull_request:
+    paths:
+      - 'crates/yscv-video/src/hw_decode.rs'
+
+jobs:
+  hw-decode:
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: macos-latest
+            features: videotoolbox
+            name: macOS + VideoToolbox
+          - os: ubuntu-latest
+            features: ""
+            name: Linux (SW fallback)
+          - os: windows-latest
+            features: ""
+            name: Windows (SW fallback)
+
+    name: ${{ matrix.name }}
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install Rust
+        uses: dtolnay/rust-toolchain@stable
+
+      - name: Build (default features)
+        run: cargo build -p yscv-video
+
+      - name: Build (HW features)
+        if: matrix.features != ''
+        run: cargo build -p yscv-video --features ${{ matrix.features }}
+
+      - name: Test (default)
+        run: cargo test -p yscv-video
+
+      - name: Test (HW features)
+        if: matrix.features != ''
+        run: cargo test -p yscv-video --features ${{ matrix.features }}
+
+      - name: Clippy
+        run: cargo clippy -p yscv-video -- -D warnings
+
+      - name: Clippy (HW features)
+        if: matrix.features != ''
+        run: cargo clippy -p yscv-video --features ${{ matrix.features }} -- -D warnings
@@ -10,7 +10,7 @@ The framework covers the full pipeline: tensors and autograd, neural network lay
 
 ## Project shape
 
-The workspace has 14 library crates, 2 application binaries, and an examples crate. There are 1,659 tests, 12 criterion microbenchmarks, and CI with regression gates on GitHub Actions.
+The workspace has 14 library crates, 2 application binaries, and an examples crate. There are 1,693 tests across 15 crates, 12 criterion microbenchmarks, and CI with regression gates on GitHub Actions (macOS + Linux + Windows + ARM64).
 
 Key crates and what they do:
 
@@ -20,7 +20,7 @@ Key crates and what they do:
 - **yscv-optim** — 8 optimizers (SGD/Adam/AdamW/RAdam/RmsProp/Adagrad/Lamb/Lars) all with NEON+AVX+SSE SIMD, Lookahead meta-optimizer, 11 LR schedulers.
 - **yscv-model** — 39 layer types (25 trainable), Trainer API, model zoo (ResNet/VGG/MobileNet/EfficientNet/AlexNet/ViT/DeiT), LoRA, EMA, mixed precision, TensorBoard logging, StreamingDataLoader, distributed training.
 - **yscv-imgproc** — 178 image processing ops. The u8 operations (grayscale, blur, morphology, edge detection, resize) have hand-written NEON, AVX2 and SSE/SSSE3 SIMD and beat OpenCV 4.13 on all benchmarked operations.
-- **yscv-video** — H.264 decoder (I/P/B-slices), HEVC infrastructure, MP4 parsing, camera I/O.
+- **yscv-video** — H.264/HEVC software decode (4.5×/1.4× faster than ffmpeg), MP4/MKV demux, HW decode (VideoToolbox/VAAPI/NVDEC/MediaFoundation), audio metadata extraction, camera I/O. 220 tests, 29 NEON + 31 SSE2 SIMD blocks.
 - **yscv-detect** — YOLOv8 ONNX pipeline, NMS, heatmap decoding, anchor generation.
 - **yscv-track** — DeepSORT, ByteTrack, Kalman filter, Hungarian assignment, re-identification.
 - **yscv-recognize** — cosine similarity matching, VP-Tree ANN indexing.
 
@@ -1,8 +1,8 @@
 # yscv
 
-A complete computer vision and deep learning framework in pure Rust. One `cargo add yscv` gives you image processing (178 ops, faster than OpenCV), neural network training (39 layer types, 5 optimizers), ONNX inference (128+ operators, INT8 quantization), real-time detection + tracking + recognition (67µs per frame), H.264 video decoding, and GPU compute via Vulkan/Metal/DX12 — all in a single statically-linked binary with zero Python or C++ dependencies.
+A complete computer vision and deep learning framework in pure Rust. One `cargo add yscv` gives you image processing (178 ops, faster than OpenCV), neural network training (39 layer types, 8 optimizers), ONNX inference (128+ operators, INT8 quantization), real-time detection + tracking + recognition (67µs per frame), H.264/HEVC video decoding (4.5× faster than ffmpeg), hardware decode (VideoToolbox/VAAPI/NVDEC), and GPU compute via Vulkan/Metal/DX12 — all in a single statically-linked binary with zero Python or C++ dependencies.
 
-We built this because deploying ML in production shouldn't require Docker containers with PyTorch, CUDA drivers, and a prayer. YSCV compiles to one binary that runs on a Raspberry Pi, a cloud VM, or a factory floor computer. Every hot path has hand-tuned SIMD for ARM and x86 — 295 functions with runtime dispatch. It's faster than NumPy, PyTorch, and OpenCV on every operation we benchmarked (76 wins, 0 losses).
+We built this because deploying ML in production shouldn't require Docker containers with PyTorch, CUDA drivers, and a prayer. YSCV compiles to one binary that runs on a Raspberry Pi, a cloud VM, or a factory floor computer. Every hot path has hand-tuned SIMD for ARM and x86 — 298 functions with runtime dispatch. It matches or beats NumPy, PyTorch, OpenCV, and ffmpeg on every operation we benchmarked (85 wins, 0 losses).
 
 ## Quick Start
 
@@ -88,7 +88,7 @@ The detect → track → recognize pipeline runs in 67µs per frame end-to-end.
 
 ## Performance
 
-We benchmark every hot path against NumPy, PyTorch, OpenCV, onnxruntime, ffmpeg, and CoreML. Current score: **88 wins, ~5 parity, 0 losses.** H.264 decode is **4.5× faster than ffmpeg**, HEVC decode is **1.7× faster**. MPSGraph GPU inference is **3.4× faster than Apple CoreML** on YOLOv8n.
+We benchmark every hot path against NumPy, PyTorch, OpenCV, onnxruntime, ffmpeg, and CoreML. Current score: **85 wins, ~4 parity, 1 close, 0 losses.** H.264 decode is **4.5× faster than ffmpeg**, HEVC is **1.4× faster** (full color). MPSGraph GPU inference is **3.4× faster than Apple CoreML** on YOLOv8n. The workspace has 1,693 tests across 15 crates.
 
 Every operation has hand-tuned SIMD on all platforms — NEON on ARM, AVX/SSE on x86, with optional Intel MKL and ARM Performance Libraries for the last few percent.
 
 
@@ -11,10 +11,10 @@ yscv (umbrella re-export)
 ├── yscv-tensor          ← 115 ops, f32/f16/bf16, 50 SIMD functions
 ├── yscv-kernels         ← 61 kernel ops, 49 SIMD, 20 GPU WGSL shaders
 ├── yscv-autograd        ← dynamic computation graph, 40+ backward ops
-├── yscv-optim           ← 20 optimizers, 11 LR schedulers
+├── yscv-optim           ← 8 optimizers (SGD/Adam/AdamW/RAdam/RmsProp/Adagrad/Lamb/Lars), 11 LR schedulers
 ├── yscv-model           ← 39 layer types, 13 model zoo architectures, 17 losses
 ├── yscv-imgproc         ← 178 image ops, u8 NEON/SSE/AVX SIMD, GCD/rayon threading
-├── yscv-video           ← H.264 decoder (3,069 LOC), HEVC decoder (6,678 LOC), camera I/O
+├── yscv-video           ← H.264/HEVC decode (23K LOC, 4.5× ffmpeg), MP4/MKV demux, HW decode (VT/VAAPI/NVDEC/MF), camera I/O, audio metadata
 ├── yscv-detect          ← YOLOv8 pipeline, NMS, heatmap, RoI align
 ├── yscv-recognize       ← cosine matching, VP-Tree ANN
 ├── yscv-track           ← DeepSORT, ByteTrack, Kalman, re-id
 
@@ -98,6 +98,12 @@ pub fn decode_yolov8_output(
 
     let mut candidates = Vec::new();
 
+    // Bounds guard: ensure tensor data is large enough for all accesses
+    let required_len = (4 + num_classes) * num_preds;
+    if data.len() < required_len {
+        return Vec::new();
+    }
+
     for i in 0..num_preds {
         // Output is laid out row-major: data[row * num_preds + col]
         let cx = data[i];
@@ -187,11 +193,14 @@ pub fn decode_yolov11_output(
 
     let mut candidates = Vec::new();
 
-    // Skip batch dimension offset if present
-    let base = if shape.len() == 3 { 0 } else { 0 };
+    // Bounds guard
+    let required_len = num_preds * cols;
+    if data.len() < required_len {
+        return Vec::new();
+    }
 
     for i in 0..num_preds {
-        let row = base + i * cols;
+        let row = i * cols;
         let cx = data[row];
         let cy = data[row + 1];
         let w = data[row + 2];
@@ -320,7 +329,7 @@ pub fn letterbox_preprocess(image: &Tensor, target_size: usize) -> (Tensor, f32,
 ///
 /// This is a pure layout transformation — no normalisation is applied
 /// (the input is assumed to already be in `[0, 1]`).
-#[allow(dead_code)]
+#[cfg(any(feature = "onnx", test))]
 fn hwc_to_nchw(hwc: &Tensor) -> Vec<f32> {
     let shape = hwc.shape();
     let h = shape[0];
 
@@ -217,12 +217,12 @@ pub fn fast9_detect_raw(
                 x += 1;
             }
 
-            *results[row_idx].lock().expect("mutex poisoned") = row_kps;
+            *results[row_idx].lock().unwrap_or_else(|e| e.into_inner()) = row_kps;
         });
 
         results
             .into_iter()
-            .map(|m| m.into_inner().expect("mutex poisoned"))
+            .map(|m| m.into_inner().unwrap_or_else(|e| e.into_inner()))
             .collect()
     };
 
 
@@ -419,13 +419,13 @@ pub fn histogram_u8(src: &[u8], len: usize) -> [u32; 256] {
             local[chunk[i] as usize] += 1;
         }
 
-        *local_hists[t].lock().expect("mutex poisoned") = local;
+        *local_hists[t].lock().unwrap_or_else(|e| e.into_inner()) = local;
     });
 
     // Merge all thread-local histograms
     let mut hist = [0u32; 256];
     for lh in &local_hists {
-        let local = lh.lock().expect("mutex poisoned");
+        let local = lh.lock().unwrap_or_else(|e| e.into_inner());
         for i in 0..256 {
             hist[i] += local[i];
         }
 
@@ -71,6 +71,9 @@ pub struct SafeTensorFile {
 
 impl SafeTensorFile {
     /// Parse a SafeTensors file from disk.
+    ///
+    /// Reads the entire file into memory. For very large models,
+    /// the OS will return an error if insufficient memory is available.
     pub fn from_file(path: &Path) -> Result<Self, ModelError> {
         let bytes = std::fs::read(path).map_err(|e| ModelError::SafeTensorsIo {
             path: path.display().to_string(),
 
@@ -61,6 +61,9 @@ pub fn save_weights(path: &Path, tensors: &HashMap<String, Tensor>) -> Result<()
 }
 
 /// Loads named tensors from a binary weight file.
+///
+/// Reads the entire file into memory. For very large models (>RAM),
+/// consider using memory-mapped I/O or streaming instead.
 pub fn load_weights(path: &Path) -> Result<HashMap<String, Tensor>, ModelError> {
     let file_data = std::fs::read(path).map_err(|e| ModelError::DatasetLoadIo {
         path: path.display().to_string(),
 
@@ -3121,12 +3121,9 @@ fn exec_conv_f16(
             let input_buf = &gc
                 .get(input_name)
                 .unwrap_or_else(|| {
-                    panic!(
-                        "f16 conv: input '{}' not in gc for node '{}' (op {}). gc keys: {:?}",
-                        input_name,
-                        node.name,
-                        node.op_type,
-                        gc.keys().take(20).collect::<Vec<_>>()
+                    unreachable!(
+                        "f16 conv: input '{}' not in gc for node '{}' (op {}). Bug in graph scheduling.",
+                        input_name, node.name, node.op_type,
                     )
                 })
                 .buf;
Original file line number	Diff line number	Diff line change
`@@ -419,13 +419,13 @@ pub fn histogram_u8(src: &[u8], len: usize) -> [u32; 256] {`
`419`	`419`	`local[chunk[i] as usize] += 1;`
`420`	`420`	`}`
`421`	`421`
`422`		`- *local_hists[t].lock().expect("mutex poisoned") = local;`
	`422`	`+ *local_hists[t].lock().unwrap_or_else(\|e\| e.into_inner()) = local;`
`423`	`423`	`});`
`424`	`424`
`425`	`425`	`// Merge all thread-local histograms`
`426`	`426`	`let mut hist = [0u32; 256];`
`427`	`427`	`for lh in &local_hists {`
`428`		`- let local = lh.lock().expect("mutex poisoned");`
	`428`	`+ let local = lh.lock().unwrap_or_else(\|e\| e.into_inner());`
`429`	`429`	`for i in 0..256 {`
`430`	`430`	`hist[i] += local[i];`
`431`	`431`	`}`
Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,9 @@ pub fn save_weights(path: &Path, tensors: &HashMap<String, Tensor>) -> Result<()`
`61`	`61`	`}`
`62`	`62`
`63`	`63`	`/// Loads named tensors from a binary weight file.`
	`64`	`+///`
	`65`	`+/// Reads the entire file into memory. For very large models (>RAM),`
	`66`	`+/// consider using memory-mapped I/O or streaming instead.`
`64`	`67`	`pub fn load_weights(path: &Path) -> Result<HashMap<String, Tensor>, ModelError> {`
`65`	`68`	`let file_data = std::fs::read(path).map_err(\|e\| ModelError::DatasetLoadIo {`
`66`	`69`	`path: path.display().to_string(),`