feat(pipeline): YOLOv8 + ByteTrack video processing via supervision; CLI command; deps & mypy config

AI Assistant · AI Assistant · commit 3bd833de9a74 · 2025-08-18T15:54:45.000-04:00
diff --git a/README.md b/README.md
@@ -6,6 +6,7 @@ Edge-deployable computer vision application scaffold, optimized for Python 3.11+
 
 - Typer-powered CLI with `run` command
 - Simple CV pipeline with OpenCV (dummy inference + annotation)
+- Video processing pipeline with YOLOv8 detection + ByteTrack multi-object tracking (MOT) using `supervision`
 - Strict typing (mypy --strict), linting (ruff), tests (pytest)
 - Pre-commit hooks configured
 - Dockerfile optimized for CPU-based CV workloads
@@ -61,6 +62,9 @@ poetry run yardvision run --source 0
 
 # From video file with display window
 poetry run yardvision run --source ./sample.mp4 --display
+
+# Process a video file and save annotated output
+poetry run yardvision process-video ./input.mp4 ./output.mp4 --model yolov8m.pt
 ```
 
 ### Tests
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,8 @@ opencv-python-headless = "^4.9.0.80"
 Pillow = "^10.4.0"
 onnxruntime = "^1.18.0"
 tqdm = "^4.66.4"
+ultralytics = "^8.2.0"
+supervision = "^0.18.0"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.2.1"
@@ -88,6 +90,12 @@ ignore_missing_imports = true
 [tool.mypy.onnxruntime.*]
 ignore_missing_imports = true
 
+# mypy reads per-module settings in pyproject.toml only from
+# [[tool.mypy.overrides]]; "*" is also not allowed in a bare TOML table name.
+[[tool.mypy.overrides]]
+module = ["ultralytics.*", "supervision.*"]
+ignore_missing_imports = true
+
 [build-system]
 requires = ["poetry-core>=1.8.0"]
 build-backend = "poetry.core.masonry.api"
diff --git a/src/vision/pipeline/processor.py b/src/vision/pipeline/processor.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Iterator, Optional
+from typing import Iterator
 
 import cv2
 import numpy as np
@@ -94,3 +94,87 @@ def process_stream(self, source: str, display: bool = False) -> None:
                 except Exception:  # noqa: BLE001
                     logger.warning("Failed to destroy windows (likely headless environment)")
 
+
+class VideoProcessor:
+    """Batch video processor using YOLOv8 + ByteTrack via supervision.
+
+    Reads frames from an input video, detects and tracks a fixed set of
+    classes, draws boxes and labels, and writes the annotated video.
+    """
+
+    # Class ids kept after detection: person (0), car (2), truck (7).
+    _ALLOWED_CLASS_IDS = (0, 2, 7)
+
+    def __init__(self, model_path: str = "yolov8m.pt") -> None:
+        """Load the YOLO weights at ``model_path`` and build tracker and annotators."""
+        # Lazy imports to avoid importing heavy deps where not needed
+        from ultralytics import YOLO  # type: ignore[import-not-found]
+        import supervision as sv  # type: ignore[import-not-found]
+
+        self._sv = sv
+        self._model = YOLO(model_path)
+        # Replaced per video in process_video(); created here so the attribute exists.
+        self._tracker = sv.ByteTrack()
+        self._box_annotator = sv.BoundingBoxAnnotator()
+        self._label_annotator = sv.LabelAnnotator()
+        try:
+            self._class_names = self._model.model.names  # type: ignore[attr-defined]
+        except Exception:  # noqa: BLE001
+            # Fallback to standard COCO mapping indices used by YOLOv8
+            self._class_names = {
+                0: "person",
+                1: "bicycle",
+                2: "car",
+                3: "motorcycle",
+                5: "bus",
+                7: "truck",
+            }
+
+        logger.info("Initialized VideoProcessor with model: {}", model_path)
+
+    def _filter_detections(self, detections: object) -> object:
+        """Return only the detections whose class id is in ``_ALLOWED_CLASS_IDS``.
+
+        Raises:
+            TypeError: If ``detections`` is not a ``supervision.Detections``.
+        """
+        # Explicit check instead of ``assert`` so it survives ``python -O``.
+        if not isinstance(detections, self._sv.Detections):
+            raise TypeError(f"Expected supervision.Detections, got {type(detections).__name__}")
+        mask = np.isin(detections.class_id, self._ALLOWED_CLASS_IDS)
+        return detections[mask]
+
+    def process_video(self, input_path: str, output_path: str) -> None:
+        """Detect, track and annotate every frame of a video file.
+
+        Args:
+            input_path: Path of the video to read.
+            output_path: Path of the annotated video to write.
+        """
+        sv = self._sv
+        video_info = sv.VideoInfo.from_video_path(input_path)
+        # Fresh tracker per video: no tracks carried over from an earlier call,
+        # and the tracker is told the clip's frame rate (30 if metadata reports 0).
+        self._tracker = sv.ByteTrack(frame_rate=video_info.fps or 30)
+
+        # Use a broadly supported codec for MP4 writing in headless envs
+        with sv.VideoSink(output_path, video_info, codec="mp4v") as sink:
+            for frame in sv.get_video_frames_generator(input_path):
+                result = self._model(frame, verbose=False)[0]
+                detections = self._filter_detections(sv.Detections.from_ultralytics(result))
+                tracked = self._tracker.update_with_detections(detections)
+                # Missing per-detection arrays fall back to placeholder values.
+                count = len(tracked)
+                class_ids = tracked.class_id if tracked.class_id is not None else [-1] * count
+                scores = tracked.confidence if tracked.confidence is not None else [0.0] * count
+                track_ids = tracked.tracker_id if tracked.tracker_id is not None else [-1] * count
+                labels = [
+                    f"{self._class_names.get(int(cid), str(int(cid)))} #{int(tid)} {float(score):.2f}"
+                    for cid, score, tid in zip(class_ids, scores, track_ids)
+                ]
+                annotated = self._box_annotator.annotate(scene=frame.copy(), detections=tracked)
+                annotated = self._label_annotator.annotate(
+                    scene=annotated, detections=tracked, labels=labels
+                )
+                sink.write_frame(annotated)
+
diff --git a/src/yardvision/cli.py b/src/yardvision/cli.py
@@ -9,6 +9,7 @@
 
 from vision.core.config import AppConfig, load_config
 from vision.pipeline.processor import FrameProcessor
+from vision.pipeline.processor import VideoProcessor
 
 
 install_rich_traceback(show_locals=True)
@@ -50,6 +51,22 @@ def run(
         raise typer.Exit(code=1) from exc
 
 
+@app.command("process-video")
+def process_video(
+    input_path: Path = typer.Argument(..., exists=True, dir_okay=False, readable=True),
+    output_path: Path = typer.Argument(...),
+    model: str = typer.Option("yolov8m.pt", help="YOLOv8 model weights path or alias"),
+) -> None:
+    """Process an input video file and save annotated detections/tracks to output."""
+    try:
+        output_path.parent.mkdir(parents=True, exist_ok=True)  # sink does not create missing dirs
+        VideoProcessor(model_path=model).process_video(str(input_path), str(output_path))
+        console.print(f"[green]Saved:[/green] {output_path}")
+    except Exception as exc:  # noqa: BLE001
+        console.print(f"[red]Error:[/red] {exc}")
+        raise typer.Exit(code=1) from exc
+
+
 if __name__ == "__main__":
     app()