diff --git a/skills/detection/yolo-detection-2026/requirements_mps.txt b/skills/detection/yolo-detection-2026/requirements_mps.txt index a9e282fa..822288d1 100644 --- a/skills/detection/yolo-detection-2026/requirements_mps.txt +++ b/skills/detection/yolo-detection-2026/requirements_mps.txt @@ -1,10 +1,8 @@ # YOLO 2026 — MPS (Apple Silicon) requirements -# Standard PyTorch — MPS backend is included by default on macOS -torch>=2.4.0 -torchvision>=0.19.0 -ultralytics>=8.3.0 -coremltools>=8.0 +# Uses ONNX Runtime + CoreML EP for GPU/ANE acceleration. +# Pre-built yolo26n.onnx is shipped in the repo, so torch/ultralytics +# are NOT needed at runtime — only onnxruntime for inference. +onnxruntime>=1.19.0 numpy>=1.24.0,<2.0.0 opencv-python-headless>=4.8.0 Pillow>=10.0.0 - diff --git a/skills/detection/yolo-detection-2026/scripts/env_config.py b/skills/detection/yolo-detection-2026/scripts/env_config.py index 7c46c05b..b7d3c6f7 100644 --- a/skills/detection/yolo-detection-2026/scripts/env_config.py +++ b/skills/detection/yolo-detection-2026/scripts/env_config.py @@ -58,11 +58,12 @@ class BackendSpec: ), "mps": BackendSpec( name="mps", - export_format="coreml", - model_suffix=".mlpackage", - half=True, - extra_export_args={"nms": False}, - compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM + export_format="onnx", + model_suffix=".onnx", + half=False, # ONNX Runtime handles precision internally + # ONNX Runtime + CoreMLExecutionProvider bypasses the broken + # MPSGraphExecutable MLIR pipeline on macOS 26.x while still + # leveraging GPU/ANE via CoreML under the hood. ), "intel": BackendSpec( name="intel", @@ -78,6 +79,106 @@ class BackendSpec: ), } +# ─── ONNX + CoreML EP wrapper ──────────────────────────────────────────────── +# Provides an ultralytics-compatible model interface using onnxruntime directly +# with CoreMLExecutionProvider for ~6ms inference on Apple Silicon (vs 21ms when +# ultralytics defaults to CPUExecutionProvider). + +class _BoxResult: + """Minimal replacement for ultralytics Boxes result.""" + __slots__ = ('xyxy', 'conf', 'cls') + + def __init__(self, xyxy, conf, cls): + self.xyxy = xyxy # [[x1,y1,x2,y2]] + self.conf = conf # [conf] + self.cls = cls # [cls_id] + + +class _DetResult: + """Minimal replacement for ultralytics Results.""" + __slots__ = ('boxes',) + + def __init__(self, boxes: list): + self.boxes = boxes + + +class _OnnxCoreMLModel: + """ONNX Runtime model with CoreML EP, compatible with ultralytics API. + + Supports: model(image_path_or_pil, conf=0.5, verbose=False) + Returns: list of _DetResult with .boxes iterable of _BoxResult + """ + + def __init__(self, session, class_names: dict): + self.session = session + self.names = class_names + self._input_name = session.get_inputs()[0].name + # Expected input shape: [1, 3, H, W] + shape = session.get_inputs()[0].shape + self._input_h = shape[2] if isinstance(shape[2], int) else 640 + self._input_w = shape[3] if isinstance(shape[3], int) else 640 + + def __call__(self, source, conf: float = 0.25, verbose: bool = True, **kwargs): + """Run inference on an image path or PIL Image.""" + import numpy as np + from PIL import Image + + # Load image + if isinstance(source, str): + img = Image.open(source).convert("RGB") + elif isinstance(source, Image.Image): + img = source.convert("RGB") + else: + img = Image.fromarray(source).convert("RGB") + + orig_w, orig_h = img.size + + # Letterbox resize to input size + scale = min(self._input_w / orig_w, self._input_h / orig_h) + new_w, new_h = int(orig_w * scale), int(orig_h * scale) + img_resized = img.resize((new_w, new_h), Image.BILINEAR) + + # Pad to input size (center) + pad_x = (self._input_w - new_w) // 2 + pad_y = (self._input_h - new_h) // 2 + canvas = np.full((self._input_h, self._input_w, 3), 114, dtype=np.uint8) + canvas[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = np.array(img_resized) + + # HWC→CHW, normalize, add batch dim + blob = canvas.transpose(2, 0, 1).astype(np.float32) / 255.0 + blob = np.expand_dims(blob, 0) + + # Run inference + outputs = self.session.run(None, {self._input_name: blob}) + preds = outputs[0] # shape: [1, num_detections, 6] + + # Parse detections: [x1, y1, x2, y2, confidence, class_id] + boxes = [] + for det in preds[0]: + det_conf = float(det[4]) + if det_conf < conf: + continue + + # Scale coordinates back to original image space + x1 = (float(det[0]) - pad_x) / scale + y1 = (float(det[1]) - pad_y) / scale + x2 = (float(det[2]) - pad_x) / scale + y2 = (float(det[3]) - pad_y) / scale + + # Clip to image bounds + x1 = max(0, min(x1, orig_w)) + y1 = max(0, min(y1, orig_h)) + x2 = max(0, min(x2, orig_w)) + y2 = max(0, min(y2, orig_h)) + + boxes.append(_BoxResult( + xyxy=np.array([[x1, y1, x2, y2]]), + conf=np.array([det_conf]), + cls=np.array([int(det[5])]), + )) + + return [_DetResult(boxes)] + # ─── Hardware detection ────────────────────────────────────────────────────── @@ -133,31 +234,79 @@ def detect() -> "HardwareEnv": return env def _try_cuda(self) -> bool: - """Detect NVIDIA GPU via nvidia-smi and torch.""" - if not shutil.which("nvidia-smi"): - return False + """Detect NVIDIA GPU via nvidia-smi (with Windows path search) and WMI fallback.""" + nvidia_smi = shutil.which("nvidia-smi") + + # Windows: check well-known paths if not on PATH + if not nvidia_smi and platform.system() == "Windows": + for candidate in [ + Path(os.environ.get("PROGRAMFILES", r"C:\Program Files")) + / "NVIDIA Corporation" / "NVSMI" / "nvidia-smi.exe", + Path(os.environ.get("WINDIR", r"C:\Windows")) + / "System32" / "nvidia-smi.exe", + ]: + if candidate.is_file(): + nvidia_smi = str(candidate) + _log(f"Found nvidia-smi at {nvidia_smi}") + break + + if nvidia_smi: + try: + result = subprocess.run( + [nvidia_smi, "--query-gpu=name,memory.total,driver_version", + "--format=csv,noheader,nounits"], + capture_output=True, text=True, timeout=10, + ) + if result.returncode == 0: + line = result.stdout.strip().split("\n")[0] + parts = [p.strip() for p in line.split(",")] + if len(parts) >= 3: + self.backend = "cuda" + self.device = "cuda" + self.gpu_name = parts[0] + self.gpu_memory_mb = int(float(parts[1])) + self.driver_version = parts[2] + self.detection_details["nvidia_smi"] = line + _log(f"NVIDIA GPU: {self.gpu_name} ({self.gpu_memory_mb}MB, driver {self.driver_version})") + return True + except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e: + _log(f"nvidia-smi probe failed: {e}") + + # Windows WMI fallback: detect NVIDIA GPU even without nvidia-smi on PATH + if platform.system() == "Windows": + return self._try_cuda_wmi() + + return False + + def _try_cuda_wmi(self) -> bool: + """Windows-only: detect NVIDIA GPU via WMI (wmic).""" try: result = subprocess.run( - ["nvidia-smi", "--query-gpu=name,memory.total,driver_version", - "--format=csv,noheader,nounits"], + ["wmic", "path", "win32_VideoController", "get", + "Name,AdapterRAM,DriverVersion", "/format:csv"], capture_output=True, text=True, timeout=10, ) if result.returncode != 0: return False - line = result.stdout.strip().split("\n")[0] - parts = [p.strip() for p in line.split(",")] - if len(parts) >= 3: - self.backend = "cuda" - self.device = "cuda" - self.gpu_name = parts[0] - self.gpu_memory_mb = int(float(parts[1])) - self.driver_version = parts[2] - self.detection_details["nvidia_smi"] = line - _log(f"NVIDIA GPU: {self.gpu_name} ({self.gpu_memory_mb}MB, driver {self.driver_version})") - return True + for line in result.stdout.strip().split("\n"): + if "NVIDIA" in line.upper(): + parts = [p.strip() for p in line.split(",")] + # CSV format: Node,AdapterRAM,DriverVersion,Name + if len(parts) >= 4: + self.backend = "cuda" + self.device = "cuda" + self.gpu_name = parts[3] + try: + self.gpu_memory_mb = int(int(parts[1]) / (1024 * 1024)) + except (ValueError, IndexError): + pass + self.driver_version = parts[2] if len(parts) > 2 else "" + self.detection_details["wmi"] = line + _log(f"NVIDIA GPU (WMI): {self.gpu_name} ({self.gpu_memory_mb}MB)") + return True except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e: - _log(f"nvidia-smi probe failed: {e}") + _log(f"WMI probe failed: {e}") return False def _try_rocm(self) -> bool: @@ -363,12 +512,28 @@ def _check_rocm_runtime(self): _log("Fix: pip uninstall onnxruntime && pip install onnxruntime-rocm") raise ImportError("ROCmExecutionProvider not available") + def _check_mps_runtime(self): + """Verify onnxruntime has CoreML provider for Apple GPU/ANE acceleration. + + ONNX Runtime + CoreMLExecutionProvider bypasses the broken + MPSGraphExecutable MLIR pipeline (macOS 26.x) while still routing + inference through CoreML to leverage GPU and Neural Engine. + """ + import onnxruntime + providers = onnxruntime.get_available_providers() + if "CoreMLExecutionProvider" in providers: + _log(f"onnxruntime CoreML provider available: {providers}") + return True + _log(f"onnxruntime providers: {providers} — CoreMLExecutionProvider not found") + _log("Fix: pip install onnxruntime (arm64 macOS wheel includes CoreML EP)") + raise ImportError("CoreMLExecutionProvider not available") + def _check_framework(self) -> bool: - """Check if the optimized inference runtime is importable.""" + """Check if the optimized inference runtime is importable and compatible.""" checks = { "cuda": lambda: __import__("tensorrt"), "rocm": lambda: self._check_rocm_runtime(), - "mps": lambda: __import__("coremltools"), + "mps": lambda: self._check_mps_runtime(), "intel": lambda: __import__("openvino"), "cpu": lambda: __import__("onnxruntime"), } @@ -496,6 +661,27 @@ def __init__(self, *args, **kwargs): _log("coremltools not available, loading without compute_units") return YOLO(model_path) + def _load_onnx_coreml(self, onnx_path: str): + """Load ONNX model with CoreMLExecutionProvider for fast GPU/ANE inference. + + Returns an OnnxCoreMLModel wrapper that is compatible with the + ultralytics model(frame_path, conf=...) call pattern. + """ + import onnxruntime as ort + + providers = ['CoreMLExecutionProvider', 'CPUExecutionProvider'] + session = ort.InferenceSession(onnx_path, providers=providers) + active = session.get_providers() + _log(f"ONNX+CoreML session: {active}") + + # Get YOLO class names from the .pt model (needed for detection output) + from ultralytics import YOLO + pt_path = onnx_path.replace('.onnx', '.pt') + pt_model = YOLO(pt_path) + class_names = pt_model.names # {0: 'person', 1: 'bicycle', ...} + + return _OnnxCoreMLModel(session, class_names) + def load_optimized(self, model_name: str, use_optimized: bool = True): """ Load the best available model for this hardware. @@ -512,10 +698,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): optimized_path = self.get_optimized_path(model_name) if optimized_path.exists(): try: - # On Apple Silicon: route CoreML to Neural Engine - if self.backend == "mps" and self.compute_units != "all": - model = self._load_coreml_with_compute_units( - str(optimized_path)) + # MPS: use ONNX Runtime + CoreML EP for fast inference + if self.backend == "mps": + model = self._load_onnx_coreml(str(optimized_path)) else: model = YOLO(str(optimized_path)) self.load_ms = (time.perf_counter() - t0) * 1000 @@ -529,10 +714,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): exported = self.export_model(pt_model, model_name) if exported: try: - # On Apple Silicon: route CoreML to Neural Engine - if self.backend == "mps" and self.compute_units != "all": - model = self._load_coreml_with_compute_units( - str(exported)) + # MPS: use ONNX Runtime + CoreML EP for fast inference + if self.backend == "mps": + model = self._load_onnx_coreml(str(exported)) else: model = YOLO(str(exported)) self.load_ms = (time.perf_counter() - t0) * 1000 diff --git a/skills/detection/yolo-detection-2026/yolo26n.onnx b/skills/detection/yolo-detection-2026/yolo26n.onnx new file mode 100644 index 00000000..378a00e5 Binary files /dev/null and b/skills/detection/yolo-detection-2026/yolo26n.onnx differ diff --git a/skills/detection/yolo-detection-2026/yolo26n_names.json b/skills/detection/yolo-detection-2026/yolo26n_names.json new file mode 100644 index 00000000..67db67b1 --- /dev/null +++ b/skills/detection/yolo-detection-2026/yolo26n_names.json @@ -0,0 +1,82 @@ +{ + "0": "person", + "1": "bicycle", + "2": "car", + "3": "motorcycle", + "4": "airplane", + "5": "bus", + "6": "train", + "7": "truck", + "8": "boat", + "9": "traffic light", + "10": "fire hydrant", + "11": "stop sign", + "12": "parking meter", + "13": "bench", + "14": "bird", + "15": "cat", + "16": "dog", + "17": "horse", + "18": "sheep", + "19": "cow", + "20": "elephant", + "21": "bear", + "22": "zebra", + "23": "giraffe", + "24": "backpack", + "25": "umbrella", + "26": "handbag", + "27": "tie", + "28": "suitcase", + "29": "frisbee", + "30": "skis", + "31": "snowboard", + "32": "sports ball", + "33": "kite", + "34": "baseball bat", + "35": "baseball glove", + "36": "skateboard", + "37": "surfboard", + "38": "tennis racket", + "39": "bottle", + "40": "wine glass", + "41": "cup", + "42": "fork", + "43": "knife", + "44": "spoon", + "45": "bowl", + "46": "banana", + "47": "apple", + "48": "sandwich", + "49": "orange", + "50": "broccoli", + "51": "carrot", + "52": "hot dog", + "53": "pizza", + "54": "donut", + "55": "cake", + "56": "chair", + "57": "couch", + "58": "potted plant", + "59": "bed", + "60": "dining table", + "61": "toilet", + "62": "tv", + "63": "laptop", + "64": "mouse", + "65": "remote", + "66": "keyboard", + "67": "cell phone", + "68": "microwave", + "69": "oven", + "70": "toaster", + "71": "sink", + "72": "refrigerator", + "73": "book", + "74": "clock", + "75": "vase", + "76": "scissors", + "77": "teddy bear", + "78": "hair drier", + "79": "toothbrush" +} \ No newline at end of file diff --git a/skills/lib/env_config.py b/skills/lib/env_config.py index 1669f03c..dde50304 100644 --- a/skills/lib/env_config.py +++ b/skills/lib/env_config.py @@ -58,11 +58,12 @@ class BackendSpec: ), "mps": BackendSpec( name="mps", - export_format="coreml", - model_suffix=".mlpackage", - half=True, - extra_export_args={"nms": False}, - compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM + export_format="onnx", + model_suffix=".onnx", + half=False, # ONNX Runtime handles precision internally + # ONNX Runtime + CoreMLExecutionProvider bypasses the broken + # MPSGraphExecutable MLIR pipeline on macOS 26.x while still + # leveraging GPU/ANE via CoreML under the hood. ), "intel": BackendSpec( name="intel", @@ -78,6 +79,106 @@ class BackendSpec: ), } +# ─── ONNX + CoreML EP wrapper ──────────────────────────────────────────────── +# Provides an ultralytics-compatible model interface using onnxruntime directly +# with CoreMLExecutionProvider for ~6ms inference on Apple Silicon (vs 21ms when +# ultralytics defaults to CPUExecutionProvider). + +class _BoxResult: + """Minimal replacement for ultralytics Boxes result.""" + __slots__ = ('xyxy', 'conf', 'cls') + + def __init__(self, xyxy, conf, cls): + self.xyxy = xyxy # [[x1,y1,x2,y2]] + self.conf = conf # [conf] + self.cls = cls # [cls_id] + + +class _DetResult: + """Minimal replacement for ultralytics Results.""" + __slots__ = ('boxes',) + + def __init__(self, boxes: list): + self.boxes = boxes + + +class _OnnxCoreMLModel: + """ONNX Runtime model with CoreML EP, compatible with ultralytics API. + + Supports: model(image_path_or_pil, conf=0.5, verbose=False) + Returns: list of _DetResult with .boxes iterable of _BoxResult + """ + + def __init__(self, session, class_names: dict): + self.session = session + self.names = class_names + self._input_name = session.get_inputs()[0].name + # Expected input shape: [1, 3, H, W] + shape = session.get_inputs()[0].shape + self._input_h = shape[2] if isinstance(shape[2], int) else 640 + self._input_w = shape[3] if isinstance(shape[3], int) else 640 + + def __call__(self, source, conf: float = 0.25, verbose: bool = True, **kwargs): + """Run inference on an image path or PIL Image.""" + import numpy as np + from PIL import Image + + # Load image + if isinstance(source, str): + img = Image.open(source).convert("RGB") + elif isinstance(source, Image.Image): + img = source.convert("RGB") + else: + img = Image.fromarray(source).convert("RGB") + + orig_w, orig_h = img.size + + # Letterbox resize to input size + scale = min(self._input_w / orig_w, self._input_h / orig_h) + new_w, new_h = int(orig_w * scale), int(orig_h * scale) + img_resized = img.resize((new_w, new_h), Image.BILINEAR) + + # Pad to input size (center) + pad_x = (self._input_w - new_w) // 2 + pad_y = (self._input_h - new_h) // 2 + canvas = np.full((self._input_h, self._input_w, 3), 114, dtype=np.uint8) + canvas[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = np.array(img_resized) + + # HWC→CHW, normalize, add batch dim + blob = canvas.transpose(2, 0, 1).astype(np.float32) / 255.0 + blob = np.expand_dims(blob, 0) + + # Run inference + outputs = self.session.run(None, {self._input_name: blob}) + preds = outputs[0] # shape: [1, num_detections, 6] + + # Parse detections: [x1, y1, x2, y2, confidence, class_id] + boxes = [] + for det in preds[0]: + det_conf = float(det[4]) + if det_conf < conf: + continue + + # Scale coordinates back to original image space + x1 = (float(det[0]) - pad_x) / scale + y1 = (float(det[1]) - pad_y) / scale + x2 = (float(det[2]) - pad_x) / scale + y2 = (float(det[3]) - pad_y) / scale + + # Clip to image bounds + x1 = max(0, min(x1, orig_w)) + y1 = max(0, min(y1, orig_h)) + x2 = max(0, min(x2, orig_w)) + y2 = max(0, min(y2, orig_h)) + + boxes.append(_BoxResult( + xyxy=np.array([[x1, y1, x2, y2]]), + conf=np.array([det_conf]), + cls=np.array([int(det[5])]), + )) + + return [_DetResult(boxes)] + # ─── Hardware detection ────────────────────────────────────────────────────── @@ -411,12 +512,28 @@ def _check_rocm_runtime(self): _log("Fix: pip uninstall onnxruntime && pip install onnxruntime-rocm") raise ImportError("ROCmExecutionProvider not available") + def _check_mps_runtime(self): + """Verify onnxruntime has CoreML provider for Apple GPU/ANE acceleration. + + ONNX Runtime + CoreMLExecutionProvider bypasses the broken + MPSGraphExecutable MLIR pipeline (macOS 26.x) while still routing + inference through CoreML to leverage GPU and Neural Engine. + """ + import onnxruntime + providers = onnxruntime.get_available_providers() + if "CoreMLExecutionProvider" in providers: + _log(f"onnxruntime CoreML provider available: {providers}") + return True + _log(f"onnxruntime providers: {providers} — CoreMLExecutionProvider not found") + _log("Fix: pip install onnxruntime (arm64 macOS wheel includes CoreML EP)") + raise ImportError("CoreMLExecutionProvider not available") + def _check_framework(self) -> bool: - """Check if the optimized inference runtime is importable.""" + """Check if the optimized inference runtime is importable and compatible.""" checks = { "cuda": lambda: __import__("tensorrt"), "rocm": lambda: self._check_rocm_runtime(), - "mps": lambda: __import__("coremltools"), + "mps": lambda: self._check_mps_runtime(), "intel": lambda: __import__("openvino"), "cpu": lambda: __import__("onnxruntime"), } @@ -544,6 +661,43 @@ def __init__(self, *args, **kwargs): _log("coremltools not available, loading without compute_units") return YOLO(model_path) + def _load_onnx_coreml(self, onnx_path: str): + """Load ONNX model with CoreMLExecutionProvider for fast GPU/ANE inference. + + Returns an OnnxCoreMLModel wrapper that is compatible with the + ultralytics model(frame_path, conf=...) call pattern. + """ + import onnxruntime as ort + + providers = ['CoreMLExecutionProvider', 'CPUExecutionProvider'] + session = ort.InferenceSession(onnx_path, providers=providers) + active = session.get_providers() + _log(f"ONNX+CoreML session: {active}") + + # Load class names from companion JSON (avoids torch/ultralytics dep) + import json + names_path = onnx_path.replace('.onnx', '_names.json') + try: + with open(names_path) as f: + raw = json.load(f) + # JSON keys are strings; convert to int-keyed dict + class_names = {int(k): v for k, v in raw.items()} + _log(f"Loaded {len(class_names)} class names from {Path(names_path).name}") + except FileNotFoundError: + # Fallback: try loading from .pt if JSON doesn't exist + try: + from ultralytics import YOLO + pt_path = onnx_path.replace('.onnx', '.pt') + pt_model = YOLO(pt_path) + class_names = pt_model.names + _log(f"Loaded class names from {Path(pt_path).name} (fallback)") + except Exception: + # Last resort: use COCO 80-class defaults + _log("WARNING: No class names found, using generic labels") + class_names = {i: f"class_{i}" for i in range(80)} + + return _OnnxCoreMLModel(session, class_names) + def load_optimized(self, model_name: str, use_optimized: bool = True): """ Load the best available model for this hardware. @@ -560,10 +714,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): optimized_path = self.get_optimized_path(model_name) if optimized_path.exists(): try: - # On Apple Silicon: route CoreML to Neural Engine - if self.backend == "mps" and self.compute_units != "all": - model = self._load_coreml_with_compute_units( - str(optimized_path)) + # MPS: use ONNX Runtime + CoreML EP for fast inference + if self.backend == "mps": + model = self._load_onnx_coreml(str(optimized_path)) else: model = YOLO(str(optimized_path)) self.load_ms = (time.perf_counter() - t0) * 1000 @@ -577,10 +730,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True): exported = self.export_model(pt_model, model_name) if exported: try: - # On Apple Silicon: route CoreML to Neural Engine - if self.backend == "mps" and self.compute_units != "all": - model = self._load_coreml_with_compute_units( - str(exported)) + # MPS: use ONNX Runtime + CoreML EP for fast inference + if self.backend == "mps": + model = self._load_onnx_coreml(str(exported)) else: model = YOLO(str(exported)) self.load_ms = (time.perf_counter() - t0) * 1000