Skip to content

Commit 59cba25

Browse files
authored
Merge pull request #165 from SharpAI/feature/onnx-coreml-inference
Feature/onnx coreml inference
2 parents a68cd16 + 136ca11 commit 59cba25

File tree

5 files changed

+469
-53
lines changed

5 files changed

+469
-53
lines changed
Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
11
# YOLO 2026 — MPS (Apple Silicon) requirements
2-
# Standard PyTorch — MPS backend is included by default on macOS
3-
torch>=2.4.0
4-
torchvision>=0.19.0
5-
ultralytics>=8.3.0
6-
coremltools>=8.0
2+
# Uses ONNX Runtime + CoreML EP for GPU/ANE acceleration.
3+
# Pre-built yolo26n.onnx is shipped in the repo, so torch/ultralytics
4+
# are NOT needed at runtime — only onnxruntime for inference.
5+
onnxruntime>=1.19.0
76
numpy>=1.24.0,<2.0.0
87
opencv-python-headless>=4.8.0
98
Pillow>=10.0.0
10-

skills/detection/yolo-detection-2026/scripts/env_config.py

Lines changed: 216 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,12 @@ class BackendSpec:
5858
),
5959
"mps": BackendSpec(
6060
name="mps",
61-
export_format="coreml",
62-
model_suffix=".mlpackage",
63-
half=True,
64-
extra_export_args={"nms": False},
65-
compute_units="cpu_and_ne", # Route to Neural Engine, leave GPU free for LLM/VLM
61+
export_format="onnx",
62+
model_suffix=".onnx",
63+
half=False, # ONNX Runtime handles precision internally
64+
# ONNX Runtime + CoreMLExecutionProvider bypasses the broken
65+
# MPSGraphExecutable MLIR pipeline on macOS 26.x while still
66+
# leveraging GPU/ANE via CoreML under the hood.
6667
),
6768
"intel": BackendSpec(
6869
name="intel",
@@ -78,6 +79,106 @@ class BackendSpec:
7879
),
7980
}
8081

82+
# ─── ONNX + CoreML EP wrapper ────────────────────────────────────────────────
83+
# Provides an ultralytics-compatible model interface using onnxruntime directly
84+
# with CoreMLExecutionProvider for ~6ms inference on Apple Silicon (vs 21ms when
85+
# ultralytics defaults to CPUExecutionProvider).
86+
87+
class _BoxResult:
88+
"""Minimal replacement for ultralytics Boxes result."""
89+
__slots__ = ('xyxy', 'conf', 'cls')
90+
91+
def __init__(self, xyxy, conf, cls):
92+
self.xyxy = xyxy # [[x1,y1,x2,y2]]
93+
self.conf = conf # [conf]
94+
self.cls = cls # [cls_id]
95+
96+
97+
class _DetResult:
98+
"""Minimal replacement for ultralytics Results."""
99+
__slots__ = ('boxes',)
100+
101+
def __init__(self, boxes: list):
102+
self.boxes = boxes
103+
104+
105+
class _OnnxCoreMLModel:
106+
"""ONNX Runtime model with CoreML EP, compatible with ultralytics API.
107+
108+
Supports: model(image_path_or_pil, conf=0.5, verbose=False)
109+
Returns: list of _DetResult with .boxes iterable of _BoxResult
110+
"""
111+
112+
def __init__(self, session, class_names: dict):
113+
self.session = session
114+
self.names = class_names
115+
self._input_name = session.get_inputs()[0].name
116+
# Expected input shape: [1, 3, H, W]
117+
shape = session.get_inputs()[0].shape
118+
self._input_h = shape[2] if isinstance(shape[2], int) else 640
119+
self._input_w = shape[3] if isinstance(shape[3], int) else 640
120+
121+
def __call__(self, source, conf: float = 0.25, verbose: bool = True, **kwargs):
122+
"""Run inference on an image path or PIL Image."""
123+
import numpy as np
124+
from PIL import Image
125+
126+
# Load image
127+
if isinstance(source, str):
128+
img = Image.open(source).convert("RGB")
129+
elif isinstance(source, Image.Image):
130+
img = source.convert("RGB")
131+
else:
132+
img = Image.fromarray(source).convert("RGB")
133+
134+
orig_w, orig_h = img.size
135+
136+
# Letterbox resize to input size
137+
scale = min(self._input_w / orig_w, self._input_h / orig_h)
138+
new_w, new_h = int(orig_w * scale), int(orig_h * scale)
139+
img_resized = img.resize((new_w, new_h), Image.BILINEAR)
140+
141+
# Pad to input size (center)
142+
pad_x = (self._input_w - new_w) // 2
143+
pad_y = (self._input_h - new_h) // 2
144+
canvas = np.full((self._input_h, self._input_w, 3), 114, dtype=np.uint8)
145+
canvas[pad_y:pad_y + new_h, pad_x:pad_x + new_w] = np.array(img_resized)
146+
147+
# HWC→CHW, normalize, add batch dim
148+
blob = canvas.transpose(2, 0, 1).astype(np.float32) / 255.0
149+
blob = np.expand_dims(blob, 0)
150+
151+
# Run inference
152+
outputs = self.session.run(None, {self._input_name: blob})
153+
preds = outputs[0] # shape: [1, num_detections, 6]
154+
155+
# Parse detections: [x1, y1, x2, y2, confidence, class_id]
156+
boxes = []
157+
for det in preds[0]:
158+
det_conf = float(det[4])
159+
if det_conf < conf:
160+
continue
161+
162+
# Scale coordinates back to original image space
163+
x1 = (float(det[0]) - pad_x) / scale
164+
y1 = (float(det[1]) - pad_y) / scale
165+
x2 = (float(det[2]) - pad_x) / scale
166+
y2 = (float(det[3]) - pad_y) / scale
167+
168+
# Clip to image bounds
169+
x1 = max(0, min(x1, orig_w))
170+
y1 = max(0, min(y1, orig_h))
171+
x2 = max(0, min(x2, orig_w))
172+
y2 = max(0, min(y2, orig_h))
173+
174+
boxes.append(_BoxResult(
175+
xyxy=np.array([[x1, y1, x2, y2]]),
176+
conf=np.array([det_conf]),
177+
cls=np.array([int(det[5])]),
178+
))
179+
180+
return [_DetResult(boxes)]
181+
81182

82183
# ─── Hardware detection ──────────────────────────────────────────────────────
83184

@@ -133,31 +234,79 @@ def detect() -> "HardwareEnv":
133234
return env
134235

135236
def _try_cuda(self) -> bool:
    """Detect an NVIDIA GPU.

    Probes nvidia-smi first (searching well-known Windows install paths
    when it is not on PATH) and, on Windows, falls back to a WMI query.
    On success fills in backend/device/GPU fields and returns True.
    """
    smi = shutil.which("nvidia-smi")

    # Windows: nvidia-smi is frequently installed outside PATH, so probe
    # the two standard locations directly.
    if not smi and platform.system() == "Windows":
        program_files = Path(os.environ.get("PROGRAMFILES", r"C:\Program Files"))
        windir = Path(os.environ.get("WINDIR", r"C:\Windows"))
        for candidate in (
            program_files / "NVIDIA Corporation" / "NVSMI" / "nvidia-smi.exe",
            windir / "System32" / "nvidia-smi.exe",
        ):
            if candidate.is_file():
                smi = str(candidate)
                _log(f"Found nvidia-smi at {smi}")
                break

    if smi:
        try:
            probe = subprocess.run(
                [smi, "--query-gpu=name,memory.total,driver_version",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=10,
            )
            if probe.returncode == 0:
                first_line = probe.stdout.strip().split("\n")[0]
                fields = [part.strip() for part in first_line.split(",")]
                if len(fields) >= 3:
                    self.backend = "cuda"
                    self.device = "cuda"
                    self.gpu_name = fields[0]
                    self.gpu_memory_mb = int(float(fields[1]))
                    self.driver_version = fields[2]
                    self.detection_details["nvidia_smi"] = first_line
                    _log(f"NVIDIA GPU: {self.gpu_name} ({self.gpu_memory_mb}MB, driver {self.driver_version})")
                    return True
        except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e:
            _log(f"nvidia-smi probe failed: {e}")

    # nvidia-smi absent or unusable: Windows gets a WMI fallback.
    if platform.system() == "Windows":
        return self._try_cuda_wmi()

    return False
280+
281+
def _try_cuda_wmi(self) -> bool:
    """Windows-only: detect an NVIDIA GPU via WMI (wmic)."""
    try:
        query = subprocess.run(
            ["wmic", "path", "win32_VideoController", "get",
             "Name,AdapterRAM,DriverVersion", "/format:csv"],
            capture_output=True, text=True, timeout=10,
        )
        if query.returncode != 0:
            return False

        for row in query.stdout.strip().split("\n"):
            if "NVIDIA" not in row.upper():
                continue
            cells = [cell.strip() for cell in row.split(",")]
            # wmic /format:csv emits columns as: Node,AdapterRAM,DriverVersion,Name
            if len(cells) < 4:
                continue
            self.backend = "cuda"
            self.device = "cuda"
            self.gpu_name = cells[3]
            try:
                # AdapterRAM is reported in bytes; may be blank on some drivers.
                self.gpu_memory_mb = int(int(cells[1]) / (1024 * 1024))
            except (ValueError, IndexError):
                pass
            self.driver_version = cells[2] if len(cells) > 2 else ""
            self.detection_details["wmi"] = row
            _log(f"NVIDIA GPU (WMI): {self.gpu_name} ({self.gpu_memory_mb}MB)")
            return True
    except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e:
        _log(f"WMI probe failed: {e}")
    return False
162311

163312
def _try_rocm(self) -> bool:
@@ -363,12 +512,28 @@ def _check_rocm_runtime(self):
363512
_log("Fix: pip uninstall onnxruntime && pip install onnxruntime-rocm")
364513
raise ImportError("ROCmExecutionProvider not available")
365514

515+
def _check_mps_runtime(self):
    """Verify onnxruntime ships the CoreML provider for Apple GPU/ANE use.

    ONNX Runtime + CoreMLExecutionProvider bypasses the broken
    MPSGraphExecutable MLIR pipeline (macOS 26.x) while still routing
    inference through CoreML to leverage GPU and Neural Engine.

    Raises ImportError when the provider is missing; returns True otherwise.
    """
    import onnxruntime

    providers = onnxruntime.get_available_providers()
    if "CoreMLExecutionProvider" not in providers:
        _log(f"onnxruntime providers: {providers} — CoreMLExecutionProvider not found")
        _log("Fix: pip install onnxruntime (arm64 macOS wheel includes CoreML EP)")
        raise ImportError("CoreMLExecutionProvider not available")
    _log(f"onnxruntime CoreML provider available: {providers}")
    return True
530+
366531
def _check_framework(self) -> bool:
367-
"""Check if the optimized inference runtime is importable."""
532+
"""Check if the optimized inference runtime is importable and compatible."""
368533
checks = {
369534
"cuda": lambda: __import__("tensorrt"),
370535
"rocm": lambda: self._check_rocm_runtime(),
371-
"mps": lambda: __import__("coremltools"),
536+
"mps": lambda: self._check_mps_runtime(),
372537
"intel": lambda: __import__("openvino"),
373538
"cpu": lambda: __import__("onnxruntime"),
374539
}
@@ -496,6 +661,27 @@ def __init__(self, *args, **kwargs):
496661
_log("coremltools not available, loading without compute_units")
497662
return YOLO(model_path)
498663

664+
def _load_onnx_coreml(self, onnx_path: str):
    """Load an ONNX model with CoreMLExecutionProvider for fast GPU/ANE inference.

    Returns an _OnnxCoreMLModel wrapper compatible with the ultralytics
    ``model(frame_path, conf=...)`` call pattern.

    Class names are read from the ONNX model's own metadata when present
    (ultralytics exports embed them under the 'names' key — TODO confirm
    against the shipped yolo26n.onnx), so torch/ultralytics are not needed
    at runtime, matching this skill's MPS requirements. Loading the
    sibling .pt checkpoint via ultralytics is kept only as a last-resort
    fallback for ONNX files exported without embedded metadata.
    """
    import onnxruntime as ort

    providers = ['CoreMLExecutionProvider', 'CPUExecutionProvider']
    session = ort.InferenceSession(onnx_path, providers=providers)
    active = session.get_providers()
    _log(f"ONNX+CoreML session: {active}")

    class_names = self._onnx_class_names(session)
    if class_names is None:
        # Fallback: recover names from the .pt model. This path requires
        # ultralytics to be installed and the checkpoint to exist.
        from ultralytics import YOLO
        pt_path = onnx_path.replace('.onnx', '.pt')
        pt_model = YOLO(pt_path)
        class_names = pt_model.names  # {0: 'person', 1: 'bicycle', ...}

    return _OnnxCoreMLModel(session, class_names)

def _onnx_class_names(self, session):
    """Extract the {class_id: name} dict from ONNX model metadata, or None."""
    import ast

    try:
        raw = session.get_modelmeta().custom_metadata_map.get("names")
        if raw:
            names = ast.literal_eval(raw)
            if isinstance(names, dict):
                return names
    except (ValueError, SyntaxError, AttributeError):
        # Malformed or absent metadata — caller falls back to the .pt model.
        pass
    return None
684+
499685
def load_optimized(self, model_name: str, use_optimized: bool = True):
500686
"""
501687
Load the best available model for this hardware.
@@ -512,10 +698,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
512698
optimized_path = self.get_optimized_path(model_name)
513699
if optimized_path.exists():
514700
try:
515-
# On Apple Silicon: route CoreML to Neural Engine
516-
if self.backend == "mps" and self.compute_units != "all":
517-
model = self._load_coreml_with_compute_units(
518-
str(optimized_path))
701+
# MPS: use ONNX Runtime + CoreML EP for fast inference
702+
if self.backend == "mps":
703+
model = self._load_onnx_coreml(str(optimized_path))
519704
else:
520705
model = YOLO(str(optimized_path))
521706
self.load_ms = (time.perf_counter() - t0) * 1000
@@ -529,10 +714,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
529714
exported = self.export_model(pt_model, model_name)
530715
if exported:
531716
try:
532-
# On Apple Silicon: route CoreML to Neural Engine
533-
if self.backend == "mps" and self.compute_units != "all":
534-
model = self._load_coreml_with_compute_units(
535-
str(exported))
717+
# MPS: use ONNX Runtime + CoreML EP for fast inference
718+
if self.backend == "mps":
719+
model = self._load_onnx_coreml(str(exported))
536720
else:
537721
model = YOLO(str(exported))
538722
self.load_ms = (time.perf_counter() - t0) * 1000
9.48 MB
Binary file not shown.

0 commit comments

Comments
 (0)