@@ -58,11 +58,12 @@ class BackendSpec:
5858 ),
5959 "mps" : BackendSpec (
6060 name = "mps" ,
61- export_format = "coreml" ,
62- model_suffix = ".mlpackage" ,
63- half = True ,
64- extra_export_args = {"nms" : False },
65- compute_units = "cpu_and_ne" , # Route to Neural Engine, leave GPU free for LLM/VLM
61+ export_format = "onnx" ,
62+ model_suffix = ".onnx" ,
63+ half = False , # ONNX Runtime handles precision internally
64+ # ONNX Runtime + CoreMLExecutionProvider bypasses the broken
65+ # MPSGraphExecutable MLIR pipeline on macOS 26.x while still
66+ # leveraging GPU/ANE via CoreML under the hood.
6667 ),
6768 "intel" : BackendSpec (
6869 name = "intel" ,
@@ -78,6 +79,106 @@ class BackendSpec:
7879 ),
7980}
8081
82+ # ─── ONNX + CoreML EP wrapper ────────────────────────────────────────────────
83+ # Provides an ultralytics-compatible model interface using onnxruntime directly
84+ # with CoreMLExecutionProvider for ~6ms inference on Apple Silicon (vs 21ms when
85+ # ultralytics defaults to CPUExecutionProvider).
86+
87+ class _BoxResult :
88+ """Minimal replacement for ultralytics Boxes result."""
89+ __slots__ = ('xyxy' , 'conf' , 'cls' )
90+
91+ def __init__ (self , xyxy , conf , cls ):
92+ self .xyxy = xyxy # [[x1,y1,x2,y2]]
93+ self .conf = conf # [conf]
94+ self .cls = cls # [cls_id]
95+
96+
97+ class _DetResult :
98+ """Minimal replacement for ultralytics Results."""
99+ __slots__ = ('boxes' ,)
100+
101+ def __init__ (self , boxes : list ):
102+ self .boxes = boxes
103+
104+
105+ class _OnnxCoreMLModel :
106+ """ONNX Runtime model with CoreML EP, compatible with ultralytics API.
107+
108+ Supports: model(image_path_or_pil, conf=0.5, verbose=False)
109+ Returns: list of _DetResult with .boxes iterable of _BoxResult
110+ """
111+
112+ def __init__ (self , session , class_names : dict ):
113+ self .session = session
114+ self .names = class_names
115+ self ._input_name = session .get_inputs ()[0 ].name
116+ # Expected input shape: [1, 3, H, W]
117+ shape = session .get_inputs ()[0 ].shape
118+ self ._input_h = shape [2 ] if isinstance (shape [2 ], int ) else 640
119+ self ._input_w = shape [3 ] if isinstance (shape [3 ], int ) else 640
120+
121+ def __call__ (self , source , conf : float = 0.25 , verbose : bool = True , ** kwargs ):
122+ """Run inference on an image path or PIL Image."""
123+ import numpy as np
124+ from PIL import Image
125+
126+ # Load image
127+ if isinstance (source , str ):
128+ img = Image .open (source ).convert ("RGB" )
129+ elif isinstance (source , Image .Image ):
130+ img = source .convert ("RGB" )
131+ else :
132+ img = Image .fromarray (source ).convert ("RGB" )
133+
134+ orig_w , orig_h = img .size
135+
136+ # Letterbox resize to input size
137+ scale = min (self ._input_w / orig_w , self ._input_h / orig_h )
138+ new_w , new_h = int (orig_w * scale ), int (orig_h * scale )
139+ img_resized = img .resize ((new_w , new_h ), Image .BILINEAR )
140+
141+ # Pad to input size (center)
142+ pad_x = (self ._input_w - new_w ) // 2
143+ pad_y = (self ._input_h - new_h ) // 2
144+ canvas = np .full ((self ._input_h , self ._input_w , 3 ), 114 , dtype = np .uint8 )
145+ canvas [pad_y :pad_y + new_h , pad_x :pad_x + new_w ] = np .array (img_resized )
146+
147+ # HWC→CHW, normalize, add batch dim
148+ blob = canvas .transpose (2 , 0 , 1 ).astype (np .float32 ) / 255.0
149+ blob = np .expand_dims (blob , 0 )
150+
151+ # Run inference
152+ outputs = self .session .run (None , {self ._input_name : blob })
153+ preds = outputs [0 ] # shape: [1, num_detections, 6]
154+
155+ # Parse detections: [x1, y1, x2, y2, confidence, class_id]
156+ boxes = []
157+ for det in preds [0 ]:
158+ det_conf = float (det [4 ])
159+ if det_conf < conf :
160+ continue
161+
162+ # Scale coordinates back to original image space
163+ x1 = (float (det [0 ]) - pad_x ) / scale
164+ y1 = (float (det [1 ]) - pad_y ) / scale
165+ x2 = (float (det [2 ]) - pad_x ) / scale
166+ y2 = (float (det [3 ]) - pad_y ) / scale
167+
168+ # Clip to image bounds
169+ x1 = max (0 , min (x1 , orig_w ))
170+ y1 = max (0 , min (y1 , orig_h ))
171+ x2 = max (0 , min (x2 , orig_w ))
172+ y2 = max (0 , min (y2 , orig_h ))
173+
174+ boxes .append (_BoxResult (
175+ xyxy = np .array ([[x1 , y1 , x2 , y2 ]]),
176+ conf = np .array ([det_conf ]),
177+ cls = np .array ([int (det [5 ])]),
178+ ))
179+
180+ return [_DetResult (boxes )]
181+
81182
82183# ─── Hardware detection ──────────────────────────────────────────────────────
83184
@@ -133,31 +234,79 @@ def detect() -> "HardwareEnv":
133234 return env
134235
    def _try_cuda(self) -> bool:
        """Detect NVIDIA GPU via nvidia-smi (with Windows path search) and WMI fallback.

        On success sets self.backend/device to "cuda", records GPU name,
        memory (MB) and driver version, and returns True. Returns False
        when no NVIDIA GPU can be confirmed.
        """
        nvidia_smi = shutil.which("nvidia-smi")

        # Windows: check well-known install paths if the binary is not on
        # PATH (NVIDIA drivers place it under NVSMI or System32).
        if not nvidia_smi and platform.system() == "Windows":
            for candidate in [
                Path(os.environ.get("PROGRAMFILES", r"C:\Program Files"))
                / "NVIDIA Corporation" / "NVSMI" / "nvidia-smi.exe",
                Path(os.environ.get("WINDIR", r"C:\Windows"))
                / "System32" / "nvidia-smi.exe",
            ]:
                if candidate.is_file():
                    nvidia_smi = str(candidate)
                    _log(f"Found nvidia-smi at {nvidia_smi}")
                    break

        if nvidia_smi:
            try:
                # Machine-readable one-line CSV: "name, memMB, driver".
                result = subprocess.run(
                    [nvidia_smi, "--query-gpu=name,memory.total,driver_version",
                     "--format=csv,noheader,nounits"],
                    capture_output=True, text=True, timeout=10,
                )
                if result.returncode == 0:
                    # Only the first GPU line is inspected — multi-GPU hosts
                    # report their primary device.
                    line = result.stdout.strip().split("\n")[0]
                    parts = [p.strip() for p in line.split(",")]
                    if len(parts) >= 3:
                        self.backend = "cuda"
                        self.device = "cuda"
                        self.gpu_name = parts[0]
                        # int(float(...)) tolerates fractional MB output.
                        self.gpu_memory_mb = int(float(parts[1]))
                        self.driver_version = parts[2]
                        self.detection_details["nvidia_smi"] = line
                        _log(f"NVIDIA GPU: {self.gpu_name} ({self.gpu_memory_mb} MB, driver {self.driver_version})")
                        return True
            except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e:
                # Probe failure is non-fatal: fall through to WMI on Windows.
                _log(f"nvidia-smi probe failed: {e}")

        # Windows WMI fallback: detect NVIDIA GPU even without nvidia-smi on PATH
        if platform.system() == "Windows":
            return self._try_cuda_wmi()

        return False
280+
    def _try_cuda_wmi(self) -> bool:
        """Windows-only: detect NVIDIA GPU via WMI (wmic).

        Sets the same fields as _try_cuda on success and returns True;
        returns False when wmic fails or lists no NVIDIA adapter.
        NOTE(review): wmic is deprecated on recent Windows builds and
        AdapterRAM is reportedly a 32-bit field (caps near 4 GB) — confirm
        whether reported memory is trustworthy on large-VRAM cards.
        """
        try:
            result = subprocess.run(
                ["wmic", "path", "win32_VideoController", "get",
                 "Name,AdapterRAM,DriverVersion", "/format:csv"],
                capture_output=True, text=True, timeout=10,
            )
            if result.returncode != 0:
                return False

            # wmic /format:csv orders columns alphabetically after Node.
            for line in result.stdout.strip().split("\n"):
                if "NVIDIA" in line.upper():
                    parts = [p.strip() for p in line.split(",")]
                    # CSV format: Node,AdapterRAM,DriverVersion,Name
                    if len(parts) >= 4:
                        self.backend = "cuda"
                        self.device = "cuda"
                        self.gpu_name = parts[3]
                        try:
                            # AdapterRAM is reported in bytes; convert to MB.
                            self.gpu_memory_mb = int(int(parts[1]) / (1024 * 1024))
                        except (ValueError, IndexError):
                            # Best-effort: leave memory unset/default when the
                            # field is blank or non-numeric.
                            pass
                        self.driver_version = parts[2] if len(parts) > 2 else ""
                        self.detection_details["wmi"] = line
                        _log(f"NVIDIA GPU (WMI): {self.gpu_name} ({self.gpu_memory_mb} MB)")
                        return True
        except (subprocess.TimeoutExpired, FileNotFoundError, ValueError) as e:
            _log(f"WMI probe failed: {e}")
        return False
162311
163312 def _try_rocm (self ) -> bool :
@@ -363,12 +512,28 @@ def _check_rocm_runtime(self):
363512 _log ("Fix: pip uninstall onnxruntime && pip install onnxruntime-rocm" )
364513 raise ImportError ("ROCmExecutionProvider not available" )
365514
515+ def _check_mps_runtime (self ):
516+ """Verify onnxruntime has CoreML provider for Apple GPU/ANE acceleration.
517+
518+ ONNX Runtime + CoreMLExecutionProvider bypasses the broken
519+ MPSGraphExecutable MLIR pipeline (macOS 26.x) while still routing
520+ inference through CoreML to leverage GPU and Neural Engine.
521+ """
522+ import onnxruntime
523+ providers = onnxruntime .get_available_providers ()
524+ if "CoreMLExecutionProvider" in providers :
525+ _log (f"onnxruntime CoreML provider available: { providers } " )
526+ return True
527+ _log (f"onnxruntime providers: { providers } — CoreMLExecutionProvider not found" )
528+ _log ("Fix: pip install onnxruntime (arm64 macOS wheel includes CoreML EP)" )
529+ raise ImportError ("CoreMLExecutionProvider not available" )
530+
366531 def _check_framework (self ) -> bool :
367- """Check if the optimized inference runtime is importable."""
532+ """Check if the optimized inference runtime is importable and compatible ."""
368533 checks = {
369534 "cuda" : lambda : __import__ ("tensorrt" ),
370535 "rocm" : lambda : self ._check_rocm_runtime (),
371- "mps" : lambda : __import__ ( "coremltools" ),
536+ "mps" : lambda : self . _check_mps_runtime ( ),
372537 "intel" : lambda : __import__ ("openvino" ),
373538 "cpu" : lambda : __import__ ("onnxruntime" ),
374539 }
@@ -496,6 +661,27 @@ def __init__(self, *args, **kwargs):
496661 _log ("coremltools not available, loading without compute_units" )
497662 return YOLO (model_path )
498663
664+ def _load_onnx_coreml (self , onnx_path : str ):
665+ """Load ONNX model with CoreMLExecutionProvider for fast GPU/ANE inference.
666+
667+ Returns an OnnxCoreMLModel wrapper that is compatible with the
668+ ultralytics model(frame_path, conf=...) call pattern.
669+ """
670+ import onnxruntime as ort
671+
672+ providers = ['CoreMLExecutionProvider' , 'CPUExecutionProvider' ]
673+ session = ort .InferenceSession (onnx_path , providers = providers )
674+ active = session .get_providers ()
675+ _log (f"ONNX+CoreML session: { active } " )
676+
677+ # Get YOLO class names from the .pt model (needed for detection output)
678+ from ultralytics import YOLO
679+ pt_path = onnx_path .replace ('.onnx' , '.pt' )
680+ pt_model = YOLO (pt_path )
681+ class_names = pt_model .names # {0: 'person', 1: 'bicycle', ...}
682+
683+ return _OnnxCoreMLModel (session , class_names )
684+
499685 def load_optimized (self , model_name : str , use_optimized : bool = True ):
500686 """
501687 Load the best available model for this hardware.
@@ -512,10 +698,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
512698 optimized_path = self .get_optimized_path (model_name )
513699 if optimized_path .exists ():
514700 try :
515- # On Apple Silicon: route CoreML to Neural Engine
516- if self .backend == "mps" and self .compute_units != "all" :
517- model = self ._load_coreml_with_compute_units (
518- str (optimized_path ))
701+ # MPS: use ONNX Runtime + CoreML EP for fast inference
702+ if self .backend == "mps" :
703+ model = self ._load_onnx_coreml (str (optimized_path ))
519704 else :
520705 model = YOLO (str (optimized_path ))
521706 self .load_ms = (time .perf_counter () - t0 ) * 1000
@@ -529,10 +714,9 @@ def load_optimized(self, model_name: str, use_optimized: bool = True):
529714 exported = self .export_model (pt_model , model_name )
530715 if exported :
531716 try :
532- # On Apple Silicon: route CoreML to Neural Engine
533- if self .backend == "mps" and self .compute_units != "all" :
534- model = self ._load_coreml_with_compute_units (
535- str (exported ))
717+ # MPS: use ONNX Runtime + CoreML EP for fast inference
718+ if self .backend == "mps" :
719+ model = self ._load_onnx_coreml (str (exported ))
536720 else :
537721 model = YOLO (str (exported ))
538722 self .load_ms = (time .perf_counter () - t0 ) * 1000
0 commit comments