From 9bf816855eba178333d0aea95b931e70f6d6c441 Mon Sep 17 00:00:00 2001 From: Ivan Nikolic Date: Tue, 17 Mar 2026 22:08:36 +0800 Subject: [PATCH] initial depth anything module implementation --- dimos/hardware/sensors/camera/webcam.py | 5 +- dimos/perception/common/utils.py | 142 ----------- .../depth_anything3/blueprints/webcam.py | 32 +++ .../depth/depth_anything3/module.py | 240 ++++++++++++++++++ dimos/robot/all_blueprints.py | 3 + dimos/spec/perception.py | 5 +- pyproject.toml | 2 + 7 files changed, 283 insertions(+), 146 deletions(-) create mode 100644 dimos/perception/depth/depth_anything3/blueprints/webcam.py create mode 100644 dimos/perception/depth/depth_anything3/module.py diff --git a/dimos/hardware/sensors/camera/webcam.py b/dimos/hardware/sensors/camera/webcam.py index cfd1a080a0..3f3b047ba0 100644 --- a/dimos/hardware/sensors/camera/webcam.py +++ b/dimos/hardware/sensors/camera/webcam.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass, field from functools import cache import threading import time from typing import Literal import cv2 +from pydantic import Field from reactivex import create from reactivex.observable import Observable @@ -28,13 +28,12 @@ from dimos.utils.reactive import backpressure -@dataclass class WebcamConfig(CameraConfig): camera_index: int = 0 # /dev/videoN width: int = 640 height: int = 480 fps: float = 15.0 - camera_info: CameraInfo = field(default_factory=CameraInfo) + camera_info: CameraInfo = Field(default_factory=CameraInfo) frame_id_prefix: str | None = None stereo_slice: Literal["left", "right"] | None = None # For stereo cameras diff --git a/dimos/perception/common/utils.py b/dimos/perception/common/utils.py index 1670d31998..acc6ec48f6 100644 --- a/dimos/perception/common/utils.py +++ b/dimos/perception/common/utils.py @@ -50,7 +50,6 @@ "Vector", "Vector3", "bbox2d_to_corners", - "colorize_depth", "combine_object_data", "cp", "cv2", @@ -351,147 +350,6 @@ def project_2d_points_to_3d( ) -def colorize_depth( - depth_img: Union[np.ndarray, "cp.ndarray"], # type: ignore[type-arg] - max_depth: float = 5.0, - overlay_stats: bool = True, -) -> Union[np.ndarray, "cp.ndarray"] | None: # type: ignore[type-arg] - """ - Normalize and colorize depth image using COLORMAP_JET with optional statistics overlay. - - Args: - depth_img: Depth image (H, W) in meters - max_depth: Maximum depth value for normalization - overlay_stats: Whether to overlay depth statistics on the image - - Returns: - Colorized depth image (H, W, 3) in RGB format, or None if input is None - """ - if depth_img is None: - return None - - was_cu = _is_cu_array(depth_img) - xp = cp if was_cu else np - depth = depth_img if was_cu else np.asarray(depth_img) - - valid_mask = xp.isfinite(depth) & (depth > 0) - depth_norm = xp.zeros_like(depth, dtype=xp.float32) - if bool(valid_mask.any() if not was_cu else xp.any(valid_mask)): - depth_norm = xp.where(valid_mask, xp.clip(depth / max_depth, 0, 1), depth_norm) - - # Use CPU for colormap/text; convert back to GPU if needed - depth_norm_np = _to_numpy(depth_norm) # type: ignore[no-untyped-call] - depth_colored = cv2.applyColorMap((depth_norm_np * 255).astype(np.uint8), cv2.COLORMAP_JET) - depth_rgb_np = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB) - depth_rgb_np = (depth_rgb_np * 0.6).astype(np.uint8) - - if overlay_stats and (np.any(_to_numpy(valid_mask))): # type: ignore[no-untyped-call] - valid_depths = _to_numpy(depth)[_to_numpy(valid_mask)] # type: ignore[no-untyped-call] - min_depth = float(np.min(valid_depths)) - max_depth_actual = float(np.max(valid_depths)) - h, w = depth_rgb_np.shape[:2] - center_y, center_x = h // 2, w // 2 - center_region = _to_numpy( # type: ignore[no-untyped-call] - depth - )[max(0, center_y - 2) : min(h, center_y + 3), max(0, center_x - 2) : min(w, center_x + 3)] - center_mask = np.isfinite(center_region) & (center_region > 0) - if center_mask.any(): - center_depth = float(np.median(center_region[center_mask])) - else: - depth_np = _to_numpy(depth) # type: ignore[no-untyped-call] - vm_np = _to_numpy(valid_mask) # type: ignore[no-untyped-call] - center_depth = float(depth_np[center_y, center_x]) if vm_np[center_y, center_x] else 0.0 - - font = cv2.FONT_HERSHEY_SIMPLEX - font_scale = 0.6 - thickness = 1 - line_type = cv2.LINE_AA - text_color = (255, 255, 255) - bg_color = (0, 0, 0) - padding = 5 - - min_text = f"Min: {min_depth:.2f}m" - (text_w, text_h), _ = cv2.getTextSize(min_text, font, font_scale, thickness) - cv2.rectangle( - depth_rgb_np, - (padding, padding), - (padding + text_w + 4, padding + text_h + 6), - bg_color, - -1, - ) - cv2.putText( - depth_rgb_np, - min_text, - (padding + 2, padding + text_h + 2), - font, - font_scale, - text_color, - thickness, - line_type, - ) - - max_text = f"Max: {max_depth_actual:.2f}m" - (text_w, text_h), _ = cv2.getTextSize(max_text, font, font_scale, thickness) - cv2.rectangle( - depth_rgb_np, - (w - padding - text_w - 4, padding), - (w - padding, padding + text_h + 6), - bg_color, - -1, - ) - cv2.putText( - depth_rgb_np, - max_text, - (w - padding - text_w - 2, padding + text_h + 2), - font, - font_scale, - text_color, - thickness, - line_type, - ) - - if center_depth > 0: - center_text = f"{center_depth:.2f}m" - (text_w, text_h), _ = cv2.getTextSize(center_text, font, font_scale, thickness) - center_text_x = center_x - text_w // 2 - center_text_y = center_y + text_h // 2 - cross_size = 10 - cross_color = (255, 255, 255) - cv2.line( - depth_rgb_np, - (center_x - cross_size, center_y), - (center_x + cross_size, center_y), - cross_color, - 1, - ) - cv2.line( - depth_rgb_np, - (center_x, center_y - cross_size), - (center_x, center_y + cross_size), - cross_color, - 1, - ) - cv2.rectangle( - depth_rgb_np, - (center_text_x - 2, center_text_y - text_h - 2), - (center_text_x + text_w + 2, center_text_y + 2), - bg_color, - -1, - ) - cv2.putText( - depth_rgb_np, - center_text, - (center_text_x, center_text_y), - font, - font_scale, - text_color, - thickness, - line_type, - ) - - return _to_cupy(depth_rgb_np) if was_cu else depth_rgb_np # type: ignore[no-untyped-call] - - def draw_bounding_box( image: Union[np.ndarray, "cp.ndarray"], # type: ignore[type-arg] bbox: list[float], diff --git a/dimos/perception/depth/depth_anything3/blueprints/webcam.py b/dimos/perception/depth/depth_anything3/blueprints/webcam.py new file mode 100644 index 0000000000..41ac47155d --- /dev/null +++ b/dimos/perception/depth/depth_anything3/blueprints/webcam.py @@ -0,0 +1,32 @@ +# Copyright 2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dimos.core.blueprints import autoconnect +from dimos.hardware.sensors.camera.module import CameraModule +from dimos.perception.depth.depth_anything3.module import DA3Mode, DA3Model, DepthAnything3Module +from dimos.visualization.rerun.bridge import rerun_bridge + +depth_anything3_webcam = autoconnect( + CameraModule.blueprint(), + DepthAnything3Module.blueprint(model=DA3Model.SMALL), + rerun_bridge(), +) + +depth_anything3_webcam_temporal = autoconnect( + CameraModule.blueprint(), + DepthAnything3Module.blueprint(model=DA3Model.SMALL, mode=DA3Mode.TEMPORAL, window_frames=5), + rerun_bridge(), +) + +__all__ = ["depth_anything3_webcam", "depth_anything3_webcam_temporal"] diff --git a/dimos/perception/depth/depth_anything3/module.py b/dimos/perception/depth/depth_anything3/module.py new file mode 100644 index 0000000000..ec2f1abbb9 --- /dev/null +++ b/dimos/perception/depth/depth_anything3/module.py @@ -0,0 +1,240 @@ +# Copyright 2025-2026 Dimensional Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +from enum import Enum +from typing import TYPE_CHECKING, Any + +import numpy as np +from reactivex import operators as ops +from reactivex.observable import Observable + +from dimos.core.core import rpc +from dimos.core.module import Module, ModuleConfig +from dimos.core.stream import In, Out +from dimos.msgs.sensor_msgs.Image import Image, ImageFormat, sharpness_barrier +from dimos.spec import perception +from dimos.utils.decorators.decorators import simple_mcache +from dimos.utils.reactive import backpressure + +if TYPE_CHECKING: + from dimos.core.module_coordinator import ModuleCoordinator + + +class DA3Model(str, Enum): + SMALL = "depth-anything/DA3-SMALL" + BASE = "depth-anything/DA3-BASE" + LARGE = "depth-anything/DA3-LARGE-1.1" + METRIC = "depth-anything/DA3METRIC-LARGE" + GIANT = "depth-anything/DA3NESTED-GIANT-LARGE-1.1" + + +class DA3Mode(str, Enum): + SINGLE = "single" + TEMPORAL = "temporal" + MULTI = "multi" + + +class Config(ModuleConfig): + model: DA3Model = DA3Model.LARGE + mode: DA3Mode = DA3Mode.SINGLE + device: str = "cuda" + max_freq: float = 5.0 + process_res: int = 504 + + # temporal mode + window_frames: int = 5 + motion_threshold: float = 5.0 # min mean pixel diff to accept a new keyframe + + # multi-camera mode + num_cameras: int = 1 + + # inference options + use_ray_pose: bool = False + ref_view_strategy: str = "saddle_balanced" + + # outputs + publish_confidence: bool = False + + +class DepthAnything3Module(Module[Config]): + default_config = Config + + color_image: In[Image] + + depth_image: Out[Image] + confidence: Out[Image] + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + from depth_anything_3.api import DepthAnything3 + + self._model = DepthAnything3.from_pretrained(self.config.model.value) + self._model = self._model.to(device=self.config.device) + + # Create extra camera inputs/outputs for multi mode + if self.config.mode == DA3Mode.MULTI: + for i in range(1, self.config.num_cameras): + setattr(self, f"color_image_{i}", In(Image, f"color_image_{i}", self)) + setattr(self, f"depth_image_{i}", Out(Image, f"depth_image_{i}", self)) + + def _predict(self, images: list[Image]) -> list[tuple[np.ndarray, np.ndarray]]: + """Run DA3 inference. Returns list of (depth, confidence) arrays.""" + np_images = [img.data for img in images] + prediction = self._model.inference( + image=np_images, + use_ray_pose=self.config.use_ray_pose, + ref_view_strategy=self.config.ref_view_strategy, + process_res=self.config.process_res, + ) + return list(zip(prediction.depth, prediction.conf, strict=False)) + + def _publish_result(self, image: Image, depth: np.ndarray, conf: np.ndarray) -> None: + """Publish depth (and optionally confidence) for a single frame.""" + self.depth_image.publish( + Image(data=depth, format=ImageFormat.DEPTH, frame_id=image.frame_id, ts=image.ts) + ) + if self.config.publish_confidence: + self.confidence.publish( + Image(data=conf, format=ImageFormat.DEPTH, frame_id=image.frame_id, ts=image.ts) + ) + + # -- Stream builders -- + + @simple_mcache + def _sharp_stream(self) -> Observable[Image]: + return backpressure( + self.color_image.pure_observable().pipe( + sharpness_barrier(self.config.max_freq), + ) + ) + + @simple_mcache + def _single_stream(self) -> Observable[tuple[Image, np.ndarray, np.ndarray]]: + def process(img: Image) -> tuple[Image, np.ndarray, np.ndarray]: + results = self._predict([img]) + depth, conf = results[0] + return (img, depth, conf) + + return backpressure(self._sharp_stream().pipe(ops.map(process))) + + @simple_mcache + def _temporal_stream(self) -> Observable[tuple[Image, np.ndarray, np.ndarray]]: + window_size = self.config.window_frames + threshold = self.config.motion_threshold + + def accumulate_keyframes(keyframes: list[Image], img: Image) -> list[Image]: + if not keyframes: + return [img] + # Compare against last keyframe — mean absolute pixel difference + diff = float( + np.mean(np.abs(img.data.astype(np.float32) - keyframes[-1].data.astype(np.float32))) + ) + if diff >= threshold: + keyframes = [*keyframes[-(window_size - 1) :], img] + return keyframes + + def process_window(keyframes: list[Image]) -> tuple[Image, np.ndarray, np.ndarray] | None: + if len(keyframes) < 2: + return None + results = self._predict(keyframes) + depth, conf = results[-1] + return (keyframes[-1], depth, conf) + + return backpressure( + self._sharp_stream().pipe( + ops.scan(accumulate_keyframes, []), + ops.map(process_window), + ops.filter(lambda r: r is not None), + ) + ) + + @rpc + def start(self) -> None: + if self.config.mode == DA3Mode.SINGLE: + self._single_stream().subscribe(lambda r: self._publish_result(r[0], r[1], r[2])) + + elif self.config.mode == DA3Mode.TEMPORAL: + self._temporal_stream().subscribe(lambda r: self._publish_result(r[0], r[1], r[2])) + + elif self.config.mode == DA3Mode.MULTI: + from dimos.types.timestamped import align_timestamped + + camera_streams = [self.color_image.pure_observable()] + for i in range(1, self.config.num_cameras): + port: In[Image] = getattr(self, f"color_image_{i}") + camera_streams.append(port.pure_observable()) + + aligned = align_timestamped( + camera_streams[0], + *camera_streams[1:], + match_tolerance=0.1, + buffer_size=2.0, + ) + + def process_multi(frames: tuple[Image, ...]) -> None: + images = list(frames) + results = self._predict(images) + # Publish depth for camera 0 + depth, conf = results[0] + self._publish_result(images[0], depth, conf) + # Publish depth for remaining cameras + for i in range(1, len(results)): + d, c = results[i] + out: Out[Image] = getattr(self, f"depth_image_{i}") + out.publish( + Image( + data=d, + format=ImageFormat.DEPTH, + frame_id=images[i].frame_id, + ts=images[i].ts, + ) + ) + if self.config.publish_confidence: + conf_out: Out[Image] = getattr(self, f"confidence_{i}", None) # type: ignore[assignment] + if conf_out: + conf_out.publish( + Image( + data=c, + format=ImageFormat.DEPTH, + frame_id=images[i].frame_id, + ts=images[i].ts, + ) + ) + + backpressure(aligned).subscribe(process_multi) + + @rpc + def stop(self) -> None: + return super().stop() # type: ignore[no-any-return] + + +def deploy( + dimos: ModuleCoordinator, + camera: perception.Image, + prefix: str = "/depth_anything3", + **kwargs: Any, +) -> DepthAnything3Module: + from dimos.core.transport import LCMTransport + + module = DepthAnything3Module(**kwargs) + module.color_image.connect(camera.color_image) + + module.depth_image.transport = LCMTransport(f"{prefix}/depth", Image) + module.confidence.transport = LCMTransport(f"{prefix}/confidence", Image) + + module.start() + return module diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index e82cb656ce..841b27cd4e 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -50,6 +50,8 @@ "demo-object-scene-registration": "dimos.perception.demo_object_scene_registration:demo_object_scene_registration", "demo-osm": "dimos.mapping.osm.demo_osm:demo_osm", "demo-skill": "dimos.agents.skills.demo_skill:demo_skill", + "depth-anything3-webcam": "dimos.perception.depth.depth_anything3.blueprints.webcam:depth_anything3_webcam", + "depth-anything3-webcam-temporal": "dimos.perception.depth.depth_anything3.blueprints.webcam:depth_anything3_webcam_temporal", "drone-agentic": "dimos.robot.drone.blueprints.agentic.drone_agentic:drone_agentic", "drone-basic": "dimos.robot.drone.blueprints.basic.drone_basic:drone_basic", "dual-xarm6-planner": "dimos.manipulation.blueprints:dual_xarm6_planner", @@ -126,6 +128,7 @@ "navigation-skill": "dimos.agents.skills.navigation", "object-scene-registration-module": "dimos.perception.object_scene_registration", "object-tracking": "dimos.perception.object_tracker", + "orbslam3-module": "dimos.perception.slam.orbslam3.module", "osm-skill": "dimos.agents.skills.osm", "person-follow-skill": "dimos.agents.skills.person_follow", "person-tracker-module": "dimos.perception.detection.person_tracker", diff --git a/dimos/spec/perception.py b/dimos/spec/perception.py index 4fac65ad02..88ecf89751 100644 --- a/dimos/spec/perception.py +++ b/dimos/spec/perception.py @@ -30,8 +30,11 @@ class Camera(Image): camera_info: Out[CameraInfo] -class DepthCamera(Camera): +class Depth(Protocol): depth_image: Out[ImageMsg] + + +class DepthCamera(Camera, Depth): depth_camera_info: Out[CameraInfo] diff --git a/pyproject.toml b/pyproject.toml index 1fbd29f86f..b26d47694a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -387,6 +387,8 @@ module = [ "annotation_protocol", "cyclonedds", "cyclonedds.*", + "depth_anything_3", + "depth_anything_3.*", "dimos_lcm.*", "etils", "geometry_msgs.*",