From 9bf816855eba178333d0aea95b931e70f6d6c441 Mon Sep 17 00:00:00 2001
From: Ivan Nikolic <lesh@sysphere.org>
Date: Tue, 17 Mar 2026 22:08:36 +0800
Subject: [PATCH] initial depth anything module implementation

---
 dimos/hardware/sensors/camera/webcam.py       |   5 +-
 dimos/perception/common/utils.py              | 142 -----------
 .../depth_anything3/blueprints/webcam.py      |  32 +++
 .../depth/depth_anything3/module.py           | 240 ++++++++++++++++++
 dimos/robot/all_blueprints.py                 |   3 +
 dimos/spec/perception.py                      |   5 +-
 pyproject.toml                                |   2 +
 7 files changed, 283 insertions(+), 146 deletions(-)
 create mode 100644 dimos/perception/depth/depth_anything3/blueprints/webcam.py
 create mode 100644 dimos/perception/depth/depth_anything3/module.py

diff --git a/dimos/hardware/sensors/camera/webcam.py b/dimos/hardware/sensors/camera/webcam.py
index cfd1a080a0..3f3b047ba0 100644
--- a/dimos/hardware/sensors/camera/webcam.py
+++ b/dimos/hardware/sensors/camera/webcam.py
@@ -12,13 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from dataclasses import dataclass, field
 from functools import cache
 import threading
 import time
 from typing import Literal
 
 import cv2
+from pydantic import Field
 from reactivex import create
 from reactivex.observable import Observable
 
@@ -28,13 +28,12 @@
 from dimos.utils.reactive import backpressure
 
 
-@dataclass
 class WebcamConfig(CameraConfig):
     camera_index: int = 0  # /dev/videoN
     width: int = 640
     height: int = 480
     fps: float = 15.0
-    camera_info: CameraInfo = field(default_factory=CameraInfo)
+    camera_info: CameraInfo = Field(default_factory=CameraInfo)
     frame_id_prefix: str | None = None
     stereo_slice: Literal["left", "right"] | None = None  # For stereo cameras
 
diff --git a/dimos/perception/common/utils.py b/dimos/perception/common/utils.py
index 1670d31998..acc6ec48f6 100644
--- a/dimos/perception/common/utils.py
+++ b/dimos/perception/common/utils.py
@@ -50,7 +50,6 @@
     "Vector",
     "Vector3",
     "bbox2d_to_corners",
-    "colorize_depth",
     "combine_object_data",
     "cp",
     "cv2",
@@ -351,147 +350,6 @@ def project_2d_points_to_3d(
     )
 
 
-def colorize_depth(
-    depth_img: Union[np.ndarray, "cp.ndarray"],  # type: ignore[type-arg]
-    max_depth: float = 5.0,
-    overlay_stats: bool = True,
-) -> Union[np.ndarray, "cp.ndarray"] | None:  # type: ignore[type-arg]
-    """
-    Normalize and colorize depth image using COLORMAP_JET with optional statistics overlay.
-
-    Args:
-        depth_img: Depth image (H, W) in meters
-        max_depth: Maximum depth value for normalization
-        overlay_stats: Whether to overlay depth statistics on the image
-
-    Returns:
-        Colorized depth image (H, W, 3) in RGB format, or None if input is None
-    """
-    if depth_img is None:
-        return None
-
-    was_cu = _is_cu_array(depth_img)
-    xp = cp if was_cu else np
-    depth = depth_img if was_cu else np.asarray(depth_img)
-
-    valid_mask = xp.isfinite(depth) & (depth > 0)
-    depth_norm = xp.zeros_like(depth, dtype=xp.float32)
-    if bool(valid_mask.any() if not was_cu else xp.any(valid_mask)):
-        depth_norm = xp.where(valid_mask, xp.clip(depth / max_depth, 0, 1), depth_norm)
-
-    # Use CPU for colormap/text; convert back to GPU if needed
-    depth_norm_np = _to_numpy(depth_norm)  # type: ignore[no-untyped-call]
-    depth_colored = cv2.applyColorMap((depth_norm_np * 255).astype(np.uint8), cv2.COLORMAP_JET)
-    depth_rgb_np = cv2.cvtColor(depth_colored, cv2.COLOR_BGR2RGB)
-    depth_rgb_np = (depth_rgb_np * 0.6).astype(np.uint8)
-
-    if overlay_stats and (np.any(_to_numpy(valid_mask))):  # type: ignore[no-untyped-call]
-        valid_depths = _to_numpy(depth)[_to_numpy(valid_mask)]  # type: ignore[no-untyped-call]
-        min_depth = float(np.min(valid_depths))
-        max_depth_actual = float(np.max(valid_depths))
-        h, w = depth_rgb_np.shape[:2]
-        center_y, center_x = h // 2, w // 2
-        center_region = _to_numpy(  # type: ignore[no-untyped-call]
-            depth
-        )[max(0, center_y - 2) : min(h, center_y + 3), max(0, center_x - 2) : min(w, center_x + 3)]
-        center_mask = np.isfinite(center_region) & (center_region > 0)
-        if center_mask.any():
-            center_depth = float(np.median(center_region[center_mask]))
-        else:
-            depth_np = _to_numpy(depth)  # type: ignore[no-untyped-call]
-            vm_np = _to_numpy(valid_mask)  # type: ignore[no-untyped-call]
-            center_depth = float(depth_np[center_y, center_x]) if vm_np[center_y, center_x] else 0.0
-
-        font = cv2.FONT_HERSHEY_SIMPLEX
-        font_scale = 0.6
-        thickness = 1
-        line_type = cv2.LINE_AA
-        text_color = (255, 255, 255)
-        bg_color = (0, 0, 0)
-        padding = 5
-
-        min_text = f"Min: {min_depth:.2f}m"
-        (text_w, text_h), _ = cv2.getTextSize(min_text, font, font_scale, thickness)
-        cv2.rectangle(
-            depth_rgb_np,
-            (padding, padding),
-            (padding + text_w + 4, padding + text_h + 6),
-            bg_color,
-            -1,
-        )
-        cv2.putText(
-            depth_rgb_np,
-            min_text,
-            (padding + 2, padding + text_h + 2),
-            font,
-            font_scale,
-            text_color,
-            thickness,
-            line_type,
-        )
-
-        max_text = f"Max: {max_depth_actual:.2f}m"
-        (text_w, text_h), _ = cv2.getTextSize(max_text, font, font_scale, thickness)
-        cv2.rectangle(
-            depth_rgb_np,
-            (w - padding - text_w - 4, padding),
-            (w - padding, padding + text_h + 6),
-            bg_color,
-            -1,
-        )
-        cv2.putText(
-            depth_rgb_np,
-            max_text,
-            (w - padding - text_w - 2, padding + text_h + 2),
-            font,
-            font_scale,
-            text_color,
-            thickness,
-            line_type,
-        )
-
-        if center_depth > 0:
-            center_text = f"{center_depth:.2f}m"
-            (text_w, text_h), _ = cv2.getTextSize(center_text, font, font_scale, thickness)
-            center_text_x = center_x - text_w // 2
-            center_text_y = center_y + text_h // 2
-            cross_size = 10
-            cross_color = (255, 255, 255)
-            cv2.line(
-                depth_rgb_np,
-                (center_x - cross_size, center_y),
-                (center_x + cross_size, center_y),
-                cross_color,
-                1,
-            )
-            cv2.line(
-                depth_rgb_np,
-                (center_x, center_y - cross_size),
-                (center_x, center_y + cross_size),
-                cross_color,
-                1,
-            )
-            cv2.rectangle(
-                depth_rgb_np,
-                (center_text_x - 2, center_text_y - text_h - 2),
-                (center_text_x + text_w + 2, center_text_y + 2),
-                bg_color,
-                -1,
-            )
-            cv2.putText(
-                depth_rgb_np,
-                center_text,
-                (center_text_x, center_text_y),
-                font,
-                font_scale,
-                text_color,
-                thickness,
-                line_type,
-            )
-
-    return _to_cupy(depth_rgb_np) if was_cu else depth_rgb_np  # type: ignore[no-untyped-call]
-
-
 def draw_bounding_box(
     image: Union[np.ndarray, "cp.ndarray"],  # type: ignore[type-arg]
     bbox: list[float],
diff --git a/dimos/perception/depth/depth_anything3/blueprints/webcam.py b/dimos/perception/depth/depth_anything3/blueprints/webcam.py
new file mode 100644
index 0000000000..41ac47155d
--- /dev/null
+++ b/dimos/perception/depth/depth_anything3/blueprints/webcam.py
@@ -0,0 +1,32 @@
+# Copyright 2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dimos.core.blueprints import autoconnect
+from dimos.hardware.sensors.camera.module import CameraModule
+from dimos.perception.depth.depth_anything3.module import DA3Mode, DA3Model, DepthAnything3Module
+from dimos.visualization.rerun.bridge import rerun_bridge
+
+depth_anything3_webcam = autoconnect(
+    CameraModule.blueprint(),
+    DepthAnything3Module.blueprint(model=DA3Model.SMALL),
+    rerun_bridge(),
+)
+
+depth_anything3_webcam_temporal = autoconnect(
+    CameraModule.blueprint(),
+    DepthAnything3Module.blueprint(model=DA3Model.SMALL, mode=DA3Mode.TEMPORAL, window_frames=5),
+    rerun_bridge(),
+)
+
+__all__ = ["depth_anything3_webcam", "depth_anything3_webcam_temporal"]
diff --git a/dimos/perception/depth/depth_anything3/module.py b/dimos/perception/depth/depth_anything3/module.py
new file mode 100644
index 0000000000..ec2f1abbb9
--- /dev/null
+++ b/dimos/perception/depth/depth_anything3/module.py
@@ -0,0 +1,240 @@
+# Copyright 2025-2026 Dimensional Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import TYPE_CHECKING, Any
+
+import numpy as np
+from reactivex import operators as ops
+from reactivex.observable import Observable
+
+from dimos.core.core import rpc
+from dimos.core.module import Module, ModuleConfig
+from dimos.core.stream import In, Out
+from dimos.msgs.sensor_msgs.Image import Image, ImageFormat, sharpness_barrier
+from dimos.spec import perception
+from dimos.utils.decorators.decorators import simple_mcache
+from dimos.utils.reactive import backpressure
+
+if TYPE_CHECKING:
+    from dimos.core.module_coordinator import ModuleCoordinator
+
+
+class DA3Model(str, Enum):
+    SMALL = "depth-anything/DA3-SMALL"
+    BASE = "depth-anything/DA3-BASE"
+    LARGE = "depth-anything/DA3-LARGE-1.1"
+    METRIC = "depth-anything/DA3METRIC-LARGE"
+    GIANT = "depth-anything/DA3NESTED-GIANT-LARGE-1.1"
+
+
+class DA3Mode(str, Enum):
+    SINGLE = "single"
+    TEMPORAL = "temporal"
+    MULTI = "multi"
+
+
+class Config(ModuleConfig):
+    model: DA3Model = DA3Model.LARGE
+    mode: DA3Mode = DA3Mode.SINGLE
+    device: str = "cuda"
+    max_freq: float = 5.0
+    process_res: int = 504
+
+    # temporal mode
+    window_frames: int = 5
+    motion_threshold: float = 5.0  # min mean pixel diff to accept a new keyframe
+
+    # multi-camera mode
+    num_cameras: int = 1
+
+    # inference options
+    use_ray_pose: bool = False
+    ref_view_strategy: str = "saddle_balanced"
+
+    # outputs
+    publish_confidence: bool = False
+
+
+class DepthAnything3Module(Module[Config]):
+    default_config = Config
+
+    color_image: In[Image]
+
+    depth_image: Out[Image]
+    confidence: Out[Image]
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        super().__init__(*args, **kwargs)
+
+        from depth_anything_3.api import DepthAnything3
+
+        self._model = DepthAnything3.from_pretrained(self.config.model.value)
+        self._model = self._model.to(device=self.config.device)
+
+        # Create extra camera inputs/outputs for multi mode
+        if self.config.mode == DA3Mode.MULTI:
+            for i in range(1, self.config.num_cameras):
+                setattr(self, f"color_image_{i}", In(Image, f"color_image_{i}", self))
+                setattr(self, f"depth_image_{i}", Out(Image, f"depth_image_{i}", self))
+
+    def _predict(self, images: list[Image]) -> list[tuple[np.ndarray, np.ndarray]]:
+        """Run DA3 inference. Returns list of (depth, confidence) arrays."""
+        np_images = [img.data for img in images]
+        prediction = self._model.inference(
+            image=np_images,
+            use_ray_pose=self.config.use_ray_pose,
+            ref_view_strategy=self.config.ref_view_strategy,
+            process_res=self.config.process_res,
+        )
+        return list(zip(prediction.depth, prediction.conf, strict=False))
+
+    def _publish_result(self, image: Image, depth: np.ndarray, conf: np.ndarray) -> None:
+        """Publish depth (and optionally confidence) for a single frame."""
+        self.depth_image.publish(
+            Image(data=depth, format=ImageFormat.DEPTH, frame_id=image.frame_id, ts=image.ts)
+        )
+        if self.config.publish_confidence:
+            self.confidence.publish(
+                Image(data=conf, format=ImageFormat.DEPTH, frame_id=image.frame_id, ts=image.ts)
+            )
+
+    # -- Stream builders --
+
+    @simple_mcache
+    def _sharp_stream(self) -> Observable[Image]:
+        return backpressure(
+            self.color_image.pure_observable().pipe(
+                sharpness_barrier(self.config.max_freq),
+            )
+        )
+
+    @simple_mcache
+    def _single_stream(self) -> Observable[tuple[Image, np.ndarray, np.ndarray]]:
+        def process(img: Image) -> tuple[Image, np.ndarray, np.ndarray]:
+            results = self._predict([img])
+            depth, conf = results[0]
+            return (img, depth, conf)
+
+        return backpressure(self._sharp_stream().pipe(ops.map(process)))
+
+    @simple_mcache
+    def _temporal_stream(self) -> Observable[tuple[Image, np.ndarray, np.ndarray]]:
+        window_size = self.config.window_frames
+        threshold = self.config.motion_threshold
+
+        def accumulate_keyframes(keyframes: list[Image], img: Image) -> list[Image]:
+            if not keyframes:
+                return [img]
+            # Compare against last keyframe — mean absolute pixel difference
+            diff = float(
+                np.mean(np.abs(img.data.astype(np.float32) - keyframes[-1].data.astype(np.float32)))
+            )
+            if diff >= threshold:
+                keyframes = [*keyframes[-(window_size - 1) :], img]
+            return keyframes
+
+        def process_window(keyframes: list[Image]) -> tuple[Image, np.ndarray, np.ndarray] | None:
+            if len(keyframes) < 2:
+                return None
+            results = self._predict(keyframes)
+            depth, conf = results[-1]
+            return (keyframes[-1], depth, conf)
+
+        return backpressure(
+            self._sharp_stream().pipe(
+                ops.scan(accumulate_keyframes, []),
+                ops.map(process_window),
+                ops.filter(lambda r: r is not None),
+            )
+        )
+
+    @rpc
+    def start(self) -> None:
+        if self.config.mode == DA3Mode.SINGLE:
+            self._single_stream().subscribe(lambda r: self._publish_result(r[0], r[1], r[2]))
+
+        elif self.config.mode == DA3Mode.TEMPORAL:
+            self._temporal_stream().subscribe(lambda r: self._publish_result(r[0], r[1], r[2]))
+
+        elif self.config.mode == DA3Mode.MULTI:
+            from dimos.types.timestamped import align_timestamped
+
+            camera_streams = [self.color_image.pure_observable()]
+            for i in range(1, self.config.num_cameras):
+                port: In[Image] = getattr(self, f"color_image_{i}")
+                camera_streams.append(port.pure_observable())
+
+            aligned = align_timestamped(
+                camera_streams[0],
+                *camera_streams[1:],
+                match_tolerance=0.1,
+                buffer_size=2.0,
+            )
+
+            def process_multi(frames: tuple[Image, ...]) -> None:
+                images = list(frames)
+                results = self._predict(images)
+                # Publish depth for camera 0
+                depth, conf = results[0]
+                self._publish_result(images[0], depth, conf)
+                # Publish depth for remaining cameras
+                for i in range(1, len(results)):
+                    d, c = results[i]
+                    out: Out[Image] = getattr(self, f"depth_image_{i}")
+                    out.publish(
+                        Image(
+                            data=d,
+                            format=ImageFormat.DEPTH,
+                            frame_id=images[i].frame_id,
+                            ts=images[i].ts,
+                        )
+                    )
+                    if self.config.publish_confidence:
+                        conf_out: Out[Image] = getattr(self, f"confidence_{i}", None)  # type: ignore[assignment]
+                        if conf_out:
+                            conf_out.publish(
+                                Image(
+                                    data=c,
+                                    format=ImageFormat.DEPTH,
+                                    frame_id=images[i].frame_id,
+                                    ts=images[i].ts,
+                                )
+                            )
+
+            backpressure(aligned).subscribe(process_multi)
+
+    @rpc
+    def stop(self) -> None:
+        return super().stop()  # type: ignore[no-any-return]
+
+
+def deploy(
+    dimos: ModuleCoordinator,
+    camera: perception.Image,
+    prefix: str = "/depth_anything3",
+    **kwargs: Any,
+) -> DepthAnything3Module:
+    from dimos.core.transport import LCMTransport
+
+    module = DepthAnything3Module(**kwargs)
+    module.color_image.connect(camera.color_image)
+
+    module.depth_image.transport = LCMTransport(f"{prefix}/depth", Image)
+    module.confidence.transport = LCMTransport(f"{prefix}/confidence", Image)
+
+    module.start()
+    return module
diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py
index e82cb656ce..841b27cd4e 100644
--- a/dimos/robot/all_blueprints.py
+++ b/dimos/robot/all_blueprints.py
@@ -50,6 +50,8 @@
     "demo-object-scene-registration": "dimos.perception.demo_object_scene_registration:demo_object_scene_registration",
     "demo-osm": "dimos.mapping.osm.demo_osm:demo_osm",
     "demo-skill": "dimos.agents.skills.demo_skill:demo_skill",
+    "depth-anything3-webcam": "dimos.perception.depth.depth_anything3.blueprints.webcam:depth_anything3_webcam",
+    "depth-anything3-webcam-temporal": "dimos.perception.depth.depth_anything3.blueprints.webcam:depth_anything3_webcam_temporal",
     "drone-agentic": "dimos.robot.drone.blueprints.agentic.drone_agentic:drone_agentic",
     "drone-basic": "dimos.robot.drone.blueprints.basic.drone_basic:drone_basic",
     "dual-xarm6-planner": "dimos.manipulation.blueprints:dual_xarm6_planner",
@@ -126,6 +128,7 @@
     "navigation-skill": "dimos.agents.skills.navigation",
     "object-scene-registration-module": "dimos.perception.object_scene_registration",
     "object-tracking": "dimos.perception.object_tracker",
+    "orbslam3-module": "dimos.perception.slam.orbslam3.module",
     "osm-skill": "dimos.agents.skills.osm",
     "person-follow-skill": "dimos.agents.skills.person_follow",
     "person-tracker-module": "dimos.perception.detection.person_tracker",
diff --git a/dimos/spec/perception.py b/dimos/spec/perception.py
index 4fac65ad02..88ecf89751 100644
--- a/dimos/spec/perception.py
+++ b/dimos/spec/perception.py
@@ -30,8 +30,11 @@ class Camera(Image):
     camera_info: Out[CameraInfo]
 
 
-class DepthCamera(Camera):
+class Depth(Protocol):
     depth_image: Out[ImageMsg]
+
+
+class DepthCamera(Camera, Depth):
     depth_camera_info: Out[CameraInfo]
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 1fbd29f86f..b26d47694a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -387,6 +387,8 @@ module = [
     "annotation_protocol",
     "cyclonedds",
     "cyclonedds.*",
+    "depth_anything_3",
+    "depth_anything_3.*",
     "dimos_lcm.*",
     "etils",
     "geometry_msgs.*",