diff --git a/dimos/manipulation/blueprints.py b/dimos/manipulation/blueprints.py index 1c006c1d04..d7f53e1fa1 100644 --- a/dimos/manipulation/blueprints.py +++ b/dimos/manipulation/blueprints.py @@ -27,6 +27,7 @@ python -i -m dimos.manipulation.planning.examples.manipulation_client """ +from typing import Any import math from dimos.agents.mcp.mcp_client import McpClient @@ -37,10 +38,12 @@ from dimos.core.transport import LCMTransport from dimos.hardware.sensors.camera.realsense.camera import RealSenseCamera from dimos.manipulation.manipulation_module import ManipulationModule +from dimos.manipulation.memory2 import LazyPerceptionModule, RGBDCameraRecorder from dimos.manipulation.pick_and_place_module import PickAndPlaceModule from dimos.msgs.geometry_msgs.Quaternion import Quaternion from dimos.msgs.geometry_msgs.Transform import Transform from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo from dimos.msgs.sensor_msgs.JointState import JointState from dimos.perception.object_scene_registration import ObjectSceneRegistrationModule from dimos.robot.catalog.ufactory import xarm6 as _catalog_xarm6, xarm7 as _catalog_xarm7 @@ -338,8 +341,200 @@ ) +# ============================================================================ +# XArm6 perception — memory2-native pipeline (open-vocab VLM, no tracker) +# ============================================================================ +# Mirrors the xArm7 xarm_perception blueprint but uses the new memory2-native +# perception modules (RGBDCameraRecorder + LazyPerceptionModule) in place of +# ObjectSceneRegistrationModule. RGBDCameraRecorder also runs the continuous +# CLIP embed pipeline (no separate SemanticSearch module — see spec.py for the +# "why one module" reasoning). The PickAndPlaceModule API is unchanged — its +# `objects: In[list[Object]]` port is now fed by LazyPerceptionModule.objects. +# +# TF chain: world → link6 (ManipulationModule) → camera_link (RealSense) +# Usage: chain coordinator + agent in ONE dimos invocation. dimos's CLI takes +# multiple blueprint names and autoconnect()s them into one process — separate +# invocations conflict on dimos's internal port reservation (check_port_conflicts). +# XARM6_IP= OPENAI_API_KEY= dimos run coordinator-xarm6 xarm6-perception-agent +# Without the coordinator the arm won't actually move (joint_state stream has no +# producer otherwise). +# +# Camera transform: PLACEHOLDER reusing xArm7 values. Replace with measured +# hand-eye calibration for the xArm6 mount on first hardware run if positions +# returned by find_objects are off by a consistent offset. 
+_XARM6_PERCEPTION_CAMERA_TRANSFORM = Transform(
+    translation=Vector3(x=0.06693724, y=-0.0309563, z=0.00691482),
+    rotation=Quaternion(0.70513398, 0.00535696, 0.70897578, -0.01052180),  # xyzw
+)
+
+_xarm6_perception_cfg = _catalog_xarm6(
+    name="arm",
+    adapter_type="xarm" if global_config.xarm6_ip else "mock",
+    address=global_config.xarm6_ip,
+    pitch=math.radians(0),
+    add_gripper=True,
+    tf_extra_links=["link6"],
+)
+
+def _convert_color_camera_info(ci: CameraInfo):
+    return ci.to_rerun(
+        image_topic="world/color_image",
+        optical_frame="camera_color_optical_frame",
+    )
+
+def _convert_depth_camera_info(ci: CameraInfo):
+    return ci.to_rerun(
+        image_topic="world/depth_image",
+        optical_frame="camera_depth_optical_frame",
+    )
+
+xarm6_perception = (
+    autoconnect(
+        PickAndPlaceModule.blueprint(
+            robots=[_xarm6_perception_cfg.to_robot_model_config()],
+            planning_timeout=10.0,
+            enable_viz=True,
+            floor_z=-0.02,
+        ),
+        RealSenseCamera.blueprint(
+            base_frame_id="link6",
+            base_transform=_XARM6_PERCEPTION_CAMERA_TRANSFORM,
+        ),
+        # Memory2-native perception pipeline — replaces ObjectSceneRegistrationModule.
+        # RGBDCameraRecorder records color/depth/intrinsics AND continuously
+        # CLIP-embeds color frames into color_image_embedded.
+        # db_path is xArm6-specific to silo from other memory2 users and to avoid
+        # picking up stale embeddings from a different camera mount.
+        # Resume-if-exists: delete recording_xarm6.db manually for a clean slate.
+        RGBDCameraRecorder.blueprint(db_path="recording_xarm6.db"),
+        LazyPerceptionModule.blueprint(db_path="recording_xarm6.db"),
+        vis_module(
+            global_config.viewer,
+            rerun_config={
+                "visual_override": {
+                    "world/camera_info": _convert_color_camera_info,
+                    "world/depth_camera_info": _convert_depth_camera_info,
+                }
+            },
+        ),
+    )
+    .transports(
+        {
+            ("joint_state", JointState): LCMTransport("/coordinator/joint_state", JointState),
+        }
+    )
+    .global_config(n_workers=4)
+)
+
+
+# XArm6 perception + LLM agent. Three perception skills (find_objects,
+# find_objects_near, recall) replace the previous look / scan_objects from
+# the tracker-based pipeline. The agent reads the (seen Ns ago) timestamp
+# in each response and decides whether to re-query before acting.
+# Usage: chain coordinator + agent in ONE dimos invocation. dimos's CLI takes
+# multiple blueprint names and autoconnect()s them into one process — separate
+# invocations conflict on dimos's internal port reservation (check_port_conflicts).
+# XARM6_IP= OPENAI_API_KEY= dimos run coordinator-xarm6 xarm6-perception-agent
+# Without the coordinator the arm won't actually move (joint_state stream has no
+# producer otherwise).
+_MANIPULATION_AGENT_SYSTEM_PROMPT_XARM6 = """\
+You are a robotic manipulation assistant controlling an xArm6 robot arm with an \
+eye-in-hand RealSense camera and a gripper.
+
+# Skills
+
+## Perception
+- **find_objects <prompt>**: Find objects matching the prompt in the current scene. \
+Returns 3D positions and a "(seen Ns ago)" timestamp so you can judge freshness. \
+Prompt is any natural language — use spatial qualifiers ("the cup on the right") \
+or attributes ("the red mug") to disambiguate when multiple instances exist. \
+Example: "what cups are there?" → find_objects("cup"); "find the red mug" → \
+find_objects("red mug").
+
+- **find_objects_near <prompt> <x> <y> <z> [radius]**: Same as find_objects but \
+narrows to frames captured when the camera was within `radius` meters of (x, y, z). \
+Use when you know which workspace area to scan — cheaper than a full-scene query. \
+Example: "find a screwdriver near the workbench at 1.0, 0.5, 0.8" → \
+find_objects_near("screwdriver", 1.0, 0.5, 0.8, 0.5).
+
+- **recall <name>**: Where did I last see something matching <name>? Cheaper than \
+find_objects (no VLM; returns the camera pose at the matching frame plus a timestamp), \
+and searches the full session history including previous process runs. Use for "where \
+was the cup earlier?" / "have I ever seen a wrench?" type questions.
+
+## Pick & Place
+- **pick <object_name>**: Pick up an object that find_objects most recently \
+detected. Use the EXACT name from find_objects output. When multiple instances \
+match, add a spatial or attribute qualifier in your find_objects prompt ("cup on \
+the right") so the VLM returns just one — then pick operates unambiguously.
+- **place <x> <y> <z>**: Place a held object at explicit world-frame coordinates. \
+Example: "place it at 0.4, 0.3, 0.1"
+- **drop_on <target_object>**: Drop a held object onto another detected object. \
+Automatically compensates for camera occlusion. Example: "drop it in the bowl".
+- **place_back**: Return a held object to its original pick position.
+- **pick_and_place <object_name> <x> <y> <z>**: Pick then place in one command.
+
+## Motion
+- **move_to_pose <x> <y> <z> [roll pitch yaw]**: Move end-effector to an absolute \
+world-frame pose (meters / radians).
+- **move_to_joints <j1> <j2> <j3> <j4> <j5> <j6>**: Move to a 6-DOF joint configuration (radians).
+- **go_home**: Move to the home/observe position.
+- **go_init**: Return to startup position. Use after pick/place as a safe rest pose.
+
+## Gripper
+- **open_gripper / close_gripper / set_gripper**: Direct gripper control.
+
+## Status & Recovery
+- **get_robot_state**: Current joint positions, end-effector pose, gripper state.
+- **get_scene_info**: Latest detection snapshot (from the most recent find_objects call).
+- **reset**: Clear a FAULT state and return to IDLE.
+- **clear_perception_obstacles**: Remove detected obstacles from the planning world. \
+Use when planning fails with COLLISION_AT_START.
+
+# Freshness and re-querying
+- Every find_objects / find_objects_near response includes "(seen Ns ago)".
+- A few seconds old → fresh, safe to act on.
+- Minutes old → scene may have changed; re-query before acting.
+- If you just picked or placed an object, the scene changed — re-query before the \
+next action.
+
+# Disambiguation (multiple instances)
+- If find_objects returns multiple positions, either:
+  1. Re-query with a spatial/attribute qualifier ("cup on the right", "red cup"), or
+  2. Read the positions yourself and choose by reasoning (e.g., "right" = larger Y; \
+"closest" = smallest distance from gripper).
+- Don't rely on identity across calls — find_objects always reflects the most recent \
+detection. The same name in two calls may be the same physical object or different ones.
+
+# Rules
+- Use the EXACT object name from find_objects output. Do NOT substitute similar names.
+- "drop it in/on [object]" → use **drop_on**. "place it at [coords]" → use **place**.
+- "bring it back" → pick, then **go_init**.
+- "bring it to me" / "hand it over" → pick, then move toward user (≈ X=0, Y=0.5).
+- NEVER open the gripper while holding an object unless executing place / drop_on, \
+or when the user explicitly asks.
+- After pick or place, return to init with **go_init** unless another action follows.
+
+# Coordinate System
+World frame (meters): X = forward, Y = left, Z = up. Z = 0 is robot base.
+Typical working area: X 0.3–0.7, Y −0.5 to 0.5, Z 0.05–0.5.
+ +# Error Recovery +If planning fails with COLLISION_AT_START: call **clear_perception_obstacles**, then \ +**reset**, then retry. +""" + +xarm6_perception_agent = autoconnect( + xarm6_perception, + McpServer.blueprint(), + McpClient.blueprint(system_prompt=_MANIPULATION_AGENT_SYSTEM_PROMPT_XARM6), +) + + __all__ = [ "dual_xarm6_planner", + "xarm6_perception", + "xarm6_perception_agent", "xarm6_planner_only", "xarm7_planner_coordinator", "xarm7_planner_coordinator_agent", diff --git a/dimos/manipulation/memory2/__init__.py b/dimos/manipulation/memory2/__init__.py new file mode 100644 index 0000000000..59906b9c29 --- /dev/null +++ b/dimos/manipulation/memory2/__init__.py @@ -0,0 +1,24 @@ +"""Manipulation × memory2 integration. + +Three agent-callable perception skills built on memory2 primitives: +``find_objects``, ``find_objects_near``, ``recall``. See ``spec.py`` +for the architecture docstring, configs, and Protocol definitions. +""" + +from dimos.manipulation.memory2.lazy_perception import LazyPerceptionModule +from dimos.manipulation.memory2.recorder import RGBDCameraRecorder +from dimos.manipulation.memory2.spec import ( + LazyPerceptionModuleConfig, + LazyPerceptionModuleSpec, + RGBDCameraRecorderConfig, + RGBDCameraRecorderSpec, +) + +__all__ = [ + "LazyPerceptionModule", + "LazyPerceptionModuleConfig", + "LazyPerceptionModuleSpec", + "RGBDCameraRecorder", + "RGBDCameraRecorderConfig", + "RGBDCameraRecorderSpec", +] diff --git a/dimos/manipulation/memory2/lazy_perception.py b/dimos/manipulation/memory2/lazy_perception.py new file mode 100644 index 0000000000..718e8af159 --- /dev/null +++ b/dimos/manipulation/memory2/lazy_perception.py @@ -0,0 +1,277 @@ +"""LazyPerceptionModule — agent-callable open-vocab perception. + +Three @skill methods, each a one-line composition of memory2 query +primitives. See ``spec.py`` for the architecture docstring. +""" + +from __future__ import annotations + +from collections.abc import Callable +import time +from typing import Any + +from dimos.agents.annotation import skill +from dimos.core.core import rpc +from dimos.core.stream import Out +from dimos.manipulation.memory2.spec import LazyPerceptionModuleConfig +from dimos.memory2.module import MemoryModule +from dimos.models.embedding.base import EmbeddingModel +from dimos.models.vl.base import VlModel +from dimos.msgs.geometry_msgs.Quaternion import Quaternion +from dimos.msgs.geometry_msgs.Transform import Transform +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.perception.detection.type.detection3d.object import Object as DetObject +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +def _relative_time(ts: float) -> str: + """Format a past timestamp as a human-readable age.""" + delta = max(0.0, time.time() - ts) + if delta < 60: + return f"{int(delta)}s ago" + if delta < 3600: + return f"{int(delta // 60)}min ago" + return f"{int(delta // 3600)}h ago" + + +class LazyPerceptionModule(MemoryModule): + """Lazy memory2-native open-vocab object detector. + + Three skills, each a one-line memory2 composition. Stateless: + every call is an independent query → VLM (find_objects only) + → 3D → publish. 
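+
+    Illustrative call shapes (coordinates and prompts are assumed example
+    values; the wording mirrors the format strings in this module):
+
+        find_objects("red mug")
+            -> "Found 1 object(s) matching 'red mug' (seen 3s ago): ..."
+        find_objects_near("screwdriver", 0.5, 0.2, 0.1, radius=0.5)
+            -> "Found 1 object(s) matching 'screwdriver' near (0.50, 0.20, 0.10) within 0.5m (seen 12s ago): ..."
+        recall("wrench")
+            -> "Last saw 'wrench' with camera near (0.52, -0.10, 0.31) (4min ago)."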
+ """ + + config: LazyPerceptionModuleConfig + + objects: Out[list[DetObject]] + + _vlm: VlModel | None = None + _clip: EmbeddingModel | None = None + + @rpc + def start(self) -> None: + super().start() + self._vlm = self.register_disposable(self.config.vlm_provider()) + self._vlm.start() + self._clip = self.register_disposable(self.config.embedding_model()) + self._clip.start() + + # ------------------------------------------------------------------ skills + + @skill + def find_objects(self, prompt: str) -> str: + """Find objects matching ``prompt``. Returns most recent confident + match with timestamp. + + Open-vocab: ``prompt`` can be any natural language. Comma-separated + prompts are split and processed per class because Moondream's + ``query_detections`` labels every result with the literal query + string. + """ + prompt_list = [p.strip() for p in prompt.split(",") if p.strip()] + if not prompt_list: + self.objects.publish([]) + return "No prompts provided." + + all_objects: list[DetObject] = [] + most_recent_ts: float | None = None + for single in prompt_list: + objs, ts = self._find_and_project( + single, + build_query=lambda stream, vec: stream.search(vec), + ) + all_objects.extend(objs) + if ts is not None and (most_recent_ts is None or ts > most_recent_ts): + most_recent_ts = ts + + self.objects.publish(all_objects) + if not all_objects: + return f"No confident '{prompt}' match in memory." + + age = _relative_time(most_recent_ts) if most_recent_ts is not None else "unknown age" + lines = [self._fmt_object_line(o) for o in all_objects] + return f"Found {len(all_objects)} object(s) matching '{prompt}' (seen {age}):\n" + "\n".join(lines) + + @skill + def find_objects_near( + self, + prompt: str, + x: float, + y: float, + z: float, + radius: float = 1.0, + ) -> str: + """Find objects matching ``prompt`` in frames recorded when the + camera was within ``radius`` meters of ``(x, y, z)``. + + ``.near()`` filters by the camera's pose at record time, NOT by + the detected object's position. Note: applied as a Python + post-filter after vector search; R*Tree pre-gating of the + vector index is a memory2 follow-up. + """ + prompt_list = [p.strip() for p in prompt.split(",") if p.strip()] + if not prompt_list: + self.objects.publish([]) + return "No prompts provided." + + pose = (x, y, z) + all_objects: list[DetObject] = [] + most_recent_ts: float | None = None + for single in prompt_list: + objs, ts = self._find_and_project( + single, + build_query=lambda stream, vec: stream.near(pose, radius).search(vec), + ) + all_objects.extend(objs) + if ts is not None and (most_recent_ts is None or ts > most_recent_ts): + most_recent_ts = ts + + self.objects.publish(all_objects) + if not all_objects: + return f"No confident '{prompt}' match near ({x:.2f}, {y:.2f}, {z:.2f}) within {radius}m." + + age = _relative_time(most_recent_ts) if most_recent_ts is not None else "unknown age" + lines = [self._fmt_object_line(o) for o in all_objects] + return ( + f"Found {len(all_objects)} object(s) matching '{prompt}' " + f"near ({x:.2f}, {y:.2f}, {z:.2f}) within {radius}m (seen {age}):\n" + + "\n".join(lines) + ) + + @skill + def recall(self, name: str) -> str: + """Where did I last see something matching ``name``? + + Cheaper than ``find_objects``: no VLM, no 3D projection. Returns + the camera pose at the most recent confident semantic match plus + timestamp. Works across process restarts because memory2's SQLite + store is the persistence layer. + """ + if self._clip is None: + return f"No memory of '{name}'." 
+ try: + vec = self._clip.embed_text(name) + obs = ( + self.store.streams.color_image_embedded + .search(vec) + .filter(lambda o: (o.similarity or 0) >= self.config.min_similarity) + .order_by("ts", desc=True) + .first() + ) + except (AttributeError, LookupError): + return f"No memory of '{name}'." + except Exception as e: + logger.warning("recall(%r) failed: %s", name, e) + return f"No memory of '{name}'." + + age = _relative_time(obs.ts) + if obs.pose: + x, y, z = obs.pose[0], obs.pose[1], obs.pose[2] + return f"Last saw '{name}' with camera near ({x:.2f}, {y:.2f}, {z:.2f}) ({age})." + return f"Last saw '{name}' (camera pose unknown) ({age})." + + # ----------------------------------------------------------- internal + + def _find_and_project( + self, + prompt: str, + build_query: Callable[[Any, Any], Any], + ) -> tuple[list[DetObject], float | None]: + """Composed memory2 pipeline for ONE prompt class. + + Returns (objects, observation_ts). ``build_query`` is a callable + ``(stream, query_vec) -> filtered Stream`` so find_objects / + find_objects_near can share the rest of the pipeline. + """ + if self._clip is None: + return [], None + try: + vec = self._clip.embed_text(prompt) + obs = ( + build_query(self.store.streams.color_image_embedded, vec) + .filter(lambda o: (o.similarity or 0) >= self.config.min_similarity) + .order_by("ts", desc=True) + .first() + ) + except (AttributeError, LookupError): + return [], None + except Exception as e: + logger.warning("_find_and_project(%r): %s", prompt, e) + return [], None + + return self._detect_and_project_one(obs, prompt), obs.ts + + def _detect_and_project_one(self, color_obs: Any, prompt: str) -> list[DetObject]: + """VLM detection + 3D projection for ONE peak frame, ONE prompt class.""" + if self._vlm is None: + return [] + + # Aligned depth + latest intrinsics. .first()/.last() raise LookupError on empty. + try: + depth_obs = self.store.streams.depth_image.at( + color_obs.ts, tolerance=0.1 + ).first() + info_obs = self.store.streams.camera_info.last() + except LookupError: + logger.warning("missing depth/info near ts=%.3f", color_obs.ts) + return [] + except AttributeError: + logger.warning("depth_image or camera_info stream not yet available") + return [] + + # VLM can hang (network) or OOM (CUDA). Treat failure as no-detection. + try: + dets_2d = self._vlm.query_detections(color_obs.data, prompt) + except Exception as e: + logger.warning("VLM failed (prompt=%r ts=%.3f): %s", prompt, color_obs.ts, e) + return [] + if not dets_2d.detections: + return [] + + camera_transform = self._camera_transform_from_pose(color_obs.pose) + try: + # from_2d_to_list's annotation says ImageDetections2D[Detection2DSeg] + # but the implementation at object.py:205-215 handles bbox-only + # detections too (synthesizes a rectangular mask). Moondream returns + # bbox-only — works at runtime. + return DetObject.from_2d_to_list( + detections_2d=dets_2d, # type: ignore[arg-type] + color_image=color_obs.data, + depth_image=depth_obs.data, + camera_info=info_obs.data, + camera_transform=camera_transform, + max_distance=self.config.max_distance, + use_aabb=self.config.use_aabb, + max_obstacle_width=self.config.max_obstacle_width, + ) + except Exception as e: + logger.warning("from_2d_to_list failed at ts=%.3f: %s", color_obs.ts, e) + return [] + + def _camera_transform_from_pose(self, pose: Any) -> Transform | None: + """Build a Transform from the recorder's pose-stamped observation. 
+ + Observation.pose is always a 7-tuple (x, y, z, qx, qy, qz, qw) or None + (memory2/observationstore/sqlite.py:_decompose_pose). + Object.from_2d_to_list(camera_transform=T) applies T to a camera-frame + pointcloud to produce world-frame output — T is camera→world; the + recorder's pose-in-world IS that. NO .inverse(). + """ + if pose is None: + return None + try: + x, y, z, qx, qy, qz, qw = pose + except (TypeError, ValueError): + logger.warning("unexpected pose shape: %r", type(pose).__name__) + return None + return Transform( + translation=Vector3(x, y, z), + rotation=Quaternion(qx, qy, qz, qw), + ) + + @staticmethod + def _fmt_object_line(o: DetObject) -> str: + return f" - {o.name} at ({o.center.x:.2f}, {o.center.y:.2f}, {o.center.z:.2f})" diff --git a/dimos/manipulation/memory2/recorder.py b/dimos/manipulation/memory2/recorder.py new file mode 100644 index 0000000000..ad4794b7c1 --- /dev/null +++ b/dimos/manipulation/memory2/recorder.py @@ -0,0 +1,100 @@ +"""RGBDCameraRecorder — Recorder + continuous CLIP embed for RGBD streams. + +Two things this module does on top of the base ``Recorder`` pattern at +``dimos/robot/unitree/go2/blueprints/smart/unitree_go2.py:49-53``: + +1. **Continuous CLIP embed pipeline.** Same logic as ``memory2.SemanticSearch``, + but co-located here so it shares the recorder's store. See ``spec.py`` for + the "why in one module" reasoning (memory2's SubjectNotifier is per-store). + +2. **Resume-if-exists semantics.** ``start()`` overrides ``Recorder.start()`` + to skip the overwrite-or-fail gate. If ``recording.db`` exists, we open it + and append (cross-session memory). If it doesn't, SQLite creates it fresh. + No flag flipping, no manual file deletion between runs. Users wanting a + clean slate delete the file themselves. +""" + +from __future__ import annotations + +from typing import Any + +from dimos.core.core import rpc +from dimos.core.stream import In +from dimos.manipulation.memory2.spec import RGBDCameraRecorderConfig +from dimos.memory2.embed import EmbedImages +from dimos.memory2.module import Recorder +from dimos.memory2.stream import Stream as MemoryStream +from dimos.memory2.transform import QualityWindow +from dimos.models.embedding.base import EmbeddingModel +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class RGBDCameraRecorder(Recorder): + """Records RGBD streams (color + depth + intrinsics) and continuously + CLIP-embeds qualifying color frames into ``color_image_embedded``.""" + + color_image: In[Image] + depth_image: In[Image] + camera_info: In[CameraInfo] + + config: RGBDCameraRecorderConfig + + _embedding_model: EmbeddingModel | None = None + _embeddings: MemoryStream[Any] | None = None + + @rpc + def start(self) -> None: + + super(Recorder, self).start() + + if self.config.g.replay: + logger.info( + "Replay mode active — recording disabled, leaving %s untouched", + self.config.db_path, + ) + return + + if not self.inputs: + logger.warning("RGBDCameraRecorder has no In ports — nothing to record") + return + + # Replicate Recorder.start()'s port-to-stream registration. Touching + # self.store lazily opens the SqliteStore at db_path (existing file or new). + # + # depth_image MUST use a lossless codec. 
memory2's codec auto-dispatch + # maps every `Image`-typed stream to JpegCodec (lossy 8-bit DCT) — correct + # for color RGB, catastrophic for uint16 depth (millimeters get shredded, + # then 3D projection finds no coherent points and find_objects returns + # nothing). We use memory2's documented per-stream `codec=` override + # (store/base.py: "Per-stream overrides ... codec=") to record depth as + # lossless lz4+lcm (raw Image.lcm_encode, no pixel loss). Color stays + # JPEG — lossy is fine for RGB and keeps the DB small. + for name, port in self.inputs.items(): + if name == "depth_image": + stream: MemoryStream[Any] = self.store.stream( + name, port.type, codec="lz4+lcm" + ) + else: + stream = self.store.stream(name, port.type) + self._port_to_stream(name, port, stream) + logger.info("Recording %s (%s)", name, port.type.__name__) + + # Continuous CLIP embed — same store, same notifier, .live() works. + self._embedding_model = self.register_disposable(self.config.embedding_model()) + self._embedding_model.start() + self._embeddings = self.store.stream("color_image_embedded", Image) + + # fmt: off + self.store.streams.color_image \ + .live() \ + .filter(lambda obs: obs.data.brightness > 0.1) \ + .transform(QualityWindow(lambda img: img.sharpness, window=0.5)) \ + .transform(EmbedImages(self._embedding_model, batch_size=2)) \ + .save(self._embeddings) \ + .drain_thread() + # fmt: on + logger.info("RGBDCameraRecorder: continuous CLIP embed pipeline active") diff --git a/dimos/manipulation/memory2/spec.py b/dimos/manipulation/memory2/spec.py new file mode 100644 index 0000000000..46830fc245 --- /dev/null +++ b/dimos/manipulation/memory2/spec.py @@ -0,0 +1,322 @@ +""" Memory2-native perception for manipulation. + + +Architecture +------------ + +:: + + camera + | + v + RGBDCameraRecorder + ├── records: color_image, depth_image, camera_info + └── continuous CLIP embed (brightness/sharpness filtered) + → color_image_embedded + | + ┌──────────── pulled on trigger (read-only) ──┐ + v | + LazyPerceptionModule | + | @skill find_objects(prompt) | + | .search(vec).filter(sim>=thr).order_by("ts", desc).first() + | → VLM detect → 3D project | + | @skill find_objects_near(prompt, x, y, z, radius) | + | .near((x,y,z), r).search(vec).filter(...).order_by(...).first() + | → VLM detect → 3D project | + | @skill recall(name) | + | .search(vec).filter(sim>=thr).order_by("ts", desc).first() + | → return camera pose + timestamp (no VLM, cheaper) + | | + | objects: Out[list[DetObject]] | + v | + PickAndPlaceModule (manipulation — In[objects]) | + | + Shared SQLite store (recording.db) ─────────────────────┘ + Recorder writes; LazyPerceptionModule opens its own SqliteStore + on the same file (read-only via WAL). + + Why recording + embedding in ONE module: memory2's SubjectNotifier + is in-memory per-store, so a separate-module embedder couldn't + receive .live() notifications from the recorder's writes even on + the same db_path. Keeping them in one module = one store = one + notifier = live pipeline works. The LazyPerceptionModule reads + only at skill-call time (no .live()), so cross-instance SQLite + reads (WAL) are sufficient there. + + +Three skills, three query shapes +-------------------------------- + +Each skill is a one-line composition of memory2 primitives. Every +skill returns the **most recent confident match** along with its +timestamp — the agent sees how fresh the data is and decides if it's +fresh enough to act on. 
No time-window parameter on the API; "current" +is whatever memory2 has most recently. + +- ``find_objects(prompt)`` — **semantic + most recent**. Search the + embedding stream, filter to confident matches, take the most recent. + Run VLM + 3D projection on that frame. Returns ``list[Object]`` (via + the ``objects`` port) plus formatted summary with timestamp. + +- ``find_objects_near(prompt, x, y, z, radius)`` — **spatial + semantic + + most recent**. Same as ``find_objects`` plus a camera-pose proximity + filter. NOTE on indexing: memory2 currently runs vector ``.search()`` + as a top-K vec0 query and applies the spatial ``.near()`` predicate in + Python afterward — the R*Tree index does NOT pre-filter the embedding + search. Correct results, but for very long recordings the top-K cap + can elide spatially-near low-similarity frames. Pushing R*Tree + pre-filtering through ``Backend._vector_search`` is a memory2 follow-up. + +- ``recall(name)`` — **cheaper cousin of find_objects**. Same query + shape but skips VLM detection and 3D projection; returns the camera + pose at the matching frame plus timestamp. Use for "where was I when + I saw X" questions where exact 3D object pose isn't needed. + +**Freshness contract is in the response.** Every result includes +``(seen N seconds ago)`` so the agent can reason: "2 seconds is fresh +— act"; "5 minutes is stale — re-query or skip." + + +The lazy pipeline +------------------ + +All three skills share the same shape: + +1. CLIP-embed the prompt → query vector. + +2. Build a memory2 query on ``color_image_embedded`` by composing the + relevant filters (``.near`` / ``.search`` / ``.filter`` / + ``.order_by``). ``.search`` pushes through the vec0 vector index; + tag/time/SQL-expressible filters push to SQL. ``.near`` runs as a + Python post-filter after vector search (R*Tree pre-gating of vector + search is a memory2 follow-up). + +3. Take ``.first()`` — the most recent confident match. Single + observation. + +4. (find_objects / find_objects_near only:) Pull aligned depth via + ``.at(obs.ts, tolerance)`` and latest intrinsics via ``.last()``. + Run VLM detection on the color frame, project via + ``Object.from_2d_to_list`` (camera→world transform from the + recorder's pose). + +5. Publish ``list[Object]`` on ``objects``. Return a formatted summary + that **includes the observation timestamp** so the agent can read + freshness. + +**Snapshot contract.** Every publish on ``objects`` replaces +``PickAndPlaceModule._detection_snapshot`` — the cache ``pick`` / +``place`` read. So: + +- A successful ``find_objects("cup")`` → snapshot = [cup objects]; ``pick("cup")`` works. +- An interleaved ``find_objects("apple")`` returning 0 matches → snapshot = []; later + ``pick("cup")`` fails ("not found"). Re-query before acting. +- This is intentional. Each ``find_objects`` call is a fresh world model, not an + accumulating one. The agent prompt makes this explicit. + +``PickAndPlaceModule`` reads ``objects`` to act on named +targets. Multiple instances in the result are handled either by +prompt qualifiers ("cup on the right" → VLM returns one detection in +the frame) or by the agent reading the returned positions and +choosing. 
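+
+A condensed sketch of the query composition in steps 1-3 above; it mirrors
+``_find_and_project`` in ``lazy_perception.py`` (``min_similarity`` is the
+``LazyPerceptionModuleConfig`` default)::
+
+    vec = clip.embed_text(prompt)
+    obs = (
+        store.streams.color_image_embedded
+        .search(vec)                                   # vec0 similarity search
+        .filter(lambda o: (o.similarity or 0) >= min_similarity)
+        .order_by("ts", desc=True)
+        .first()                                       # most recent confident match
+    )
+    # find_objects_near only: prepend .near((x, y, z), radius) before .search(vec)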
+ + +Streams (recorded continuously) +------------------------------- + +:: + + color_image JPEG-encoded RGB frames + world pose + depth_image depth frames, LOSSLESS lz4+lcm codec + camera_info intrinsics (rarely change, .last() suffices) + color_image_embedded CLIP vector + image bytes, vec0-indexed + for similarity search (populated by the + recorder's continuous embed pipeline) + +**depth_image MUST be recorded losslessly.** memory2's codec +auto-dispatch maps every ``Image``-typed stream to ``JpegCodec`` +(lossy 8-bit DCT) — correct for RGB color, but it shreds uint16 +depth (millimeters). JPEG'd depth → no coherent 3D points → +``from_2d_to_list`` returns ``[]`` → ``find_objects`` silently +returns nothing even though CLIP + VLM succeeded. ``RGBDCameraRecorder`` +overrides the depth stream's codec to ``lz4+lcm`` via memory2's +public per-stream ``codec=`` override (no memory2 changes). The codec +is persisted per-stream in the SQLite registry, so an existing +recording created before this fix keeps its JPEG depth — delete the +db file once to recreate the stream with the lossless codec. + + +Open vocab + cross-session memory +--------------------------------- + +The agent passes any natural-language description to ``find_objects`` +or ``find_objects_near`` (``"red mug with handle"``, +``"the screwdriver near the laptop"``). The VLM handles visual +disambiguation; the agent can encode spatial qualifiers in the prompt, +or call ``find_objects_near`` for camera-pose-bounded queries (Python +spatial post-filter; see the ``find_objects_near`` note above for the +indexing limitation). + +Cross-session memory is implicit: memory2 persists every observation +in ``recording.db``. Skills query the full embedding history regardless +of when the current process started — no replay logic, no bespoke +loaders. ``recall(name)`` returning "(seen 3 hours ago)" is normal and +correct after a process restart; the agent reads the timestamp and +decides. + +``RGBDCameraRecorder`` uses **resume-if-exists**: it opens ``recording.db`` +if the file is there (cross-session memory works out of the box) and +creates it fresh if it isn't. ``RecorderConfig.overwrite`` is inert in +this subclass — see ``recorder.py``. Users wanting a clean slate delete +the file manually before launching. 
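+
+Blueprint wiring, condensed from ``xarm6_perception`` in
+``dimos/manipulation/blueprints.py``: recorder and perception module point at
+the same ``db_path``, so continuous writes and skill-time reads hit one
+SQLite file::
+
+    RGBDCameraRecorder.blueprint(db_path="recording_xarm6.db"),
+    LazyPerceptionModule.blueprint(db_path="recording_xarm6.db"),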
+ +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Protocol, runtime_checkable + +from dimos.memory2.module import MemoryModuleConfig, RecorderConfig +from dimos.models.embedding.base import EmbeddingModel +from dimos.models.embedding.clip import CLIPModel +from dimos.models.vl.base import VlModel +from dimos.models.vl.moondream import MoondreamVlModel + +if TYPE_CHECKING: + from dimos.agents.annotation import skill # noqa: F401 (referenced in docstrings) + from dimos.core.stream import In, Out + from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo + from dimos.msgs.sensor_msgs.Image import Image + from dimos.perception.detection.type.detection3d.object import Object as DetObject + + +# ============================================================================= +# RGBDCameraRecorder — records color / depth / intrinsics to memory2 +# ============================================================================= + + +class RGBDCameraRecorderConfig(RecorderConfig): + """Config for the RGBD-camera recorder + continuous embedder.""" + + db_path: str | Path = "recording.db" + embedding_model: type[EmbeddingModel] = CLIPModel + + +@runtime_checkable +class RGBDCameraRecorderSpec(Protocol): + """Protocol for the RGBD-camera recorder + continuous embedder.""" + + config: RGBDCameraRecorderConfig + + color_image: In[Image] + depth_image: In[Image] + camera_info: In[CameraInfo] + + +# ============================================================================= +# LazyPerceptionModule — agent-callable open-vocab perception +# ============================================================================= + + +class LazyPerceptionModuleConfig(MemoryModuleConfig): + """Config for the lazy perception module.""" + + db_path: str | Path = "recording.db" + + # Models (blueprint can override) + vlm_provider: type[VlModel] = MoondreamVlModel + embedding_model: type[EmbeddingModel] = CLIPModel + + # Similarity threshold — observations below this don't count as a + # confident match for find_objects / find_objects_near. + min_similarity: float = 0.20 + + # 3D projection knobs threaded into ``Object.from_2d_to_list``. + max_distance: float = 1.0 + use_aabb: bool = True + max_obstacle_width: float = 0.06 + + +@runtime_checkable +class LazyPerceptionModuleSpec(Protocol): + """Protocol for the lazy perception module.""" + + config: LazyPerceptionModuleConfig + + objects: Out[list[DetObject]] + """Wired by the blueprint to ``PickAndPlaceModule.objects``. Cache + of the most recent ``find_objects`` / ``find_objects_near`` result. + Manipulation reads from here to act on the named target. Named + ``objects`` to match the existing manipulation In port for + autoconnect-by-name.""" + + def find_objects(self, prompt: str) -> str: + """Find objects matching ``prompt``. Returns the most recent + confident match with timestamp. + + Decorated ``@skill`` in the implementation. Composes + ``.search()`` + ``.filter(similarity >= min_similarity)`` + + ``.order_by("ts", desc=True)`` + ``.first()`` over + ``color_image_embedded``, then runs VLM detection on the + matching frame and projects to 3D. Publishes ``list[Object]`` + on the ``objects`` port. + + Open-vocab: ``prompt`` can be any natural language + (``"red mug"``, ``"the cup on the right"``). Comma-separated + prompts are split and processed per class because VLM + ``query_detections`` labels every result with the literal + query string. 
+ + Returns a human-readable summary including the observation + timestamp (``"(seen 3s ago)"``) so the agent can judge + freshness. Returns a no-match message when no observation + meets ``min_similarity``. + """ + ... + + def find_objects_near( + self, + prompt: str, + x: float, + y: float, + z: float, + radius: float = 1.0, + ) -> str: + """Find objects matching ``prompt``, in frames recorded when + the camera was within ``radius`` meters of ``(x, y, z)``. + + Decorated ``@skill`` in the implementation. Same query shape + as ``find_objects`` plus an extra ``.near((x, y, z), radius)`` + predicate. ``.near()`` runs as a Python post-filter today — + memory2's R*Tree index does not pre-gate the vector search. + Take most recent confident match → VLM → 3D project → publish + on the ``objects`` port. + + ``.near()`` filters by the camera's pose at record time, NOT + by the detected object's position. Use for "frames captured + while looking at this workspace area"; object-position + filtering is downstream of detection. + """ + ... + + def recall(self, name: str) -> str: + """Where did I last see something matching ``name``? + + Decorated ``@skill`` in the implementation. Cheaper cousin of + ``find_objects``: composes ``.search()`` + + ``.order_by("ts", desc=True)`` + ``.first()`` over + ``color_image_embedded`` — returns the most recent confident + semantic match across the full embedding history, but **does + not run VLM**. Returns the camera pose at the matching frame + plus timestamp. + + Works after process restart because memory2's SQLite store is + the persistence layer; the query touches the same db that + previous sessions wrote. + + Returns a human-readable summary with timestamp + (``"Last saw 'cup' near (1.0, 0.5, 0.9) (seen 3min ago)"``), or + ``"No memory of ."`` when nothing matches. + """ + ... diff --git a/dimos/manipulation/memory2/test_perception_stack.py b/dimos/manipulation/memory2/test_perception_stack.py new file mode 100644 index 0000000000..5a8f1101ab --- /dev/null +++ b/dimos/manipulation/memory2/test_perception_stack.py @@ -0,0 +1,321 @@ +"""End-to-end stack smoke test for the memory2-native perception, bypassing the agent. + +What it verifies: + 1. ``recording_xarm6.db`` exists and has streams. + 2. CLIP is continuously embedding (``color_image_embedded`` stream growing). + 3. CLIP semantic search matches a text prompt to a recorded frame. + 4. Moondream VLM produces bbox detections on that frame. + 5. ``Object.from_2d_to_list`` projects to 3D against aligned depth + intrinsics. + +Run while ``dimos run coordinator-xarm6 xarm6-perception-agent`` is alive (so the +recorder is actively writing). It writes ``perception_diag.log`` in the cwd. + +Usage: + python dimos/manipulation/memory2/test_perception_stack.py [PROMPT] [DB_PATH] + + python dimos/manipulation/memory2/test_perception_stack.py "cup" + python dimos/manipulation/memory2/test_perception_stack.py "the red mug" recording_xarm6.db +""" + +from __future__ import annotations + +import sys +import time +from contextlib import redirect_stdout +from pathlib import Path + +LOG = Path("perception_diag.log").resolve() + + +def hr(title: str) -> None: + print() + print("=" * 78) + print(f"== {title}") + print("=" * 78) + + +def run(prompt: str, db_path: str) -> None: + hr("A. DB file + streams") + db = Path(db_path).resolve() + if not db.exists(): + print(f"NOT FOUND: {db}") + print("Is the recorder running? 
Check: `dimos run coordinator-xarm6 xarm6-perception-agent`") + return + print(f"db: {db} size: {db.stat().st_size / 1e6:.1f} MB") + + from dimos.memory2.store.sqlite import SqliteStore + + # read-only access; recorder still has its own connection + store = SqliteStore(path=str(db)) + store.start() + streams = store.list_streams() + print(f"streams found: {streams}") + + # Show the codec each stream was created with (persisted in the registry). + # depth_image MUST be a lossless codec (lz4+lcm / lcm). If it shows 'jpeg', + # the recorder.py codec fix is NOT deployed, OR the DB predates the fix. + print() + print("per-stream codec (from registry — the load-bearing line for depth):") + try: + for name in ["color_image", "depth_image", "camera_info", "color_image_embedded"]: + stored = store._registry.get(name) + cid = stored.get("codec_id") if stored else "(not registered)" + flag = "" + if name == "depth_image": + flag = " <-- MUST be lossless (lz4+lcm/lcm). 'jpeg' = fix not deployed." + print(f" {name}: codec={cid}{flag}") + except Exception as e: + print(f" codec introspection failed: {e}") + + expected = ["color_image", "depth_image", "camera_info", "color_image_embedded"] + newest_ts: dict[str, float] = {} + for name in expected: + if name not in streams: + print(f" MISSING: {name}") + continue + try: + st = store.streams.__getattr__(name) + cnt = sum(1 for _ in st.order_by("ts")) + last = st.order_by("ts", desc=True).first() + newest_ts[name] = last.ts + age = time.time() - last.ts + print(f" {name}: {cnt} obs | newest ts={last.ts:.1f} ({age:.1f}s ago)") + except Exception as e: + print(f" {name}: count/recency failed ({e})") + + hr("A2. staleness — is the embed pipeline keeping up with the camera?") + if "color_image" in newest_ts and "color_image_embedded" in newest_ts: + lag = newest_ts["color_image"] - newest_ts["color_image_embedded"] + print(f"newest color_image : {newest_ts['color_image']:.1f}") + print(f"newest color_image_embedded : {newest_ts['color_image_embedded']:.1f}") + print(f"embed lag behind live camera : {lag:.1f}s") + if lag > 30: + print(" -> STALE. The embed pipeline is NOT producing recent embeddings.") + print(" -> find_objects searches recent frames first; if the newest embedding") + print(" is minutes/hours old, it can never match what the camera sees NOW.") + elif lag > 5: + print(" -> Lagging but alive. Pipeline can't keep up with frame rate.") + else: + print(" -> Fresh. Embed pipeline is current with the camera.") + + hr("B. CLIP semantic search for prompt") + print(f"prompt: {prompt!r}") + from dimos.models.embedding.clip import CLIPModel + + clip = CLIPModel() + clip.start() + t0 = time.perf_counter() + vec = clip.embed_text(prompt) + print(f"text embed: {time.perf_counter() - t0:.3f}s dim={getattr(vec, 'shape', '?')}") + + try: + embedded = store.streams.color_image_embedded + results = list(embedded.search(vec).order_by("ts", desc=True).limit(5)) + except Exception as e: + print(f"search failed: {e}") + print(" -> color_image_embedded stream missing or no embeddings written yet.") + print(" -> Confirm the recorder logged 'continuous CLIP embed pipeline active'.") + return + print(f"top-5 matches (most recent first):") + if not results: + print(" (no matches at all — CLIP embed pipeline not writing? 
Check brightness/sharpness filters)") + return + for i, obs in enumerate(results): + sim = getattr(obs, "similarity", None) + pose = obs.pose + print(f" #{i} ts={obs.ts:.3f} sim={sim} pose={pose}") + + most_recent = results[0] + print(f"\npicking #0 for VLM (ts={most_recent.ts:.3f}, similarity={getattr(most_recent, 'similarity', None)})") + + hr("C. depth + intrinsics for that timestamp") + try: + depth_obs = store.streams.depth_image.at(most_recent.ts, tolerance=0.1).first() + info_obs = store.streams.camera_info.last() + print(f"depth ts={depth_obs.ts:.3f} (Δ={depth_obs.ts - most_recent.ts:+.3f}s)") + print(f"info ts ={info_obs.ts:.3f}") + except Exception as e: + print(f"depth/info lookup failed: {e}") + return + + hr("D. Moondream detection on the matched frame") + from dimos.models.vl.moondream import MoondreamVlModel + + vlm = MoondreamVlModel() + vlm.start() + t0 = time.perf_counter() + try: + dets_2d = vlm.query_detections(most_recent.data, prompt) + print(f"VLM call: {time.perf_counter() - t0:.3f}s") + print(f"detections: {len(dets_2d.detections)}") + for i, d in enumerate(dets_2d.detections): + bbox = getattr(d, "bbox", None) + print(f" #{i} bbox={bbox} name={getattr(d, 'name', '?')}") + except Exception as e: + print(f"VLM call failed: {e}") + return + + if not dets_2d.detections: + print(" -> VLM returned no detections for this prompt + this frame.") + print(" -> Try a more visible prompt or move the object into clearer view.") + return + + hr("E. depth stats in the detection bbox (why does projection fail?)") + import numpy as np + + from dimos.perception.detection.type.detection3d.object import Object as DetObject + + try: + depth_cv = depth_obs.data.to_opencv() + print(f"depth array: shape={depth_cv.shape} dtype={depth_cv.dtype}") + print(f"depth global: min={np.nanmin(depth_cv):.3f} max={np.nanmax(depth_cv):.3f} " + f"mean={np.nanmean(depth_cv):.3f}") + d0 = dets_2d.detections[0] + x1, y1, x2, y2 = (int(v) for v in d0.bbox) + h, w = depth_cv.shape[:2] + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(w, x2), min(h, y2) + roi = depth_cv[y1:y2, x1:x2] + nonzero = roi[(roi > 0) & np.isfinite(roi)] + print(f"bbox roi: ({x1},{y1})-({x2},{y2}) pixels={roi.size}") + if nonzero.size: + print(f"bbox valid-depth: count={nonzero.size} " + f"min={nonzero.min():.3f} max={nonzero.max():.3f} mean={nonzero.mean():.3f}") + print(" -> If these look like 700-3000, depth is in MILLIMETERS (need depth_scale=1000)") + print(" -> If these look like 0.7-3.0, depth is in METERS (depth_scale=1.0 is correct)") + else: + print(" -> NO valid depth in bbox region. Sensor holes / reflective / out-of-range.") + except Exception as e: + print(f"depth-stat failed: {e}") + + hr("E2. 3D projection — production params vs filters-disabled") + + def _project(label: str, **kw): + try: + objs = DetObject.from_2d_to_list( + detections_2d=dets_2d, + color_image=most_recent.data, + depth_image=depth_obs.data, + camera_info=info_obs.data, + camera_transform=None, + **kw, + ) + print(f"[{label}] -> {len(objs)} objects") + for o in objs: + print(f" {o.name} at ({o.center.x:.3f}, {o.center.y:.3f}, {o.center.z:.3f})") + except Exception as e: + print(f"[{label}] raised: {e}") + + # 1. exactly what find_objects uses today + _project("production (max_dist=1.0, scale=1.0, trunc=10)", + max_distance=1.0, use_aabb=True, max_obstacle_width=0.06) + # 2. distance filter off — isolates the max_distance guard + _project("max_distance=0 (filter off)", + max_distance=0.0, use_aabb=True, max_obstacle_width=0.06) + # 3. 
treat depth as millimeters — isolates the unit-scale bug + _project("depth_scale=1000 (mm->m)", + depth_scale=1000.0, depth_trunc=10.0, max_distance=0.0, + use_aabb=True, max_obstacle_width=0.06) + # 4. both: mm scale + no distance filter + generous trunc + _project("mm scale + no dist filter + trunc=50", + depth_scale=1000.0, depth_trunc=50.0, max_distance=0.0, + use_aabb=True, max_obstacle_width=0.0) + print() + print("Whichever variant returns >0 objects tells us the exact fix:") + print(" - 'max_distance=0' works -> raise/disable max_distance in LazyPerceptionModuleConfig") + print(" - 'depth_scale=1000' works -> recorder/realsense emits mm; need depth_scale=1000 in find_objects") + print(" - none work -> empty depth in bbox; sensor/exposure issue") + + hr("F. EXACT find_objects() code path (the real query LazyPerceptionModule runs)") + print("Replicates lazy_perception.py: .search(vec).filter(sim>=min_similarity)") + print(".order_by('ts',desc=True).first() -> VLM -> from_2d_to_list(config params)") + print() + MIN_SIM = 0.20 # LazyPerceptionModuleConfig.min_similarity default + print(f"min_similarity = {MIN_SIM} (the filter find_objects applies)") + try: + filtered = ( + store.streams.color_image_embedded.search(vec) + .filter(lambda o: (getattr(o, "similarity", 0) or 0) >= MIN_SIM) + .order_by("ts", desc=True) + ) + chosen = filtered.first() + sim = getattr(chosen, "similarity", None) + print(f"chosen obs: ts={chosen.ts:.3f} sim={sim} (passed the >= {MIN_SIM} filter)") + except Exception as e: + print(f"NO observation passed the min_similarity filter: {e}") + print(f" -> Every embedded frame scored < {MIN_SIM} for prompt {prompt!r}.") + print(" -> THIS is why find_objects returns nothing. Two options:") + print(" (a) prompt/scene mismatch — try a prompt that matches what the camera sees") + print(" (b) min_similarity too high for this CLIP model — lower it in spec.py:") + print(" LazyPerceptionModuleConfig.min_similarity") + print(" -> Look at section B sims: if the best is e.g. 0.21, a threshold of 0.20") + print(" barely passes; 0.25+ would reject everything.") + chosen = None + + if chosen is not None: + try: + d_obs = store.streams.depth_image.at(chosen.ts, tolerance=0.1).first() + i_obs = store.streams.camera_info.last() + vlm2 = MoondreamVlModel() + vlm2.start() + dets = vlm2.query_detections(chosen.data, prompt) + print(f"VLM on chosen frame: {len(dets.detections)} detections") + if dets.detections: + objs = DetObject.from_2d_to_list( + detections_2d=dets, + color_image=chosen.data, + depth_image=d_obs.data, + camera_info=i_obs.data, + camera_transform=None, + max_distance=1.0, + use_aabb=True, + max_obstacle_width=0.06, + ) + print(f"find_objects WOULD return {len(objs)} object(s):") + for o in objs: + print(f" {o.name} at ({o.center.x:.3f}, {o.center.y:.3f}, {o.center.z:.3f})") + if objs: + print() + print(" => find_objects() WORKS. If HumanCLI still says 'nothing',") + print(" the agent isn't CALLING find_objects (system-prompt issue),") + print(" OR it's calling get_scene_info (empty until find_objects runs).") + else: + print() + print(" => find_objects() returns []. 3D projection still failing —") + print(" check section E depth stats: is depth coherent post-codec-fix?") + else: + print(" => VLM found nothing on the chosen frame. 
find_objects returns [].") + print(" CLIP matched a frame, but Moondream can't see the object in it.") + print(" Likely: CLIP matched on background context, not the object itself.") + except Exception as e: + print(f"find_objects replication failed at projection: {e}") + + print() + print("READ THIS:") + print(" Section B -> CLIP semantic search proven (similarity numbers)") + print(" Section D -> Moondream VLM proven (bbox detections)") + print(" Section E -> depth integrity post-codec-fix (coherent values = fix worked)") + print(" Section F -> the EXACT find_objects path. If F returns objects but HumanCLI") + print(" says nothing, it's 100% an agent-prompting issue, not the stack.") + + +def main() -> None: + prompt = sys.argv[1] if len(sys.argv) > 1 else "cup" + db_path = sys.argv[2] if len(sys.argv) > 2 else "recording_xarm6.db" + LOG.unlink(missing_ok=True) + with LOG.open("w") as f, redirect_stdout(f): + print(f"perception_diag — prompt={prompt!r} db={db_path} time={time.strftime('%Y-%m-%d %H:%M:%S')}") + try: + run(prompt, db_path) + except Exception as e: + import traceback + + print(f"FATAL: {e}") + traceback.print_exc() + print() + print("=== END ===") + print(f"wrote {LOG}") + + +if __name__ == "__main__": + main() diff --git a/dimos/manipulation/pick_and_place_module.py b/dimos/manipulation/pick_and_place_module.py index 9ca223832c..613c73d10e 100644 --- a/dimos/manipulation/pick_and_place_module.py +++ b/dimos/manipulation/pick_and_place_module.py @@ -130,8 +130,24 @@ def start(self) -> None: logger.info("PickAndPlaceModule started") def _on_objects(self, objects: list[DetObject]) -> None: - """Callback when objects received from perception (runs on RxPY thread pool).""" + """Callback when objects received from perception (runs on RxPY thread pool). + + Also writes ``_detection_snapshot`` so ``pick``/``place`` can act on + publish-driven detections (e.g., LazyPerceptionModule.find_objects) + without an explicit refresh_obstacles()/look() cycle. + + Snapshot has TWO writers now: + - this callback (publish-driven, xArm6-style continuous flow) + - ``refresh_obstacles()`` (scan-driven, xArm7-style episodic flow) + For the xArm7 path with ObjectSceneRegistrationModule (publishes + only non-empty ``all_permanent``, monotonically), the callback + keeps the snapshot self-healed to the latest cached set — strictly + additive vs the previous scan→freeze semantics. The previous + "freeze at refresh time" was a workaround for cache volatility, + not a hard contract. + """ try: + self._detection_snapshot = list(objects) if self._world_monitor is not None: self._world_monitor.on_objects(objects) except Exception as e: diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index b88159dd8e..3e164b4695 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -108,6 +108,8 @@ "xarm-perception-agent": "dimos.manipulation.blueprints:xarm_perception_agent", "xarm-perception-sim": "dimos.manipulation.blueprints:xarm_perception_sim", "xarm-perception-sim-agent": "dimos.manipulation.blueprints:xarm_perception_sim_agent", + "xarm6-perception": "dimos.manipulation.blueprints:xarm6_perception", + "xarm6-perception-agent": "dimos.manipulation.blueprints:xarm6_perception_agent", "xarm6-planner-only": "dimos.manipulation.blueprints:xarm6_planner_only", "xarm7-planner-coordinator": "dimos.manipulation.blueprints:xarm7_planner_coordinator", "xarm7-planner-coordinator-agent": "dimos.manipulation.blueprints:xarm7_planner_coordinator_agent",