diff --git a/dimos/manipulation/blueprints.py b/dimos/manipulation/blueprints.py index 1c006c1d04..d7f53e1fa1 100644 --- a/dimos/manipulation/blueprints.py +++ b/dimos/manipulation/blueprints.py @@ -27,6 +27,7 @@ python -i -m dimos.manipulation.planning.examples.manipulation_client """ +from typing import Any import math from dimos.agents.mcp.mcp_client import McpClient @@ -37,10 +38,12 @@ from dimos.core.transport import LCMTransport from dimos.hardware.sensors.camera.realsense.camera import RealSenseCamera from dimos.manipulation.manipulation_module import ManipulationModule +from dimos.manipulation.memory2 import LazyPerceptionModule, RGBDCameraRecorder from dimos.manipulation.pick_and_place_module import PickAndPlaceModule from dimos.msgs.geometry_msgs.Quaternion import Quaternion from dimos.msgs.geometry_msgs.Transform import Transform from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo from dimos.msgs.sensor_msgs.JointState import JointState from dimos.perception.object_scene_registration import ObjectSceneRegistrationModule from dimos.robot.catalog.ufactory import xarm6 as _catalog_xarm6, xarm7 as _catalog_xarm7 @@ -338,8 +341,200 @@ ) +# ============================================================================ +# XArm6 perception — memory2-native pipeline (open-vocab VLM, no tracker) +# ============================================================================ +# Mirrors the xArm7 xarm_perception blueprint but uses the new memory2-native +# perception modules (RGBDCameraRecorder + LazyPerceptionModule) in place of +# ObjectSceneRegistrationModule. RGBDCameraRecorder also runs the continuous +# CLIP embed pipeline (no separate SemanticSearch module — see spec.py for the +# "why one module" reasoning). The PickAndPlaceModule API is unchanged — its +# `objects: In[list[Object]]` port is now fed by LazyPerceptionModule.objects. +# +# TF chain: world → link6 (ManipulationModule) → camera_link (RealSense) +# Usage: chain coordinator + agent in ONE dimos invocation. dimos's CLI takes +# multiple blueprint names and autoconnect()s them into one process — separate +# invocations conflict on dimos's internal port reservation (check_port_conflicts). +# XARM6_IP= OPENAI_API_KEY= dimos run coordinator-xarm6 xarm6-perception-agent +# Without the coordinator the arm won't actually move (joint_state stream has no +# producer otherwise). +# +# Camera transform: PLACEHOLDER reusing xArm7 values. Replace with measured +# hand-eye calibration for the xArm6 mount on first hardware run if positions +# returned by find_objects are off by a consistent offset. 
+_XARM6_PERCEPTION_CAMERA_TRANSFORM = Transform(
+    translation=Vector3(x=0.06693724, y=-0.0309563, z=0.00691482),
+    rotation=Quaternion(0.70513398, 0.00535696, 0.70897578, -0.01052180),  # xyzw
+)
+
+_xarm6_perception_cfg = _catalog_xarm6(
+    name="arm",
+    adapter_type="xarm" if global_config.xarm6_ip else "mock",
+    address=global_config.xarm6_ip,
+    pitch=math.radians(0),
+    add_gripper=True,
+    tf_extra_links=["link6"],
+)
+
+def _convert_color_camera_info(ci: CameraInfo):
+    return ci.to_rerun(
+        image_topic="world/color_image",
+        optical_frame="camera_color_optical_frame",
+    )
+
+def _convert_depth_camera_info(ci: CameraInfo):
+    return ci.to_rerun(
+        image_topic="world/depth_image",
+        optical_frame="camera_depth_optical_frame",
+    )
+
+xarm6_perception = (
+    autoconnect(
+        PickAndPlaceModule.blueprint(
+            robots=[_xarm6_perception_cfg.to_robot_model_config()],
+            planning_timeout=10.0,
+            enable_viz=True,
+            floor_z=-0.02,
+        ),
+        RealSenseCamera.blueprint(
+            base_frame_id="link6",
+            base_transform=_XARM6_PERCEPTION_CAMERA_TRANSFORM,
+        ),
+        # Memory2-native perception pipeline — replaces ObjectSceneRegistrationModule.
+        # RGBDCameraRecorder records color/depth/intrinsics AND continuously
+        # CLIP-embeds color frames into color_image_embedded.
+        # db_path is xArm6-specific to silo from other memory2 users and to avoid
+        # picking up stale embeddings from a different camera mount.
+        # Resume-if-exists: delete recording_xarm6.db manually for a clean slate.
+        RGBDCameraRecorder.blueprint(db_path="recording_xarm6.db"),
+        LazyPerceptionModule.blueprint(db_path="recording_xarm6.db"),
+        vis_module(
+            global_config.viewer,
+            rerun_config={
+                "visual_override": {
+                    "world/camera_info": _convert_color_camera_info,
+                    "world/depth_camera_info": _convert_depth_camera_info,
+                }
+            },
+        ),
+    )
+    .transports(
+        {
+            ("joint_state", JointState): LCMTransport("/coordinator/joint_state", JointState),
+        }
+    )
+    .global_config(n_workers=4)
+)
+
+
+# XArm6 perception + LLM agent. Three perception skills (find_objects,
+# find_objects_near, recall) replace the previous look / scan_objects from
+# the tracker-based pipeline. The agent reads the (seen Ns ago) timestamp
+# in each response and decides whether to re-query before acting.
+# Usage: chain coordinator + agent in ONE dimos invocation. dimos's CLI takes
+# multiple blueprint names and autoconnect()s them into one process — separate
+# invocations conflict on dimos's internal port reservation (check_port_conflicts).
+# XARM6_IP= OPENAI_API_KEY= dimos run coordinator-xarm6 xarm6-perception-agent
+# Without the coordinator the arm won't actually move (joint_state stream has no
+# producer otherwise).
+_MANIPULATION_AGENT_SYSTEM_PROMPT_XARM6 = """\
+You are a robotic manipulation assistant controlling an xArm6 robot arm with an \
+eye-in-hand RealSense camera and a gripper.
+
+# Skills
+
+## Perception
+- **find_objects <prompt>**: Find objects matching the prompt in the current scene. \
+Returns 3D positions and a "(seen Ns ago)" timestamp so you can judge freshness. \
+Prompt is any natural language — use spatial qualifiers ("the cup on the right") \
+or attributes ("the red mug") to disambiguate when multiple instances exist. \
+Example: "what cups are there?" → find_objects("cup"); "find the red mug" → \
+find_objects("red mug").
+
+- **find_objects_near <prompt> <x> <y> <z> [radius]**: Same as find_objects but \
+narrows to frames captured when the camera was within `radius` meters of (x, y, z). \
+Use when you know which workspace area to scan — cheaper than a full-scene query. \
+Example: "find a screwdriver near the workbench at 1.0, 0.5, 0.8" → \
+find_objects_near("screwdriver", 1.0, 0.5, 0.8, 0.5).
+
+- **recall <name>**: Where did I last see something matching <name>? Cheaper than \
+find_objects (no VLM; returns the camera pose at the matching frame plus a timestamp), \
+and searches the full session history including previous process runs. Use for "where \
+was the cup earlier?" / "have I ever seen a wrench?" type questions.
+
+## Pick & Place
+- **pick <object_name>**: Pick up an object that find_objects most recently \
+detected. Use the EXACT name from find_objects output. When multiple instances \
+match, add a spatial or attribute qualifier in your find_objects prompt ("cup on \
+the right") so the VLM returns just one — then pick operates unambiguously.
+- **place <x> <y> <z>**: Place a held object at explicit world-frame coordinates. \
+Example: "place it at 0.4, 0.3, 0.1"
+- **drop_on <target_object>**: Drop a held object onto another detected object. \
+Automatically compensates for camera occlusion. Example: "drop it in the bowl".
+- **place_back**: Return a held object to its original pick position.
+- **pick_and_place <object_name> <x> <y> <z>**: Pick then place in one command.
+
+## Motion
+- **move_to_pose <x> <y> <z> [roll pitch yaw]**: Move end-effector to an absolute \
+world-frame pose (meters / radians).
+- **move_to_joints <j1> <j2> <j3> <j4> <j5> <j6>**: Move to a 6-DOF joint configuration (radians).
+- **go_home**: Move to the home/observe position.
+- **go_init**: Return to startup position. Use after pick/place as a safe rest pose.
+
+## Gripper
+- **open_gripper / close_gripper / set_gripper**: Direct gripper control.
+
+## Status & Recovery
+- **get_robot_state**: Current joint positions, end-effector pose, gripper state.
+- **get_scene_info**: Latest detection snapshot (from the most recent find_objects call).
+- **reset**: Clear a FAULT state and return to IDLE.
+- **clear_perception_obstacles**: Remove detected obstacles from the planning world. \
+Use when planning fails with COLLISION_AT_START.
+
+# Freshness and re-querying
+- Every find_objects / find_objects_near response includes "(seen Ns ago)".
+- A few seconds old → fresh, safe to act on.
+- Minutes old → scene may have changed; re-query before acting.
+- If you just picked or placed an object, the scene changed — re-query before the \
+next action.
+
+# Disambiguation (multiple instances)
+- If find_objects returns multiple positions, either:
+  1. Re-query with a spatial/attribute qualifier ("cup on the right", "red cup"), or
+  2. Read the positions yourself and choose by reasoning (e.g., "right" = larger Y; \
+"closest" = smallest distance from gripper).
+- Don't rely on identity across calls — find_objects always reflects the most recent \
+detection. The same name in two calls may be the same physical object or different ones.
+
+# Rules
+- Use the EXACT object name from find_objects output. Do NOT substitute similar names.
+- "drop it in/on [object]" → use **drop_on**. "place it at [coords]" → use **place**.
+- "bring it back" → pick, then **go_init**.
+- "bring it to me" / "hand it over" → pick, then move toward user (≈ X=0, Y=0.5).
+- NEVER open the gripper while holding an object unless executing place / drop_on, \
+or when the user explicitly asks.
+- After pick or place, return to init with **go_init** unless another action follows.
+
+# Coordinate System
+World frame (meters): X = forward, Y = left, Z = up. Z = 0 is robot base.
+Typical working area: X 0.3–0.7, Y −0.5 to 0.5, Z 0.05–0.5.
+ +# Error Recovery +If planning fails with COLLISION_AT_START: call **clear_perception_obstacles**, then \ +**reset**, then retry. +""" + +xarm6_perception_agent = autoconnect( + xarm6_perception, + McpServer.blueprint(), + McpClient.blueprint(system_prompt=_MANIPULATION_AGENT_SYSTEM_PROMPT_XARM6), +) + + __all__ = [ "dual_xarm6_planner", + "xarm6_perception", + "xarm6_perception_agent", "xarm6_planner_only", "xarm7_planner_coordinator", "xarm7_planner_coordinator_agent", diff --git a/dimos/manipulation/memory2/__init__.py b/dimos/manipulation/memory2/__init__.py new file mode 100644 index 0000000000..59906b9c29 --- /dev/null +++ b/dimos/manipulation/memory2/__init__.py @@ -0,0 +1,24 @@ +"""Manipulation × memory2 integration. + +Three agent-callable perception skills built on memory2 primitives: +``find_objects``, ``find_objects_near``, ``recall``. See ``spec.py`` +for the architecture docstring, configs, and Protocol definitions. +""" + +from dimos.manipulation.memory2.lazy_perception import LazyPerceptionModule +from dimos.manipulation.memory2.recorder import RGBDCameraRecorder +from dimos.manipulation.memory2.spec import ( + LazyPerceptionModuleConfig, + LazyPerceptionModuleSpec, + RGBDCameraRecorderConfig, + RGBDCameraRecorderSpec, +) + +__all__ = [ + "LazyPerceptionModule", + "LazyPerceptionModuleConfig", + "LazyPerceptionModuleSpec", + "RGBDCameraRecorder", + "RGBDCameraRecorderConfig", + "RGBDCameraRecorderSpec", +] diff --git a/dimos/manipulation/memory2/lazy_perception.py b/dimos/manipulation/memory2/lazy_perception.py new file mode 100644 index 0000000000..718e8af159 --- /dev/null +++ b/dimos/manipulation/memory2/lazy_perception.py @@ -0,0 +1,277 @@ +"""LazyPerceptionModule — agent-callable open-vocab perception. + +Three @skill methods, each a one-line composition of memory2 query +primitives. See ``spec.py`` for the architecture docstring. +""" + +from __future__ import annotations + +from collections.abc import Callable +import time +from typing import Any + +from dimos.agents.annotation import skill +from dimos.core.core import rpc +from dimos.core.stream import Out +from dimos.manipulation.memory2.spec import LazyPerceptionModuleConfig +from dimos.memory2.module import MemoryModule +from dimos.models.embedding.base import EmbeddingModel +from dimos.models.vl.base import VlModel +from dimos.msgs.geometry_msgs.Quaternion import Quaternion +from dimos.msgs.geometry_msgs.Transform import Transform +from dimos.msgs.geometry_msgs.Vector3 import Vector3 +from dimos.perception.detection.type.detection3d.object import Object as DetObject +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +def _relative_time(ts: float) -> str: + """Format a past timestamp as a human-readable age.""" + delta = max(0.0, time.time() - ts) + if delta < 60: + return f"{int(delta)}s ago" + if delta < 3600: + return f"{int(delta // 60)}min ago" + return f"{int(delta // 3600)}h ago" + + +class LazyPerceptionModule(MemoryModule): + """Lazy memory2-native open-vocab object detector. + + Three skills, each a one-line memory2 composition. Stateless: + every call is an independent query → VLM (find_objects only) + → 3D → publish. 
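+
+    Illustrative call shapes (coordinates and prompts are assumed example
+    values; the wording mirrors the format strings in this module):
+
+        find_objects("red mug")
+            -> "Found 1 object(s) matching 'red mug' (seen 3s ago): ..."
+        find_objects_near("screwdriver", 0.5, 0.2, 0.1, radius=0.5)
+            -> "Found 1 object(s) matching 'screwdriver' near (0.50, 0.20, 0.10) within 0.5m (seen 12s ago): ..."
+        recall("wrench")
+            -> "Last saw 'wrench' with camera near (0.52, -0.10, 0.31) (4min ago)."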
+ """ + + config: LazyPerceptionModuleConfig + + objects: Out[list[DetObject]] + + _vlm: VlModel | None = None + _clip: EmbeddingModel | None = None + + @rpc + def start(self) -> None: + super().start() + self._vlm = self.register_disposable(self.config.vlm_provider()) + self._vlm.start() + self._clip = self.register_disposable(self.config.embedding_model()) + self._clip.start() + + # ------------------------------------------------------------------ skills + + @skill + def find_objects(self, prompt: str) -> str: + """Find objects matching ``prompt``. Returns most recent confident + match with timestamp. + + Open-vocab: ``prompt`` can be any natural language. Comma-separated + prompts are split and processed per class because Moondream's + ``query_detections`` labels every result with the literal query + string. + """ + prompt_list = [p.strip() for p in prompt.split(",") if p.strip()] + if not prompt_list: + self.objects.publish([]) + return "No prompts provided." + + all_objects: list[DetObject] = [] + most_recent_ts: float | None = None + for single in prompt_list: + objs, ts = self._find_and_project( + single, + build_query=lambda stream, vec: stream.search(vec), + ) + all_objects.extend(objs) + if ts is not None and (most_recent_ts is None or ts > most_recent_ts): + most_recent_ts = ts + + self.objects.publish(all_objects) + if not all_objects: + return f"No confident '{prompt}' match in memory." + + age = _relative_time(most_recent_ts) if most_recent_ts is not None else "unknown age" + lines = [self._fmt_object_line(o) for o in all_objects] + return f"Found {len(all_objects)} object(s) matching '{prompt}' (seen {age}):\n" + "\n".join(lines) + + @skill + def find_objects_near( + self, + prompt: str, + x: float, + y: float, + z: float, + radius: float = 1.0, + ) -> str: + """Find objects matching ``prompt`` in frames recorded when the + camera was within ``radius`` meters of ``(x, y, z)``. + + ``.near()`` filters by the camera's pose at record time, NOT by + the detected object's position. Note: applied as a Python + post-filter after vector search; R*Tree pre-gating of the + vector index is a memory2 follow-up. + """ + prompt_list = [p.strip() for p in prompt.split(",") if p.strip()] + if not prompt_list: + self.objects.publish([]) + return "No prompts provided." + + pose = (x, y, z) + all_objects: list[DetObject] = [] + most_recent_ts: float | None = None + for single in prompt_list: + objs, ts = self._find_and_project( + single, + build_query=lambda stream, vec: stream.near(pose, radius).search(vec), + ) + all_objects.extend(objs) + if ts is not None and (most_recent_ts is None or ts > most_recent_ts): + most_recent_ts = ts + + self.objects.publish(all_objects) + if not all_objects: + return f"No confident '{prompt}' match near ({x:.2f}, {y:.2f}, {z:.2f}) within {radius}m." + + age = _relative_time(most_recent_ts) if most_recent_ts is not None else "unknown age" + lines = [self._fmt_object_line(o) for o in all_objects] + return ( + f"Found {len(all_objects)} object(s) matching '{prompt}' " + f"near ({x:.2f}, {y:.2f}, {z:.2f}) within {radius}m (seen {age}):\n" + + "\n".join(lines) + ) + + @skill + def recall(self, name: str) -> str: + """Where did I last see something matching ``name``? + + Cheaper than ``find_objects``: no VLM, no 3D projection. Returns + the camera pose at the most recent confident semantic match plus + timestamp. Works across process restarts because memory2's SQLite + store is the persistence layer. + """ + if self._clip is None: + return f"No memory of '{name}'." 
+ try: + vec = self._clip.embed_text(name) + obs = ( + self.store.streams.color_image_embedded + .search(vec) + .filter(lambda o: (o.similarity or 0) >= self.config.min_similarity) + .order_by("ts", desc=True) + .first() + ) + except (AttributeError, LookupError): + return f"No memory of '{name}'." + except Exception as e: + logger.warning("recall(%r) failed: %s", name, e) + return f"No memory of '{name}'." + + age = _relative_time(obs.ts) + if obs.pose: + x, y, z = obs.pose[0], obs.pose[1], obs.pose[2] + return f"Last saw '{name}' with camera near ({x:.2f}, {y:.2f}, {z:.2f}) ({age})." + return f"Last saw '{name}' (camera pose unknown) ({age})." + + # ----------------------------------------------------------- internal + + def _find_and_project( + self, + prompt: str, + build_query: Callable[[Any, Any], Any], + ) -> tuple[list[DetObject], float | None]: + """Composed memory2 pipeline for ONE prompt class. + + Returns (objects, observation_ts). ``build_query`` is a callable + ``(stream, query_vec) -> filtered Stream`` so find_objects / + find_objects_near can share the rest of the pipeline. + """ + if self._clip is None: + return [], None + try: + vec = self._clip.embed_text(prompt) + obs = ( + build_query(self.store.streams.color_image_embedded, vec) + .filter(lambda o: (o.similarity or 0) >= self.config.min_similarity) + .order_by("ts", desc=True) + .first() + ) + except (AttributeError, LookupError): + return [], None + except Exception as e: + logger.warning("_find_and_project(%r): %s", prompt, e) + return [], None + + return self._detect_and_project_one(obs, prompt), obs.ts + + def _detect_and_project_one(self, color_obs: Any, prompt: str) -> list[DetObject]: + """VLM detection + 3D projection for ONE peak frame, ONE prompt class.""" + if self._vlm is None: + return [] + + # Aligned depth + latest intrinsics. .first()/.last() raise LookupError on empty. + try: + depth_obs = self.store.streams.depth_image.at( + color_obs.ts, tolerance=0.1 + ).first() + info_obs = self.store.streams.camera_info.last() + except LookupError: + logger.warning("missing depth/info near ts=%.3f", color_obs.ts) + return [] + except AttributeError: + logger.warning("depth_image or camera_info stream not yet available") + return [] + + # VLM can hang (network) or OOM (CUDA). Treat failure as no-detection. + try: + dets_2d = self._vlm.query_detections(color_obs.data, prompt) + except Exception as e: + logger.warning("VLM failed (prompt=%r ts=%.3f): %s", prompt, color_obs.ts, e) + return [] + if not dets_2d.detections: + return [] + + camera_transform = self._camera_transform_from_pose(color_obs.pose) + try: + # from_2d_to_list's annotation says ImageDetections2D[Detection2DSeg] + # but the implementation at object.py:205-215 handles bbox-only + # detections too (synthesizes a rectangular mask). Moondream returns + # bbox-only — works at runtime. + return DetObject.from_2d_to_list( + detections_2d=dets_2d, # type: ignore[arg-type] + color_image=color_obs.data, + depth_image=depth_obs.data, + camera_info=info_obs.data, + camera_transform=camera_transform, + max_distance=self.config.max_distance, + use_aabb=self.config.use_aabb, + max_obstacle_width=self.config.max_obstacle_width, + ) + except Exception as e: + logger.warning("from_2d_to_list failed at ts=%.3f: %s", color_obs.ts, e) + return [] + + def _camera_transform_from_pose(self, pose: Any) -> Transform | None: + """Build a Transform from the recorder's pose-stamped observation. 
+ + Observation.pose is always a 7-tuple (x, y, z, qx, qy, qz, qw) or None + (memory2/observationstore/sqlite.py:_decompose_pose). + Object.from_2d_to_list(camera_transform=T) applies T to a camera-frame + pointcloud to produce world-frame output — T is camera→world; the + recorder's pose-in-world IS that. NO .inverse(). + """ + if pose is None: + return None + try: + x, y, z, qx, qy, qz, qw = pose + except (TypeError, ValueError): + logger.warning("unexpected pose shape: %r", type(pose).__name__) + return None + return Transform( + translation=Vector3(x, y, z), + rotation=Quaternion(qx, qy, qz, qw), + ) + + @staticmethod + def _fmt_object_line(o: DetObject) -> str: + return f" - {o.name} at ({o.center.x:.2f}, {o.center.y:.2f}, {o.center.z:.2f})" diff --git a/dimos/manipulation/memory2/recorder.py b/dimos/manipulation/memory2/recorder.py new file mode 100644 index 0000000000..ad4794b7c1 --- /dev/null +++ b/dimos/manipulation/memory2/recorder.py @@ -0,0 +1,100 @@ +"""RGBDCameraRecorder — Recorder + continuous CLIP embed for RGBD streams. + +Two things this module does on top of the base ``Recorder`` pattern at +``dimos/robot/unitree/go2/blueprints/smart/unitree_go2.py:49-53``: + +1. **Continuous CLIP embed pipeline.** Same logic as ``memory2.SemanticSearch``, + but co-located here so it shares the recorder's store. See ``spec.py`` for + the "why in one module" reasoning (memory2's SubjectNotifier is per-store). + +2. **Resume-if-exists semantics.** ``start()`` overrides ``Recorder.start()`` + to skip the overwrite-or-fail gate. If ``recording.db`` exists, we open it + and append (cross-session memory). If it doesn't, SQLite creates it fresh. + No flag flipping, no manual file deletion between runs. Users wanting a + clean slate delete the file themselves. +""" + +from __future__ import annotations + +from typing import Any + +from dimos.core.core import rpc +from dimos.core.stream import In +from dimos.manipulation.memory2.spec import RGBDCameraRecorderConfig +from dimos.memory2.embed import EmbedImages +from dimos.memory2.module import Recorder +from dimos.memory2.stream import Stream as MemoryStream +from dimos.memory2.transform import QualityWindow +from dimos.models.embedding.base import EmbeddingModel +from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo +from dimos.msgs.sensor_msgs.Image import Image +from dimos.utils.logging_config import setup_logger + +logger = setup_logger() + + +class RGBDCameraRecorder(Recorder): + """Records RGBD streams (color + depth + intrinsics) and continuously + CLIP-embeds qualifying color frames into ``color_image_embedded``.""" + + color_image: In[Image] + depth_image: In[Image] + camera_info: In[CameraInfo] + + config: RGBDCameraRecorderConfig + + _embedding_model: EmbeddingModel | None = None + _embeddings: MemoryStream[Any] | None = None + + @rpc + def start(self) -> None: + + super(Recorder, self).start() + + if self.config.g.replay: + logger.info( + "Replay mode active — recording disabled, leaving %s untouched", + self.config.db_path, + ) + return + + if not self.inputs: + logger.warning("RGBDCameraRecorder has no In ports — nothing to record") + return + + # Replicate Recorder.start()'s port-to-stream registration. Touching + # self.store lazily opens the SqliteStore at db_path (existing file or new). + # + # depth_image MUST use a lossless codec. 
memory2's codec auto-dispatch + # maps every `Image`-typed stream to JpegCodec (lossy 8-bit DCT) — correct + # for color RGB, catastrophic for uint16 depth (millimeters get shredded, + # then 3D projection finds no coherent points and find_objects returns + # nothing). We use memory2's documented per-stream `codec=` override + # (store/base.py: "Per-stream overrides ... codec=") to record depth as + # lossless lz4+lcm (raw Image.lcm_encode, no pixel loss). Color stays + # JPEG — lossy is fine for RGB and keeps the DB small. + for name, port in self.inputs.items(): + if name == "depth_image": + stream: MemoryStream[Any] = self.store.stream( + name, port.type, codec="lz4+lcm" + ) + else: + stream = self.store.stream(name, port.type) + self._port_to_stream(name, port, stream) + logger.info("Recording %s (%s)", name, port.type.__name__) + + # Continuous CLIP embed — same store, same notifier, .live() works. + self._embedding_model = self.register_disposable(self.config.embedding_model()) + self._embedding_model.start() + self._embeddings = self.store.stream("color_image_embedded", Image) + + # fmt: off + self.store.streams.color_image \ + .live() \ + .filter(lambda obs: obs.data.brightness > 0.1) \ + .transform(QualityWindow(lambda img: img.sharpness, window=0.5)) \ + .transform(EmbedImages(self._embedding_model, batch_size=2)) \ + .save(self._embeddings) \ + .drain_thread() + # fmt: on + logger.info("RGBDCameraRecorder: continuous CLIP embed pipeline active") diff --git a/dimos/manipulation/memory2/spec.py b/dimos/manipulation/memory2/spec.py new file mode 100644 index 0000000000..46830fc245 --- /dev/null +++ b/dimos/manipulation/memory2/spec.py @@ -0,0 +1,322 @@ +""" Memory2-native perception for manipulation. + + +Architecture +------------ + +:: + + camera + | + v + RGBDCameraRecorder + ├── records: color_image, depth_image, camera_info + └── continuous CLIP embed (brightness/sharpness filtered) + → color_image_embedded + | + ┌──────────── pulled on trigger (read-only) ──┐ + v | + LazyPerceptionModule | + | @skill find_objects(prompt) | + | .search(vec).filter(sim>=thr).order_by("ts", desc).first() + | → VLM detect → 3D project | + | @skill find_objects_near(prompt, x, y, z, radius) | + | .near((x,y,z), r).search(vec).filter(...).order_by(...).first() + | → VLM detect → 3D project | + | @skill recall(name) | + | .search(vec).filter(sim>=thr).order_by("ts", desc).first() + | → return camera pose + timestamp (no VLM, cheaper) + | | + | objects: Out[list[DetObject]] | + v | + PickAndPlaceModule (manipulation — In[objects]) | + | + Shared SQLite store (recording.db) ─────────────────────┘ + Recorder writes; LazyPerceptionModule opens its own SqliteStore + on the same file (read-only via WAL). + + Why recording + embedding in ONE module: memory2's SubjectNotifier + is in-memory per-store, so a separate-module embedder couldn't + receive .live() notifications from the recorder's writes even on + the same db_path. Keeping them in one module = one store = one + notifier = live pipeline works. The LazyPerceptionModule reads + only at skill-call time (no .live()), so cross-instance SQLite + reads (WAL) are sufficient there. + + +Three skills, three query shapes +-------------------------------- + +Each skill is a one-line composition of memory2 primitives. Every +skill returns the **most recent confident match** along with its +timestamp — the agent sees how fresh the data is and decides if it's +fresh enough to act on. 
No time-window parameter on the API; "current" +is whatever memory2 has most recently. + +- ``find_objects(prompt)`` — **semantic + most recent**. Search the + embedding stream, filter to confident matches, take the most recent. + Run VLM + 3D projection on that frame. Returns ``list[Object]`` (via + the ``objects`` port) plus formatted summary with timestamp. + +- ``find_objects_near(prompt, x, y, z, radius)`` — **spatial + semantic + + most recent**. Same as ``find_objects`` plus a camera-pose proximity + filter. NOTE on indexing: memory2 currently runs vector ``.search()`` + as a top-K vec0 query and applies the spatial ``.near()`` predicate in + Python afterward — the R*Tree index does NOT pre-filter the embedding + search. Correct results, but for very long recordings the top-K cap + can elide spatially-near low-similarity frames. Pushing R*Tree + pre-filtering through ``Backend._vector_search`` is a memory2 follow-up. + +- ``recall(name)`` — **cheaper cousin of find_objects**. Same query + shape but skips VLM detection and 3D projection; returns the camera + pose at the matching frame plus timestamp. Use for "where was I when + I saw X" questions where exact 3D object pose isn't needed. + +**Freshness contract is in the response.** Every result includes +``(seen N seconds ago)`` so the agent can reason: "2 seconds is fresh +— act"; "5 minutes is stale — re-query or skip." + + +The lazy pipeline +------------------ + +All three skills share the same shape: + +1. CLIP-embed the prompt → query vector. + +2. Build a memory2 query on ``color_image_embedded`` by composing the + relevant filters (``.near`` / ``.search`` / ``.filter`` / + ``.order_by``). ``.search`` pushes through the vec0 vector index; + tag/time/SQL-expressible filters push to SQL. ``.near`` runs as a + Python post-filter after vector search (R*Tree pre-gating of vector + search is a memory2 follow-up). + +3. Take ``.first()`` — the most recent confident match. Single + observation. + +4. (find_objects / find_objects_near only:) Pull aligned depth via + ``.at(obs.ts, tolerance)`` and latest intrinsics via ``.last()``. + Run VLM detection on the color frame, project via + ``Object.from_2d_to_list`` (camera→world transform from the + recorder's pose). + +5. Publish ``list[Object]`` on ``objects``. Return a formatted summary + that **includes the observation timestamp** so the agent can read + freshness. + +**Snapshot contract.** Every publish on ``objects`` replaces +``PickAndPlaceModule._detection_snapshot`` — the cache ``pick`` / +``place`` read. So: + +- A successful ``find_objects("cup")`` → snapshot = [cup objects]; ``pick("cup")`` works. +- An interleaved ``find_objects("apple")`` returning 0 matches → snapshot = []; later + ``pick("cup")`` fails ("not found"). Re-query before acting. +- This is intentional. Each ``find_objects`` call is a fresh world model, not an + accumulating one. The agent prompt makes this explicit. + +``PickAndPlaceModule`` reads ``objects`` to act on named +targets. Multiple instances in the result are handled either by +prompt qualifiers ("cup on the right" → VLM returns one detection in +the frame) or by the agent reading the returned positions and +choosing. 
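+
+A condensed sketch of the query composition in steps 1-3 above; it mirrors
+``_find_and_project`` in ``lazy_perception.py`` (``min_similarity`` is the
+``LazyPerceptionModuleConfig`` default)::
+
+    vec = clip.embed_text(prompt)
+    obs = (
+        store.streams.color_image_embedded
+        .search(vec)                                   # vec0 similarity search
+        .filter(lambda o: (o.similarity or 0) >= min_similarity)
+        .order_by("ts", desc=True)
+        .first()                                       # most recent confident match
+    )
+    # find_objects_near only: prepend .near((x, y, z), radius) before .search(vec)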
+ + +Streams (recorded continuously) +------------------------------- + +:: + + color_image JPEG-encoded RGB frames + world pose + depth_image depth frames, LOSSLESS lz4+lcm codec + camera_info intrinsics (rarely change, .last() suffices) + color_image_embedded CLIP vector + image bytes, vec0-indexed + for similarity search (populated by the + recorder's continuous embed pipeline) + +**depth_image MUST be recorded losslessly.** memory2's codec +auto-dispatch maps every ``Image``-typed stream to ``JpegCodec`` +(lossy 8-bit DCT) — correct for RGB color, but it shreds uint16 +depth (millimeters). JPEG'd depth → no coherent 3D points → +``from_2d_to_list`` returns ``[]`` → ``find_objects`` silently +returns nothing even though CLIP + VLM succeeded. ``RGBDCameraRecorder`` +overrides the depth stream's codec to ``lz4+lcm`` via memory2's +public per-stream ``codec=`` override (no memory2 changes). The codec +is persisted per-stream in the SQLite registry, so an existing +recording created before this fix keeps its JPEG depth — delete the +db file once to recreate the stream with the lossless codec. + + +Open vocab + cross-session memory +--------------------------------- + +The agent passes any natural-language description to ``find_objects`` +or ``find_objects_near`` (``"red mug with handle"``, +``"the screwdriver near the laptop"``). The VLM handles visual +disambiguation; the agent can encode spatial qualifiers in the prompt, +or call ``find_objects_near`` for camera-pose-bounded queries (Python +spatial post-filter; see the ``find_objects_near`` note above for the +indexing limitation). + +Cross-session memory is implicit: memory2 persists every observation +in ``recording.db``. Skills query the full embedding history regardless +of when the current process started — no replay logic, no bespoke +loaders. ``recall(name)`` returning "(seen 3 hours ago)" is normal and +correct after a process restart; the agent reads the timestamp and +decides. + +``RGBDCameraRecorder`` uses **resume-if-exists**: it opens ``recording.db`` +if the file is there (cross-session memory works out of the box) and +creates it fresh if it isn't. ``RecorderConfig.overwrite`` is inert in +this subclass — see ``recorder.py``. Users wanting a clean slate delete +the file manually before launching. 
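+
+Blueprint wiring, condensed from ``xarm6_perception`` in
+``dimos/manipulation/blueprints.py``: recorder and perception module point at
+the same ``db_path``, so continuous writes and skill-time reads hit one
+SQLite file::
+
+    RGBDCameraRecorder.blueprint(db_path="recording_xarm6.db"),
+    LazyPerceptionModule.blueprint(db_path="recording_xarm6.db"),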
+ +""" + +from __future__ import annotations + +from pathlib import Path +from typing import TYPE_CHECKING, Protocol, runtime_checkable + +from dimos.memory2.module import MemoryModuleConfig, RecorderConfig +from dimos.models.embedding.base import EmbeddingModel +from dimos.models.embedding.clip import CLIPModel +from dimos.models.vl.base import VlModel +from dimos.models.vl.moondream import MoondreamVlModel + +if TYPE_CHECKING: + from dimos.agents.annotation import skill # noqa: F401 (referenced in docstrings) + from dimos.core.stream import In, Out + from dimos.msgs.sensor_msgs.CameraInfo import CameraInfo + from dimos.msgs.sensor_msgs.Image import Image + from dimos.perception.detection.type.detection3d.object import Object as DetObject + + +# ============================================================================= +# RGBDCameraRecorder — records color / depth / intrinsics to memory2 +# ============================================================================= + + +class RGBDCameraRecorderConfig(RecorderConfig): + """Config for the RGBD-camera recorder + continuous embedder.""" + + db_path: str | Path = "recording.db" + embedding_model: type[EmbeddingModel] = CLIPModel + + +@runtime_checkable +class RGBDCameraRecorderSpec(Protocol): + """Protocol for the RGBD-camera recorder + continuous embedder.""" + + config: RGBDCameraRecorderConfig + + color_image: In[Image] + depth_image: In[Image] + camera_info: In[CameraInfo] + + +# ============================================================================= +# LazyPerceptionModule — agent-callable open-vocab perception +# ============================================================================= + + +class LazyPerceptionModuleConfig(MemoryModuleConfig): + """Config for the lazy perception module.""" + + db_path: str | Path = "recording.db" + + # Models (blueprint can override) + vlm_provider: type[VlModel] = MoondreamVlModel + embedding_model: type[EmbeddingModel] = CLIPModel + + # Similarity threshold — observations below this don't count as a + # confident match for find_objects / find_objects_near. + min_similarity: float = 0.20 + + # 3D projection knobs threaded into ``Object.from_2d_to_list``. + max_distance: float = 1.0 + use_aabb: bool = True + max_obstacle_width: float = 0.06 + + +@runtime_checkable +class LazyPerceptionModuleSpec(Protocol): + """Protocol for the lazy perception module.""" + + config: LazyPerceptionModuleConfig + + objects: Out[list[DetObject]] + """Wired by the blueprint to ``PickAndPlaceModule.objects``. Cache + of the most recent ``find_objects`` / ``find_objects_near`` result. + Manipulation reads from here to act on the named target. Named + ``objects`` to match the existing manipulation In port for + autoconnect-by-name.""" + + def find_objects(self, prompt: str) -> str: + """Find objects matching ``prompt``. Returns the most recent + confident match with timestamp. + + Decorated ``@skill`` in the implementation. Composes + ``.search()`` + ``.filter(similarity >= min_similarity)`` + + ``.order_by("ts", desc=True)`` + ``.first()`` over + ``color_image_embedded``, then runs VLM detection on the + matching frame and projects to 3D. Publishes ``list[Object]`` + on the ``objects`` port. + + Open-vocab: ``prompt`` can be any natural language + (``"red mug"``, ``"the cup on the right"``). Comma-separated + prompts are split and processed per class because VLM + ``query_detections`` labels every result with the literal + query string. 
+ + Returns a human-readable summary including the observation + timestamp (``"(seen 3s ago)"``) so the agent can judge + freshness. Returns a no-match message when no observation + meets ``min_similarity``. + """ + ... + + def find_objects_near( + self, + prompt: str, + x: float, + y: float, + z: float, + radius: float = 1.0, + ) -> str: + """Find objects matching ``prompt``, in frames recorded when + the camera was within ``radius`` meters of ``(x, y, z)``. + + Decorated ``@skill`` in the implementation. Same query shape + as ``find_objects`` plus an extra ``.near((x, y, z), radius)`` + predicate. ``.near()`` runs as a Python post-filter today — + memory2's R*Tree index does not pre-gate the vector search. + Take most recent confident match → VLM → 3D project → publish + on the ``objects`` port. + + ``.near()`` filters by the camera's pose at record time, NOT + by the detected object's position. Use for "frames captured + while looking at this workspace area"; object-position + filtering is downstream of detection. + """ + ... + + def recall(self, name: str) -> str: + """Where did I last see something matching ``name``? + + Decorated ``@skill`` in the implementation. Cheaper cousin of + ``find_objects``: composes ``.search()`` + + ``.order_by("ts", desc=True)`` + ``.first()`` over + ``color_image_embedded`` — returns the most recent confident + semantic match across the full embedding history, but **does + not run VLM**. Returns the camera pose at the matching frame + plus timestamp. + + Works after process restart because memory2's SQLite store is + the persistence layer; the query touches the same db that + previous sessions wrote. + + Returns a human-readable summary with timestamp + (``"Last saw 'cup' near (1.0, 0.5, 0.9) (seen 3min ago)"``), or + ``"No memory of ."`` when nothing matches. + """ + ... diff --git a/dimos/manipulation/memory2/test_perception_stack.py b/dimos/manipulation/memory2/test_perception_stack.py new file mode 100644 index 0000000000..5a8f1101ab --- /dev/null +++ b/dimos/manipulation/memory2/test_perception_stack.py @@ -0,0 +1,321 @@ +"""End-to-end stack smoke test for the memory2-native perception, bypassing the agent. + +What it verifies: + 1. ``recording_xarm6.db`` exists and has streams. + 2. CLIP is continuously embedding (``color_image_embedded`` stream growing). + 3. CLIP semantic search matches a text prompt to a recorded frame. + 4. Moondream VLM produces bbox detections on that frame. + 5. ``Object.from_2d_to_list`` projects to 3D against aligned depth + intrinsics. + +Run while ``dimos run coordinator-xarm6 xarm6-perception-agent`` is alive (so the +recorder is actively writing). It writes ``perception_diag.log`` in the cwd. + +Usage: + python dimos/manipulation/memory2/test_perception_stack.py [PROMPT] [DB_PATH] + + python dimos/manipulation/memory2/test_perception_stack.py "cup" + python dimos/manipulation/memory2/test_perception_stack.py "the red mug" recording_xarm6.db +""" + +from __future__ import annotations + +import sys +import time +from contextlib import redirect_stdout +from pathlib import Path + +LOG = Path("perception_diag.log").resolve() + + +def hr(title: str) -> None: + print() + print("=" * 78) + print(f"== {title}") + print("=" * 78) + + +def run(prompt: str, db_path: str) -> None: + hr("A. DB file + streams") + db = Path(db_path).resolve() + if not db.exists(): + print(f"NOT FOUND: {db}") + print("Is the recorder running? 
Check: `dimos run coordinator-xarm6 xarm6-perception-agent`") + return + print(f"db: {db} size: {db.stat().st_size / 1e6:.1f} MB") + + from dimos.memory2.store.sqlite import SqliteStore + + # read-only access; recorder still has its own connection + store = SqliteStore(path=str(db)) + store.start() + streams = store.list_streams() + print(f"streams found: {streams}") + + # Show the codec each stream was created with (persisted in the registry). + # depth_image MUST be a lossless codec (lz4+lcm / lcm). If it shows 'jpeg', + # the recorder.py codec fix is NOT deployed, OR the DB predates the fix. + print() + print("per-stream codec (from registry — the load-bearing line for depth):") + try: + for name in ["color_image", "depth_image", "camera_info", "color_image_embedded"]: + stored = store._registry.get(name) + cid = stored.get("codec_id") if stored else "(not registered)" + flag = "" + if name == "depth_image": + flag = " <-- MUST be lossless (lz4+lcm/lcm). 'jpeg' = fix not deployed." + print(f" {name}: codec={cid}{flag}") + except Exception as e: + print(f" codec introspection failed: {e}") + + expected = ["color_image", "depth_image", "camera_info", "color_image_embedded"] + newest_ts: dict[str, float] = {} + for name in expected: + if name not in streams: + print(f" MISSING: {name}") + continue + try: + st = store.streams.__getattr__(name) + cnt = sum(1 for _ in st.order_by("ts")) + last = st.order_by("ts", desc=True).first() + newest_ts[name] = last.ts + age = time.time() - last.ts + print(f" {name}: {cnt} obs | newest ts={last.ts:.1f} ({age:.1f}s ago)") + except Exception as e: + print(f" {name}: count/recency failed ({e})") + + hr("A2. staleness — is the embed pipeline keeping up with the camera?") + if "color_image" in newest_ts and "color_image_embedded" in newest_ts: + lag = newest_ts["color_image"] - newest_ts["color_image_embedded"] + print(f"newest color_image : {newest_ts['color_image']:.1f}") + print(f"newest color_image_embedded : {newest_ts['color_image_embedded']:.1f}") + print(f"embed lag behind live camera : {lag:.1f}s") + if lag > 30: + print(" -> STALE. The embed pipeline is NOT producing recent embeddings.") + print(" -> find_objects searches recent frames first; if the newest embedding") + print(" is minutes/hours old, it can never match what the camera sees NOW.") + elif lag > 5: + print(" -> Lagging but alive. Pipeline can't keep up with frame rate.") + else: + print(" -> Fresh. Embed pipeline is current with the camera.") + + hr("B. CLIP semantic search for prompt") + print(f"prompt: {prompt!r}") + from dimos.models.embedding.clip import CLIPModel + + clip = CLIPModel() + clip.start() + t0 = time.perf_counter() + vec = clip.embed_text(prompt) + print(f"text embed: {time.perf_counter() - t0:.3f}s dim={getattr(vec, 'shape', '?')}") + + try: + embedded = store.streams.color_image_embedded + results = list(embedded.search(vec).order_by("ts", desc=True).limit(5)) + except Exception as e: + print(f"search failed: {e}") + print(" -> color_image_embedded stream missing or no embeddings written yet.") + print(" -> Confirm the recorder logged 'continuous CLIP embed pipeline active'.") + return + print(f"top-5 matches (most recent first):") + if not results: + print(" (no matches at all — CLIP embed pipeline not writing? 
Check brightness/sharpness filters)") + return + for i, obs in enumerate(results): + sim = getattr(obs, "similarity", None) + pose = obs.pose + print(f" #{i} ts={obs.ts:.3f} sim={sim} pose={pose}") + + most_recent = results[0] + print(f"\npicking #0 for VLM (ts={most_recent.ts:.3f}, similarity={getattr(most_recent, 'similarity', None)})") + + hr("C. depth + intrinsics for that timestamp") + try: + depth_obs = store.streams.depth_image.at(most_recent.ts, tolerance=0.1).first() + info_obs = store.streams.camera_info.last() + print(f"depth ts={depth_obs.ts:.3f} (Δ={depth_obs.ts - most_recent.ts:+.3f}s)") + print(f"info ts ={info_obs.ts:.3f}") + except Exception as e: + print(f"depth/info lookup failed: {e}") + return + + hr("D. Moondream detection on the matched frame") + from dimos.models.vl.moondream import MoondreamVlModel + + vlm = MoondreamVlModel() + vlm.start() + t0 = time.perf_counter() + try: + dets_2d = vlm.query_detections(most_recent.data, prompt) + print(f"VLM call: {time.perf_counter() - t0:.3f}s") + print(f"detections: {len(dets_2d.detections)}") + for i, d in enumerate(dets_2d.detections): + bbox = getattr(d, "bbox", None) + print(f" #{i} bbox={bbox} name={getattr(d, 'name', '?')}") + except Exception as e: + print(f"VLM call failed: {e}") + return + + if not dets_2d.detections: + print(" -> VLM returned no detections for this prompt + this frame.") + print(" -> Try a more visible prompt or move the object into clearer view.") + return + + hr("E. depth stats in the detection bbox (why does projection fail?)") + import numpy as np + + from dimos.perception.detection.type.detection3d.object import Object as DetObject + + try: + depth_cv = depth_obs.data.to_opencv() + print(f"depth array: shape={depth_cv.shape} dtype={depth_cv.dtype}") + print(f"depth global: min={np.nanmin(depth_cv):.3f} max={np.nanmax(depth_cv):.3f} " + f"mean={np.nanmean(depth_cv):.3f}") + d0 = dets_2d.detections[0] + x1, y1, x2, y2 = (int(v) for v in d0.bbox) + h, w = depth_cv.shape[:2] + x1, y1 = max(0, x1), max(0, y1) + x2, y2 = min(w, x2), min(h, y2) + roi = depth_cv[y1:y2, x1:x2] + nonzero = roi[(roi > 0) & np.isfinite(roi)] + print(f"bbox roi: ({x1},{y1})-({x2},{y2}) pixels={roi.size}") + if nonzero.size: + print(f"bbox valid-depth: count={nonzero.size} " + f"min={nonzero.min():.3f} max={nonzero.max():.3f} mean={nonzero.mean():.3f}") + print(" -> If these look like 700-3000, depth is in MILLIMETERS (need depth_scale=1000)") + print(" -> If these look like 0.7-3.0, depth is in METERS (depth_scale=1.0 is correct)") + else: + print(" -> NO valid depth in bbox region. Sensor holes / reflective / out-of-range.") + except Exception as e: + print(f"depth-stat failed: {e}") + + hr("E2. 3D projection — production params vs filters-disabled") + + def _project(label: str, **kw): + try: + objs = DetObject.from_2d_to_list( + detections_2d=dets_2d, + color_image=most_recent.data, + depth_image=depth_obs.data, + camera_info=info_obs.data, + camera_transform=None, + **kw, + ) + print(f"[{label}] -> {len(objs)} objects") + for o in objs: + print(f" {o.name} at ({o.center.x:.3f}, {o.center.y:.3f}, {o.center.z:.3f})") + except Exception as e: + print(f"[{label}] raised: {e}") + + # 1. exactly what find_objects uses today + _project("production (max_dist=1.0, scale=1.0, trunc=10)", + max_distance=1.0, use_aabb=True, max_obstacle_width=0.06) + # 2. distance filter off — isolates the max_distance guard + _project("max_distance=0 (filter off)", + max_distance=0.0, use_aabb=True, max_obstacle_width=0.06) + # 3. 
treat depth as millimeters — isolates the unit-scale bug + _project("depth_scale=1000 (mm->m)", + depth_scale=1000.0, depth_trunc=10.0, max_distance=0.0, + use_aabb=True, max_obstacle_width=0.06) + # 4. both: mm scale + no distance filter + generous trunc + _project("mm scale + no dist filter + trunc=50", + depth_scale=1000.0, depth_trunc=50.0, max_distance=0.0, + use_aabb=True, max_obstacle_width=0.0) + print() + print("Whichever variant returns >0 objects tells us the exact fix:") + print(" - 'max_distance=0' works -> raise/disable max_distance in LazyPerceptionModuleConfig") + print(" - 'depth_scale=1000' works -> recorder/realsense emits mm; need depth_scale=1000 in find_objects") + print(" - none work -> empty depth in bbox; sensor/exposure issue") + + hr("F. EXACT find_objects() code path (the real query LazyPerceptionModule runs)") + print("Replicates lazy_perception.py: .search(vec).filter(sim>=min_similarity)") + print(".order_by('ts',desc=True).first() -> VLM -> from_2d_to_list(config params)") + print() + MIN_SIM = 0.20 # LazyPerceptionModuleConfig.min_similarity default + print(f"min_similarity = {MIN_SIM} (the filter find_objects applies)") + try: + filtered = ( + store.streams.color_image_embedded.search(vec) + .filter(lambda o: (getattr(o, "similarity", 0) or 0) >= MIN_SIM) + .order_by("ts", desc=True) + ) + chosen = filtered.first() + sim = getattr(chosen, "similarity", None) + print(f"chosen obs: ts={chosen.ts:.3f} sim={sim} (passed the >= {MIN_SIM} filter)") + except Exception as e: + print(f"NO observation passed the min_similarity filter: {e}") + print(f" -> Every embedded frame scored < {MIN_SIM} for prompt {prompt!r}.") + print(" -> THIS is why find_objects returns nothing. Two options:") + print(" (a) prompt/scene mismatch — try a prompt that matches what the camera sees") + print(" (b) min_similarity too high for this CLIP model — lower it in spec.py:") + print(" LazyPerceptionModuleConfig.min_similarity") + print(" -> Look at section B sims: if the best is e.g. 0.21, a threshold of 0.20") + print(" barely passes; 0.25+ would reject everything.") + chosen = None + + if chosen is not None: + try: + d_obs = store.streams.depth_image.at(chosen.ts, tolerance=0.1).first() + i_obs = store.streams.camera_info.last() + vlm2 = MoondreamVlModel() + vlm2.start() + dets = vlm2.query_detections(chosen.data, prompt) + print(f"VLM on chosen frame: {len(dets.detections)} detections") + if dets.detections: + objs = DetObject.from_2d_to_list( + detections_2d=dets, + color_image=chosen.data, + depth_image=d_obs.data, + camera_info=i_obs.data, + camera_transform=None, + max_distance=1.0, + use_aabb=True, + max_obstacle_width=0.06, + ) + print(f"find_objects WOULD return {len(objs)} object(s):") + for o in objs: + print(f" {o.name} at ({o.center.x:.3f}, {o.center.y:.3f}, {o.center.z:.3f})") + if objs: + print() + print(" => find_objects() WORKS. If HumanCLI still says 'nothing',") + print(" the agent isn't CALLING find_objects (system-prompt issue),") + print(" OR it's calling get_scene_info (empty until find_objects runs).") + else: + print() + print(" => find_objects() returns []. 3D projection still failing —") + print(" check section E depth stats: is depth coherent post-codec-fix?") + else: + print(" => VLM found nothing on the chosen frame. 
find_objects returns [].") + print(" CLIP matched a frame, but Moondream can't see the object in it.") + print(" Likely: CLIP matched on background context, not the object itself.") + except Exception as e: + print(f"find_objects replication failed at projection: {e}") + + print() + print("READ THIS:") + print(" Section B -> CLIP semantic search proven (similarity numbers)") + print(" Section D -> Moondream VLM proven (bbox detections)") + print(" Section E -> depth integrity post-codec-fix (coherent values = fix worked)") + print(" Section F -> the EXACT find_objects path. If F returns objects but HumanCLI") + print(" says nothing, it's 100% an agent-prompting issue, not the stack.") + + +def main() -> None: + prompt = sys.argv[1] if len(sys.argv) > 1 else "cup" + db_path = sys.argv[2] if len(sys.argv) > 2 else "recording_xarm6.db" + LOG.unlink(missing_ok=True) + with LOG.open("w") as f, redirect_stdout(f): + print(f"perception_diag — prompt={prompt!r} db={db_path} time={time.strftime('%Y-%m-%d %H:%M:%S')}") + try: + run(prompt, db_path) + except Exception as e: + import traceback + + print(f"FATAL: {e}") + traceback.print_exc() + print() + print("=== END ===") + print(f"wrote {LOG}") + + +if __name__ == "__main__": + main() diff --git a/dimos/manipulation/pick_and_place_module.py b/dimos/manipulation/pick_and_place_module.py index 9ca223832c..613c73d10e 100644 --- a/dimos/manipulation/pick_and_place_module.py +++ b/dimos/manipulation/pick_and_place_module.py @@ -130,8 +130,24 @@ def start(self) -> None: logger.info("PickAndPlaceModule started") def _on_objects(self, objects: list[DetObject]) -> None: - """Callback when objects received from perception (runs on RxPY thread pool).""" + """Callback when objects received from perception (runs on RxPY thread pool). + + Also writes ``_detection_snapshot`` so ``pick``/``place`` can act on + publish-driven detections (e.g., LazyPerceptionModule.find_objects) + without an explicit refresh_obstacles()/look() cycle. + + Snapshot has TWO writers now: + - this callback (publish-driven, xArm6-style continuous flow) + - ``refresh_obstacles()`` (scan-driven, xArm7-style episodic flow) + For the xArm7 path with ObjectSceneRegistrationModule (publishes + only non-empty ``all_permanent``, monotonically), the callback + keeps the snapshot self-healed to the latest cached set — strictly + additive vs the previous scan→freeze semantics. The previous + "freeze at refresh time" was a workaround for cache volatility, + not a hard contract. + """ try: + self._detection_snapshot = list(objects) if self._world_monitor is not None: self._world_monitor.on_objects(objects) except Exception as e: diff --git a/dimos/robot/all_blueprints.py b/dimos/robot/all_blueprints.py index b88159dd8e..3e164b4695 100644 --- a/dimos/robot/all_blueprints.py +++ b/dimos/robot/all_blueprints.py @@ -108,6 +108,8 @@ "xarm-perception-agent": "dimos.manipulation.blueprints:xarm_perception_agent", "xarm-perception-sim": "dimos.manipulation.blueprints:xarm_perception_sim", "xarm-perception-sim-agent": "dimos.manipulation.blueprints:xarm_perception_sim_agent", + "xarm6-perception": "dimos.manipulation.blueprints:xarm6_perception", + "xarm6-perception-agent": "dimos.manipulation.blueprints:xarm6_perception_agent", "xarm6-planner-only": "dimos.manipulation.blueprints:xarm6_planner_only", "xarm7-planner-coordinator": "dimos.manipulation.blueprints:xarm7_planner_coordinator", "xarm7-planner-coordinator-agent": "dimos.manipulation.blueprints:xarm7_planner_coordinator_agent",