Migrate lab_sim CLIPSeg objectives to SAM3

griswaldbrooks · claude · griswaldbrooks · commit 87bf727ae1b9 · 2026-03-04T18:14:01.000-05:00
Replace GetMasks2DFromTextQuery (CLIPSeg) with GetMasks2DFromExemplar
(SAM3) across 5 lab_sim objectives and 2 reusable subtrees. This
simplifies the grasping pipeline from a two-stage CLIPSeg+SAM2
refinement loop to a single SAM3 call, and removes the dependency on
moveit_pro_clipseg models.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/src/lab_sim/objectives/addbottlestoplanningscene.xml b/src/lab_sim/objectives/addbottlestoplanningscene.xml
@@ -15,18 +15,20 @@
       />
       <SubTree ID="Take Wrist Camera Snapshot" _collapsed="true" />
       <SubTree
-        ID="Segment Image from Text Prompt Subtree"
+        ID="SAM3 Segment Image Subtree"
         _collapsed="true"
-        clip_model_path="models/clip.onnx"
-        clipseg_model_path="models/clipseg.onnx"
+        model_package="lab_sim"
+        encoder_model_path="models/sam3_vision_encoder.onnx"
+        decoder_model_path="models/sam3_decoder.onnx"
+        geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
+        text_encoder_model_path="models/sam3_text_encoder.onnx"
+        confidence_threshold="0.5"
         masks_visualization_topic="/masks_visualization"
-        masks2d="{masks2d}"
-        model_package="moveit_pro_clipseg"
-        prompts="conical flask"
-        threshold="0.15"
+        text_prompt="glass conical flask"
         image_topic_name="/wrist_camera/color"
-        negative_prompts="box"
-        erosion_size="10"
+        masks2d="{masks2d}"
+        confidence_scores="{confidence_scores}"
+        confidence_scores_str="{confidence_scores_str}"
       />
       <Action
         ID="GetPointCloud"
diff --git a/src/lab_sim/objectives/get_candidate_grasps_subtree.xml b/src/lab_sim/objectives/get_candidate_grasps_subtree.xml
@@ -15,6 +15,29 @@
         message_out="{image}"
         publisher_timeout_sec="5.000000"
       />
+      <Action
+        ID="GetMasks2DFromExemplar"
+        target_image="{image}"
+        text_prompt="{text_prompt}"
+        model_package="lab_sim"
+        encoder_model_path="models/sam3_vision_encoder.onnx"
+        decoder_model_path="models/sam3_decoder.onnx"
+        geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
+        text_encoder_model_path="models/sam3_text_encoder.onnx"
+        confidence_threshold="{confidence_threshold}"
+        masks2d="{masks2d}"
+        confidence_scores="{confidence_scores}"
+        confidence_scores_str="{confidence_scores_str}"
+      />
+      <Action
+        ID="PublishMask2D"
+        image="{image}"
+        masks="{masks2d}"
+        masks_visualization_topic="/masks_visualization"
+        opacity="0.500000"
+        bounding_box_labels="{confidence_scores_str}"
+        name="Show SAM3 masks"
+      />
       <Action
         ID="GetPointCloud"
         topic_name="{camera_points_topic}"
@@ -29,64 +52,11 @@
         message_timeout_sec="5.000000"
         publisher_timeout_sec="5.000000"
       />
-      <Action
-        ID="GetMasks2DFromTextQuery"
-        image="{image}"
-        masks2d="{masks2d}"
-        prompts="{object_prompt}"
-        threshold="{mask_threshold}"
-        clip_model_path="models/clip.onnx"
-        clipseg_model_path="models/clipseg.onnx"
-        model_package="moveit_pro_clipseg"
-        erosion_size="{mask_erosion}"
-      />
-      <Action
-        ID="PublishMask2D"
-        image="{image}"
-        masks="{masks2d}"
-        masks_visualization_topic="/masks_visualization"
-        opacity="0.500000"
-        bounding_box_detection_class="{object_prompt}"
-        name="Show ClipSeg masks"
-      />
-      <Decorator
-        ID="ForEachUntilSuccess"
-        index="{index}"
-        out="{input_mask}"
-        vector_in="{masks2d}"
-        name="Refine the first successful mask from ClipSeg"
-      >
-        <Control ID="Sequence">
-          <Action
-            ID="GetCenterFromMask2D"
-            center="{center2d}"
-            mask="{input_mask}"
-          />
-          <Action
-            ID="GetMasks2DFromPointQuery"
-            image="{image}"
-            masks2d="{refined_masks}"
-            pixel_coords="{center2d}"
-            decoder_model_path="models/decoder.onnx"
-            encoder_model_path="models/sam2_hiera_large_encoder.onnx"
-            model_package="lab_sim"
-          />
-        </Control>
-      </Decorator>
-      <Action
-        ID="PublishMask2D"
-        image="{image}"
-        masks="{refined_masks}"
-        masks_visualization_topic="/masks_visualization"
-        opacity="0.500000"
-        bounding_box_detection_class="{object_prompt}"
-        name="Show SAM2 mask"
-      />
       <Control ID="Sequence" name="Convert 2D masks to segmented pointcloud">
         <Action
           ID="GetMasks3DFromMasks2D"
           camera_info="{camera_info}"
-          masks2d="{refined_masks}"
+          masks2d="{masks2d}"
           point_cloud="{point_cloud}"
           masks3d="{masks3d}"
         />
@@ -133,10 +103,9 @@
       <inout_port name="camera_image_topic" default="{camera_image_topic}" />
       <inout_port name="camera_info_topic" default="{camera_info_topic}" />
       <inout_port name="camera_points_topic" default="{camera_points_topic}" />
+      <inout_port name="confidence_threshold" default="{confidence_threshold}" />
       <inout_port name="grasps" default="{grasps}" />
-      <inout_port name="mask_erosion" default="{mask_erosion}" />
-      <inout_port name="mask_threshold" default="{mask_threshold}" />
-      <inout_port name="object_prompt" default="{object_prompt}" />
+      <inout_port name="text_prompt" default="{text_prompt}" />
     </SubTree>
   </TreeNodesModel>
 </root>
diff --git a/src/lab_sim/objectives/get_grasp_from_text_prompt_subtree.xml b/src/lab_sim/objectives/get_grasp_from_text_prompt_subtree.xml
@@ -16,10 +16,9 @@
         camera_image_topic="{camera_image_topic}"
         camera_info_topic="{camera_info_topic}"
         camera_points_topic="{camera_points_topic}"
-        object_prompt="{object_prompt}"
+        text_prompt="{text_prompt}"
         grasps="{grasps}"
-        mask_erosion="{mask_erosion}"
-        mask_threshold="{mask_threshold}"
+        confidence_threshold="{confidence_threshold}"
         name="Infer grasp poses from wrist camera"
       />
       <SubTree
@@ -48,11 +47,10 @@
       <inout_port name="camera_image_topic" default="{camera_image_topic}" />
       <inout_port name="camera_info_topic" default="{camera_info_topic}" />
       <inout_port name="camera_points_topic" default="{camera_points_topic}" />
-      <inout_port name="mask_erosion" default="{mask_erosion}" />
-      <inout_port name="mask_threshold" default="{mask_threshold}" />
-      <inout_port name="object_prompt" default="{object_prompt}" />
+      <inout_port name="confidence_threshold" default="{confidence_threshold}" />
       <inout_port name="output_grasp" default="{output_grasp}" />
       <inout_port name="planning_group" default="{planning_group}" />
+      <inout_port name="text_prompt" default="{text_prompt}" />
     </SubTree>
   </TreeNodesModel>
 </root>
diff --git a/src/lab_sim/objectives/ml_segment_image.xml b/src/lab_sim/objectives/ml_segment_image.xml
@@ -12,17 +12,20 @@
         primary_view_name="/masks_visualization"
       />
       <SubTree
-        ID="Segment Image from No Negative Text Prompt Subtree"
+        ID="SAM3 Segment Image Subtree"
         _collapsed="true"
-        clip_model_path="models/clip.onnx"
-        clipseg_model_path="models/clipseg.onnx"
-        erosion_size="2"
+        model_package="lab_sim"
+        encoder_model_path="models/sam3_vision_encoder.onnx"
+        decoder_model_path="models/sam3_decoder.onnx"
+        geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
+        text_encoder_model_path="models/sam3_text_encoder.onnx"
+        confidence_threshold="0.5"
         masks_visualization_topic="/masks_visualization"
-        model_package="moveit_pro_clipseg"
-        threshold="0.15"
-        prompts="an object"
+        text_prompt="an object"
         image_topic_name="/wrist_camera/color"
         masks2d="{masks2d}"
+        confidence_scores="{confidence_scores}"
+        confidence_scores_str="{confidence_scores_str}"
       />
     </Control>
   </BehaviorTree>
diff --git a/src/lab_sim/objectives/ml_segment_point_cloud.xml b/src/lab_sim/objectives/ml_segment_point_cloud.xml
@@ -1,38 +1,41 @@
 <?xml version="1.0" encoding="UTF-8" ?>
 <root BTCPP_format="4" main_tree_to_execute="ML Segment Point Cloud">
   <!--//////////-->
   <BehaviorTree
     ID="ML Segment Point Cloud"
     _description="Captures a point cloud and segments out all possible objects from a text prompt"
     _favorite="false"
   >
     <Control ID="Sequence">
+      <!-- All segmentation work is delegated to the SAM3 point cloud subtree, using the SAM3 ONNX models from the lab_sim package. -->
       <SubTree
-        ID="Segment Point Cloud from Text Prompt Subtree"
-        _collapsed="true"
-        camera_topic_name="/wrist_camera/camera_info"
-        decoder_model_path="models/decoder.onnx"
-        encoder_model_path="models/sam2_hiera_large_encoder.onnx"
-        image_topic_name="/wrist_camera/color"
+        ID="SAM3 Segment Point Cloud Subtree"
+        _collapsed="false"
+        model_package="lab_sim"
+        encoder_model_path="models/sam3_vision_encoder.onnx"
+        decoder_model_path="models/sam3_decoder.onnx"
+        geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
+        text_encoder_model_path="models/sam3_text_encoder.onnx"
+        confidence_threshold="0.5"
         masks_visualization_topic="/masks_visualization"
-        model_package="moveit_pro_clipseg"
+        text_prompt="an object"
+        image_topic_name="/wrist_camera/color"
         points_topic_name="/wrist_camera/points"
-        clip_model_path="models/clip.onnx"
-        clipseg_model_path="models/clipseg.onnx"
-        erosion_size="2"
-        prompts="an object"
-        threshold="0.28"
+        camera_topic_name="/wrist_camera/camera_info"
+        masks2d="{masks2d}"
         masks3d="{masks3d}"
         point_cloud="{point_cloud}"
         point_cloud_vector="{point_cloud_vector}"
+        confidence_scores="{confidence_scores}"
+        confidence_scores_str="{confidence_scores_str}"
       />
     </Control>
   </BehaviorTree>
   <TreeNodesModel>
     <SubTree ID="ML Segment Point Cloud">
       <MetadataFields>
         <Metadata runnable="true" />
         <Metadata subcategory="Application - ML (GPU Recommended)" />
       </MetadataFields>
     </SubTree>
   </TreeNodesModel>
diff --git a/src/lab_sim/objectives/pick_1_pill_bottle.xml b/src/lab_sim/objectives/pick_1_pill_bottle.xml
@@ -16,23 +16,25 @@
         waypoint_name="Above Pick Cube"
       />
       <SubTree
-        ID="Segment Point Cloud from Text Prompt Subtree"
+        ID="SAM3 Segment Point Cloud Subtree"
         _collapsed="false"
-        camera_topic_name="/wrist_camera/camera_info"
-        decoder_model_path="models/decoder.onnx"
-        encoder_model_path="models/sam2_hiera_large_encoder.onnx"
-        image_topic_name="/wrist_camera/color"
+        model_package="lab_sim"
+        encoder_model_path="models/sam3_vision_encoder.onnx"
+        decoder_model_path="models/sam3_decoder.onnx"
+        geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
+        text_encoder_model_path="models/sam3_text_encoder.onnx"
+        confidence_threshold="0.5"
         masks_visualization_topic="/masks_visualization"
-        model_package="moveit_pro_clipseg"
+        text_prompt="a cylindrical object with a dark gray and silver metallic body, featuring a circular blue cap on top"
+        image_topic_name="/wrist_camera/color"
         points_topic_name="/wrist_camera/points"
-        clip_model_path="models/clip.onnx"
-        clipseg_model_path="models/clipseg.onnx"
-        erosion_size="2"
-        prompts="a cylindrical object with a dark gray and silver metallic body, featuring a circular blue cap on top"
-        threshold="0.28"
+        camera_topic_name="/wrist_camera/camera_info"
+        masks2d="{masks2d}"
         masks3d="{masks3d}"
         point_cloud="{point_cloud}"
         point_cloud_vector="{point_cloud_vector}"
+        confidence_scores="{confidence_scores}"
+        confidence_scores_str="{confidence_scores_str}"
       />
       <Decorator
         ID="ForEach"
diff --git a/src/lab_sim/objectives/pick_all_pill_bottles.xml b/src/lab_sim/objectives/pick_all_pill_bottles.xml
@@ -58,9 +58,8 @@
               camera_points_topic="/wrist_camera/points"
               planning_group="manipulator"
               output_grasp="{output_grasp}"
-              object_prompt="a cylindrical object with a dark gray and silver metallic body, featuring a circular blue cap on top"
-              mask_threshold=".15"
-              mask_erosion="15"
+              text_prompt="a cylindrical object with a dark gray and silver metallic body, featuring a circular blue cap on top"
+              confidence_threshold="0.5"
               name="Get grasp pose for a pill bottle"
             />
             <SubTree ID="Add Table to Planning Scene" _collapsed="true" />
diff --git a/src/lab_sim/objectives/sam3_segment_image_subtree.xml b/src/lab_sim/objectives/sam3_segment_image_subtree.xml
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<root BTCPP_format="4" main_tree_to_execute="SAM3 Segment Image Subtree"> <!-- Subtree with three child actions: GetImage, then GetMasks2DFromExemplar, then PublishMask2D. -->
+  <!--//////////-->
+  <BehaviorTree
+    ID="SAM3 Segment Image Subtree"
+    _description="Captures an image and runs SAM3 to produce 2D masks from a text prompt."
+    _favorite="false"
+  >
+    <Control ID="Sequence">
+      <Action
+        ID="GetImage"
+        message_timeout_sec="5.000000"
+        topic_name="{image_topic_name}"
+        message_out="{image}"
+      /> <!-- Reads from the topic named by {image_topic_name} and stores the message in {image}. -->
+      <Action
+        ID="GetMasks2DFromExemplar"
+        target_image="{image}"
+        text_prompt="{text_prompt}"
+        model_package="{model_package}"
+        encoder_model_path="{encoder_model_path}"
+        decoder_model_path="{decoder_model_path}"
+        geometry_encoder_model_path="{geometry_encoder_model_path}"
+        text_encoder_model_path="{text_encoder_model_path}"
+        confidence_threshold="{confidence_threshold}"
+        masks2d="{masks2d}"
+        confidence_scores="{confidence_scores}"
+        confidence_scores_str="{confidence_scores_str}"
+        mask_count="{mask_count}"
+      /> <!-- Prompt, model locations and threshold all come from this subtree's input ports; nothing is hard-coded here. -->
+      <Action
+        ID="PublishMask2D"
+        image="{image}"
+        masks="{masks2d}"
+        masks_visualization_topic="{masks_visualization_topic}"
+        opacity="0.500000"
+        bounding_box_labels="{confidence_scores_str}"
+      /> <!-- Passes {image} and {masks2d} to PublishMask2D, with {confidence_scores_str} as the bounding box labels. -->
+    </Control>
+  </BehaviorTree>
+  <TreeNodesModel>
+    <SubTree ID="SAM3 Segment Image Subtree">
+      <MetadataFields>
+        <Metadata runnable="false" /> <!-- Marked non-runnable; other objectives reference it through a SubTree node with this ID. -->
+        <Metadata subcategory="Perception - ML" />
+      </MetadataFields>
+      <input_port name="image_topic_name" /> <!-- Forwarded to GetImage topic_name. -->
+      <input_port name="text_prompt" /> <!-- Forwarded to GetMasks2DFromExemplar text_prompt. -->
+      <input_port name="model_package" /> <!-- The four *_model_path ports below are presumably resolved relative to this package; TODO confirm. -->
+      <input_port name="encoder_model_path" />
+      <input_port name="decoder_model_path" />
+      <input_port name="geometry_encoder_model_path" />
+      <input_port name="text_encoder_model_path" />
+      <input_port name="confidence_threshold" /> <!-- No default is declared, so every caller has to supply a value. -->
+      <input_port name="masks_visualization_topic" /> <!-- Forwarded to PublishMask2D. -->
+      <output_port name="masks2d" default="{masks2d}" />
+      <output_port name="confidence_scores" default="{confidence_scores}" />
+      <output_port name="confidence_scores_str" default="{confidence_scores_str}" />
+      <output_port name="mask_count" default="{mask_count}" /> <!-- NOTE(review): no caller visible in this change remaps mask_count; confirm it is still needed. -->
+    </SubTree>
+  </TreeNodesModel>
+</root>
diff --git a/src/lab_sim/objectives/sam3_segment_point_cloud_subtree.xml b/src/lab_sim/objectives/sam3_segment_point_cloud_subtree.xml