Skip to content

Commit 87bf727

Browse files
Migrate lab_sim CLIPSeg objectives to SAM3
Replace GetMasks2DFromTextQuery (CLIPSeg) with GetMasks2DFromExemplar (SAM3) across 5 lab_sim objectives and 2 reusable subtrees. This simplifies the grasping pipeline from a two-stage CLIPSeg+SAM2 refinement loop to a single SAM3 call, and removes the dependency on moveit_pro_clipseg models.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a014161 commit 87bf727

9 files changed

Lines changed: 227 additions & 120 deletions

src/lab_sim/objectives/addbottlestoplanningscene.xml

Lines changed: 11 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -15,18 +15,20 @@
1515
/>
1616
<SubTree ID="Take Wrist Camera Snapshot" _collapsed="true" />
1717
<SubTree
18-
ID="Segment Image from Text Prompt Subtree"
18+
ID="SAM3 Segment Image Subtree"
1919
_collapsed="true"
20-
clip_model_path="models/clip.onnx"
21-
clipseg_model_path="models/clipseg.onnx"
20+
model_package="lab_sim"
21+
encoder_model_path="models/sam3_vision_encoder.onnx"
22+
decoder_model_path="models/sam3_decoder.onnx"
23+
geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
24+
text_encoder_model_path="models/sam3_text_encoder.onnx"
25+
confidence_threshold="0.5"
2226
masks_visualization_topic="/masks_visualization"
23-
masks2d="{masks2d}"
24-
model_package="moveit_pro_clipseg"
25-
prompts="conical flask"
26-
threshold="0.15"
27+
text_prompt="glass conical flask"
2728
image_topic_name="/wrist_camera/color"
28-
negative_prompts="box"
29-
erosion_size="10"
29+
masks2d="{masks2d}"
30+
confidence_scores="{confidence_scores}"
31+
confidence_scores_str="{confidence_scores_str}"
3032
/>
3133
<Action
3234
ID="GetPointCloud"

src/lab_sim/objectives/get_candidate_grasps_subtree.xml

Lines changed: 26 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,29 @@
1515
message_out="{image}"
1616
publisher_timeout_sec="5.000000"
1717
/>
18+
<Action
19+
ID="GetMasks2DFromExemplar"
20+
target_image="{image}"
21+
text_prompt="{text_prompt}"
22+
model_package="lab_sim"
23+
encoder_model_path="models/sam3_vision_encoder.onnx"
24+
decoder_model_path="models/sam3_decoder.onnx"
25+
geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
26+
text_encoder_model_path="models/sam3_text_encoder.onnx"
27+
confidence_threshold="{confidence_threshold}"
28+
masks2d="{masks2d}"
29+
confidence_scores="{confidence_scores}"
30+
confidence_scores_str="{confidence_scores_str}"
31+
/>
32+
<Action
33+
ID="PublishMask2D"
34+
image="{image}"
35+
masks="{masks2d}"
36+
masks_visualization_topic="/masks_visualization"
37+
opacity="0.500000"
38+
bounding_box_labels="{confidence_scores_str}"
39+
name="Show SAM3 masks"
40+
/>
1841
<Action
1942
ID="GetPointCloud"
2043
topic_name="{camera_points_topic}"
@@ -29,64 +52,11 @@
2952
message_timeout_sec="5.000000"
3053
publisher_timeout_sec="5.000000"
3154
/>
32-
<Action
33-
ID="GetMasks2DFromTextQuery"
34-
image="{image}"
35-
masks2d="{masks2d}"
36-
prompts="{object_prompt}"
37-
threshold="{mask_threshold}"
38-
clip_model_path="models/clip.onnx"
39-
clipseg_model_path="models/clipseg.onnx"
40-
model_package="moveit_pro_clipseg"
41-
erosion_size="{mask_erosion}"
42-
/>
43-
<Action
44-
ID="PublishMask2D"
45-
image="{image}"
46-
masks="{masks2d}"
47-
masks_visualization_topic="/masks_visualization"
48-
opacity="0.500000"
49-
bounding_box_detection_class="{object_prompt}"
50-
name="Show ClipSeg masks"
51-
/>
52-
<Decorator
53-
ID="ForEachUntilSuccess"
54-
index="{index}"
55-
out="{input_mask}"
56-
vector_in="{masks2d}"
57-
name="Refine the first successful mask from ClipSeg"
58-
>
59-
<Control ID="Sequence">
60-
<Action
61-
ID="GetCenterFromMask2D"
62-
center="{center2d}"
63-
mask="{input_mask}"
64-
/>
65-
<Action
66-
ID="GetMasks2DFromPointQuery"
67-
image="{image}"
68-
masks2d="{refined_masks}"
69-
pixel_coords="{center2d}"
70-
decoder_model_path="models/decoder.onnx"
71-
encoder_model_path="models/sam2_hiera_large_encoder.onnx"
72-
model_package="lab_sim"
73-
/>
74-
</Control>
75-
</Decorator>
76-
<Action
77-
ID="PublishMask2D"
78-
image="{image}"
79-
masks="{refined_masks}"
80-
masks_visualization_topic="/masks_visualization"
81-
opacity="0.500000"
82-
bounding_box_detection_class="{object_prompt}"
83-
name="Show SAM2 mask"
84-
/>
8555
<Control ID="Sequence" name="Convert 2D masks to segmented pointcloud">
8656
<Action
8757
ID="GetMasks3DFromMasks2D"
8858
camera_info="{camera_info}"
89-
masks2d="{refined_masks}"
59+
masks2d="{masks2d}"
9060
point_cloud="{point_cloud}"
9161
masks3d="{masks3d}"
9262
/>
@@ -133,10 +103,9 @@
133103
<inout_port name="camera_image_topic" default="{camera_image_topic}" />
134104
<inout_port name="camera_info_topic" default="{camera_info_topic}" />
135105
<inout_port name="camera_points_topic" default="{camera_points_topic}" />
106+
<inout_port name="confidence_threshold" default="{confidence_threshold}" />
136107
<inout_port name="grasps" default="{grasps}" />
137-
<inout_port name="mask_erosion" default="{mask_erosion}" />
138-
<inout_port name="mask_threshold" default="{mask_threshold}" />
139-
<inout_port name="object_prompt" default="{object_prompt}" />
108+
<inout_port name="text_prompt" default="{text_prompt}" />
140109
</SubTree>
141110
</TreeNodesModel>
142111
</root>

src/lab_sim/objectives/get_grasp_from_text_prompt_subtree.xml

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -16,10 +16,9 @@
1616
camera_image_topic="{camera_image_topic}"
1717
camera_info_topic="{camera_info_topic}"
1818
camera_points_topic="{camera_points_topic}"
19-
object_prompt="{object_prompt}"
19+
text_prompt="{text_prompt}"
2020
grasps="{grasps}"
21-
mask_erosion="{mask_erosion}"
22-
mask_threshold="{mask_threshold}"
21+
confidence_threshold="{confidence_threshold}"
2322
name="Infer grasp poses from wrist camera"
2423
/>
2524
<SubTree
@@ -48,11 +47,10 @@
4847
<inout_port name="camera_image_topic" default="{camera_image_topic}" />
4948
<inout_port name="camera_info_topic" default="{camera_info_topic}" />
5049
<inout_port name="camera_points_topic" default="{camera_points_topic}" />
51-
<inout_port name="mask_erosion" default="{mask_erosion}" />
52-
<inout_port name="mask_threshold" default="{mask_threshold}" />
53-
<inout_port name="object_prompt" default="{object_prompt}" />
50+
<inout_port name="confidence_threshold" default="{confidence_threshold}" />
5451
<inout_port name="output_grasp" default="{output_grasp}" />
5552
<inout_port name="planning_group" default="{planning_group}" />
53+
<inout_port name="text_prompt" default="{text_prompt}" />
5654
</SubTree>
5755
</TreeNodesModel>
5856
</root>

src/lab_sim/objectives/ml_segment_image.xml

Lines changed: 10 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -12,17 +12,20 @@
1212
primary_view_name="/masks_visualization"
1313
/>
1414
<SubTree
15-
ID="Segment Image from No Negative Text Prompt Subtree"
15+
ID="SAM3 Segment Image Subtree"
1616
_collapsed="true"
17-
clip_model_path="models/clip.onnx"
18-
clipseg_model_path="models/clipseg.onnx"
19-
erosion_size="2"
17+
model_package="lab_sim"
18+
encoder_model_path="models/sam3_vision_encoder.onnx"
19+
decoder_model_path="models/sam3_decoder.onnx"
20+
geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
21+
text_encoder_model_path="models/sam3_text_encoder.onnx"
22+
confidence_threshold="0.5"
2023
masks_visualization_topic="/masks_visualization"
21-
model_package="moveit_pro_clipseg"
22-
threshold="0.15"
23-
prompts="an object"
24+
text_prompt="an object"
2425
image_topic_name="/wrist_camera/color"
2526
masks2d="{masks2d}"
27+
confidence_scores="{confidence_scores}"
28+
confidence_scores_str="{confidence_scores_str}"
2629
/>
2730
</Control>
2831
</BehaviorTree>

src/lab_sim/objectives/ml_segment_point_cloud.xml

Lines changed: 5 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -1,38 +1,16 @@
1-
<?xml version="1.0" encoding="UTF-8" ?>
1+
<?xml version='1.0' encoding='UTF-8'?>
22
<root BTCPP_format="4" main_tree_to_execute="ML Segment Point Cloud">
33
<!--//////////-->
4-
<BehaviorTree
5-
ID="ML Segment Point Cloud"
6-
_description="Captures a point cloud and segments out all possible objects from a text prompt"
7-
_favorite="false"
8-
>
4+
<BehaviorTree ID="ML Segment Point Cloud" _description="Captures a point cloud and segments out all possible objects from a text prompt" _favorite="false">
95
<Control ID="Sequence">
10-
<SubTree
11-
ID="Segment Point Cloud from Text Prompt Subtree"
12-
_collapsed="true"
13-
camera_topic_name="/wrist_camera/camera_info"
14-
decoder_model_path="models/decoder.onnx"
15-
encoder_model_path="models/sam2_hiera_large_encoder.onnx"
16-
image_topic_name="/wrist_camera/color"
17-
masks_visualization_topic="/masks_visualization"
18-
model_package="moveit_pro_clipseg"
19-
points_topic_name="/wrist_camera/points"
20-
clip_model_path="models/clip.onnx"
21-
clipseg_model_path="models/clipseg.onnx"
22-
erosion_size="2"
23-
prompts="an object"
24-
threshold="0.28"
25-
masks3d="{masks3d}"
26-
point_cloud="{point_cloud}"
27-
point_cloud_vector="{point_cloud_vector}"
28-
/>
6+
<SubTree ID="SAM3 Segment Point Cloud Subtree" _collapsed="false" model_package="lab_sim" encoder_model_path="models/sam3_vision_encoder.onnx" decoder_model_path="models/sam3_decoder.onnx" geometry_encoder_model_path="models/sam3_geometry_encoder.onnx" text_encoder_model_path="models/sam3_text_encoder.onnx" confidence_threshold="0.5" masks_visualization_topic="/masks_visualization" text_prompt="an object" image_topic_name="/wrist_camera/color" points_topic_name="/wrist_camera/points" camera_topic_name="/wrist_camera/camera_info" masks2d="{masks2d}" masks3d="{masks3d}" point_cloud="{point_cloud}" point_cloud_vector="{point_cloud_vector}" confidence_scores="{confidence_scores}" confidence_scores_str="{confidence_scores_str}"/>
297
</Control>
308
</BehaviorTree>
319
<TreeNodesModel>
3210
<SubTree ID="ML Segment Point Cloud">
3311
<MetadataFields>
34-
<Metadata runnable="true" />
35-
<Metadata subcategory="Application - ML (GPU Recommended)" />
12+
<Metadata runnable="true"/>
13+
<Metadata subcategory="Application - ML (GPU Recommended)"/>
3614
</MetadataFields>
3715
</SubTree>
3816
</TreeNodesModel>

src/lab_sim/objectives/pick_1_pill_bottle.xml

Lines changed: 13 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -16,23 +16,25 @@
1616
waypoint_name="Above Pick Cube"
1717
/>
1818
<SubTree
19-
ID="Segment Point Cloud from Text Prompt Subtree"
19+
ID="SAM3 Segment Point Cloud Subtree"
2020
_collapsed="false"
21-
camera_topic_name="/wrist_camera/camera_info"
22-
decoder_model_path="models/decoder.onnx"
23-
encoder_model_path="models/sam2_hiera_large_encoder.onnx"
24-
image_topic_name="/wrist_camera/color"
21+
model_package="lab_sim"
22+
encoder_model_path="models/sam3_vision_encoder.onnx"
23+
decoder_model_path="models/sam3_decoder.onnx"
24+
geometry_encoder_model_path="models/sam3_geometry_encoder.onnx"
25+
text_encoder_model_path="models/sam3_text_encoder.onnx"
26+
confidence_threshold="0.5"
2527
masks_visualization_topic="/masks_visualization"
26-
model_package="moveit_pro_clipseg"
28+
text_prompt="a cylindrical object with a dark gray and silver metallic body, featuring a circular blue cap on top"
29+
image_topic_name="/wrist_camera/color"
2730
points_topic_name="/wrist_camera/points"
28-
clip_model_path="models/clip.onnx"
29-
clipseg_model_path="models/clipseg.onnx"
30-
erosion_size="2"
31-
prompts="a cylindrical object with a dark gray and silver metallic body, featuring a circular blue cap on top"
32-
threshold="0.28"
31+
camera_topic_name="/wrist_camera/camera_info"
32+
masks2d="{masks2d}"
3333
masks3d="{masks3d}"
3434
point_cloud="{point_cloud}"
3535
point_cloud_vector="{point_cloud_vector}"
36+
confidence_scores="{confidence_scores}"
37+
confidence_scores_str="{confidence_scores_str}"
3638
/>
3739
<Decorator
3840
ID="ForEach"

src/lab_sim/objectives/pick_all_pill_bottles.xml

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -58,9 +58,8 @@
5858
camera_points_topic="/wrist_camera/points"
5959
planning_group="manipulator"
6060
output_grasp="{output_grasp}"
61-
object_prompt="a cylindrical object with a dark gray and silver metallic body, featuring a circular blue cap on top"
62-
mask_threshold=".15"
63-
mask_erosion="15"
61+
text_prompt="a cylindrical object with a dark gray and silver metallic body, featuring a circular blue cap on top"
62+
confidence_threshold="0.5"
6463
name="Get grasp pose for a pill bottle"
6564
/>
6665
<SubTree ID="Add Table to Planning Scene" _collapsed="true" />
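(File path not captured in this view: newly added file defining the "SAM3 Segment Image Subtree" behavior tree.)

Lines changed: 62 additions & 0 deletions
Original file line number | Diff line number | Diff line change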
@@ -0,0 +1,62 @@
1+
<?xml version="1.0" encoding="UTF-8" ?>
2+
<root BTCPP_format="4" main_tree_to_execute="SAM3 Segment Image Subtree">
3+
<!--//////////-->
4+
<BehaviorTree
5+
ID="SAM3 Segment Image Subtree"
6+
_description="Captures an image and runs SAM3 to produce 2D masks from a text prompt."
7+
_favorite="false"
8+
>
9+
<Control ID="Sequence">
10+
<Action
11+
ID="GetImage"
12+
message_timeout_sec="5.000000"
13+
topic_name="{image_topic_name}"
14+
message_out="{image}"
15+
/>
16+
<Action
17+
ID="GetMasks2DFromExemplar"
18+
target_image="{image}"
19+
text_prompt="{text_prompt}"
20+
model_package="{model_package}"
21+
encoder_model_path="{encoder_model_path}"
22+
decoder_model_path="{decoder_model_path}"
23+
geometry_encoder_model_path="{geometry_encoder_model_path}"
24+
text_encoder_model_path="{text_encoder_model_path}"
25+
confidence_threshold="{confidence_threshold}"
26+
masks2d="{masks2d}"
27+
confidence_scores="{confidence_scores}"
28+
confidence_scores_str="{confidence_scores_str}"
29+
mask_count="{mask_count}"
30+
/>
31+
<Action
32+
ID="PublishMask2D"
33+
image="{image}"
34+
masks="{masks2d}"
35+
masks_visualization_topic="{masks_visualization_topic}"
36+
opacity="0.500000"
37+
bounding_box_labels="{confidence_scores_str}"
38+
/>
39+
</Control>
40+
</BehaviorTree>
41+
<TreeNodesModel>
42+
<SubTree ID="SAM3 Segment Image Subtree">
43+
<MetadataFields>
44+
<Metadata runnable="false" />
45+
<Metadata subcategory="Perception - ML" />
46+
</MetadataFields>
47+
<input_port name="image_topic_name" />
48+
<input_port name="text_prompt" />
49+
<input_port name="model_package" />
50+
<input_port name="encoder_model_path" />
51+
<input_port name="decoder_model_path" />
52+
<input_port name="geometry_encoder_model_path" />
53+
<input_port name="text_encoder_model_path" />
54+
<input_port name="confidence_threshold" />
55+
<input_port name="masks_visualization_topic" />
56+
<output_port name="masks2d" default="{masks2d}" />
57+
<output_port name="confidence_scores" default="{confidence_scores}" />
58+
<output_port name="confidence_scores_str" default="{confidence_scores_str}" />
59+
<output_port name="mask_count" default="{mask_count}" />
60+
</SubTree>
61+
</TreeNodesModel>
62+
</root>

0 commit comments

Comments
 (0)