sbp_page/index.html at main · ExistentialRobotics/sbp_page · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
<!DOCTYPE html>
<html lang="en" data-theme="light">
<head>
    <base target="_blank">
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Seeing the Bigger Picture: 3D Latent Mapping for Mobile Manipulation Policy Learning</title>

    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.9.3/css/bulma.min.css">
    <link rel="stylesheet" href="./static/css/index.css">
    <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">

    <!-- Set initial theme before first paint -->
    <script>
        // Inline and synchronous on purpose: reads the OS colour-scheme
        // preference and stamps data-theme on <html> while still in <head>.
        (() => {
            const darkQuery = window.matchMedia('(prefers-color-scheme: dark)');
            const initialTheme = darkQuery.matches ? 'dark' : 'light';
            document.documentElement.setAttribute('data-theme', initialTheme);
        })();
    </script>

    <script src="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/5.15.4/js/all.min.js"></script>
</head>
<body>
    <section class="section">
        <div class="container">
            <!-- Icon-only theme switch: aria-label supplies the accessible name the icon cannot,
                 and type="button" states explicitly that this is an in-page action. -->
            <button type="button" class="button dark-mode-toggle is-medium" onclick="toggleDarkMode()" aria-label="Toggle dark mode">
                <span class="icon">
                    <i class="fas fa-moon" id="theme-icon"></i>
                </span>
            </button>
            <div class="content">

                <h1 class="title is-3 is-size-4-mobile mt-0">Seeing the Bigger Picture:<br>3D Latent Mapping for Mobile Manipulation Policy Learning</h1>


                <!-- Venue, award, author list and affiliations. -->
                <div class="author-info">
                    <p class="is-size-5 mb-1" style="font-weight: bold;">
                        <a href="https://www.icra2026.org/" target="_blank">IEEE International Conference on Robotics and Automation (ICRA) 2026</a>
                    </p>
                    <p class="award-text is-size-5 mb-0">
                        Best Paper Nomination @ RoboReps Workshop RSS 2025
                    </p>
                    <!-- Entries are inline-block <span>s rather than <div>s: a <div> start tag
                         implicitly closes an open <p>, which would leave this paragraph empty. -->
                    <p class="subtitle mt-3">
                        <span class="is-inline-block is-size-5"><a class="has-text-weight-semibold" href="https://sunghwan.me/">Sunghwan Kim</a><sup>1</sup>,</span>
                        <span class="is-inline-block is-size-5"><a class="has-text-weight-semibold" href="https://mastee7.github.io/">Woojeh Chung</a><sup>1</sup>,</span>
                        <span class="is-inline-block is-size-5"><a class="has-text-weight-semibold" href="https://daizhirui.github.io/">Zhirui Dai</a><sup>1</sup>,</span>
                        <span class="is-inline-block is-size-5"><a class="has-text-weight-semibold" href="https://dwaitbhatt.com/">Dwait Bhatt</a><sup>1</sup>,</span>
                        <span class="is-inline-block is-size-5"><a class="has-text-weight-semibold" href="https://arth.website/">Arth Shukla</a><sup>1</sup>,</span>
                        <br>
                        <span class="is-inline-block is-size-5"><a class="has-text-weight-semibold" href="https://cseweb.ucsd.edu/~haosu/">Hao Su</a><sup>1</sup>,</span>
                        <span class="is-inline-block is-size-5"><a class="has-text-weight-semibold" href="https://www.tianyulun.com/">Yulun Tian</a><sup>2</sup>,</span>
                        <span class="is-inline-block is-size-5"><a class="has-text-weight-semibold" href="https://natanaso.github.io/">Nikolay Atanasov</a><sup>1</sup></span>
                    </p>
                    <!-- Affiliations keyed by the superscript numbers used in the author list. -->
                    <p>
                        <span class="is-inline-block"><sup>1</sup><a href="https://ucsd.edu/">UC San Diego</a>,</span>
                        <span class="is-inline-block"><sup>2</sup><a href="https://umich.edu/">University of Michigan</a></span>
                    </p>
                </div>

                <!-- Resource links: arXiv paper, YouTube video, in-page BibTeX anchor, and code repository. -->
                <div class="columns is-vcentered mt-1">
                    <div class="column is-narrow">
                        <a href="https://arxiv.org/abs/2510.03885" class="button is-size-5 mr-1 mt-2">
                            <span class="icon">
                                <i class="ai ai-arxiv"></i>
                            </span>
                            <span>arXiv</span>
                        </a>
                        <a href="https://youtu.be/wJpM9yQ_Iuw" class="button is-size-5 mr-1 mt-2">
                            <span class="icon">
                                <i class="fab fa-youtube"></i>
                            </span>
                            <span>Video</span>
                        </a>
                        <!-- target="_self" overrides the page-wide <base target="_blank"> so this
                             in-page anchor navigates within the current tab. -->
                        <a href="#BibTeX" target="_self" class="button is-size-5 mr-1 mt-2">
                            <span class="icon">
                                <i class="fas fa-book"></i>
                            </span>
                            <span>BibTeX</span>
                        </a>
                        <a href="https://github.com/ExistentialRobotics/SBP" class="button is-size-5 mr-1 mt-2">
                            <span class="icon">
                                <i class="fab fa-github"></i>
                            </span>
                            <span>Code</span>
                        </a>
                    </div>
                </div>

                <div class="video-card mt-4" style="max-width: 80%; margin-left: auto; margin-right: auto;">
                    <img src="./static/images/teaser.png" alt="3D Latent Map Teaser">
                </div>
                <p class="has-text-centered mt-3" style="font-size: 0.95em;">
                    SBP (Seeing the Bigger Picture) leverages 3D maps as spatiotemporal memory for learning manipulation policies.
                </p>

                <h2 id="abstract" class="title is-4 mb-2 mt-5">Abstract</h2>
                <p style="font-size: 0.95em">
                    In this paper, we demonstrate that mobile manipulation policies utilizing a <em>3D latent map</em> achieve stronger spatial and temporal reasoning than policies relying solely on images. We introduce <em>Seeing the Bigger Picture</em> (SBP), an end-to-end policy learning approach that operates directly on a 3D map of latent features. In SBP, the map extends perception beyond the robot's current field of view and aggregates observations over long horizons. Our mapping approach incrementally fuses multiview observations into a grid of scene-specific latent features. A pre-trained, scene-agnostic decoder reconstructs target embeddings from these features and enables online optimization of the map features during task execution. A policy, trainable with behavior cloning or reinforcement learning, treats the latent map as a state variable and uses global context from the map obtained via a 3D feature aggregator. We evaluate SBP on scene-level mobile manipulation and sequential tabletop manipulation tasks. Our experiments demonstrate that SBP (i) reasons globally over the scene, (ii) leverages the map as long-horizon memory, and (iii) outperforms image-based policies in both in-distribution and novel scenes, e.g., improving the success rate by 15% for the sequential manipulation task.
                </p>

                <hr class="divider">

                <h2 class="title is-3">How it Works</h2>
                <p style="font-size: 0.95em">
                    <b>Latent Mapping.</b> We represent the robot's workspace as learnable latent vectors anchored at the vertices of a 3D regular grid that is trained to reconstruct the VLM embeddings (e.g., DINOv2).
                </p>
                <div class="video-card mt-3 mb-4" style="max-width: 70%; margin-left: auto; margin-right: auto;">
                    <img src="./static/images/mapping.png" alt="Latent Mapping">
                </div>
                <p style="font-size: 0.95em">
                    <b>Map-conditioned Policy.</b> We aggregate spatially distributed map features into a compact global token using a 3D feature aggregator and use it as an additional state input to policy networks.
                </p>
                <div class="video-card mt-3 mb-4" style="max-width: 70%; margin-left: auto; margin-right: auto;">
                    <img src="./static/images/map_token.png" alt="Map-conditioned Policy">
                </div>

                <hr class="divider">

                <h2 class="title is-3">Spatial Reasoning</h2>
                <p style="font-size: 0.95em">
                    The 3D latent map acts as <em>spatial memory</em>, offering global visibility of object locations and task goals while mitigating occlusions from the current field of view. In mobile manipulation tasks, the target object is often completely outside the robot's initial field of view. Image-based policies fail to localize the object in these settings, producing erratic and inefficient trajectories. In contrast, the map-conditioned policy leverages the latent map to reason globally over the scene, navigating directly toward the target object and completing the task efficiently.
                </p>
                <!-- Spatial-reasoning clips (mobile1.mp4, mobile2.mp4); each autoplays muted, loops,
                     plays inline, and keeps the native controls visible. -->
                <div class="video-card mt-3 mb-4" style="max-width: 80%; margin-left: auto; margin-right: auto;">
                    <video autoplay controls loop muted playsinline>
                        <source src="./static/images/mobile1.mp4" type="video/mp4">
                    </video>
                </div>
                <div class="video-card mb-4" style="max-width: 80%; margin-left: auto; margin-right: auto;">
                    <video autoplay controls loop muted playsinline>
                        <source src="./static/images/mobile2.mp4" type="video/mp4">
                    </video>
                </div>

                <hr class="divider">

                <h2 class="title is-3">Temporal Reasoning</h2>
                <p style="font-size: 0.95em">
                    The 3D latent map also serves as <em>long-term context</em>, enabling the policy to reason beyond short observation windows. In sequential pick-and-place tasks, the robot must pick objects from a cluttered tabletop and place them in a basket in a prescribed order, relying solely on an egocentric camera with limited visibility. With online latent map updates, the map captures temporal changes in the environment, allowing the policy to track the task state and locate objects even after they leave the egocentric view.                </p>
                <!-- Temporal-reasoning clips (seq1.mp4, seq2.mp4); each autoplays muted, loops,
                     plays inline, and keeps the native controls visible. -->
                <div class="video-card mt-3 mb-4" style="max-width: 80%; margin-left: auto; margin-right: auto;">
                    <video autoplay controls loop muted playsinline>
                        <source src="./static/images/seq1.mp4" type="video/mp4">
                    </video>
                </div>
                <div class="video-card mb-4" style="max-width: 80%; margin-left: auto; margin-right: auto;">
                    <video autoplay controls loop muted playsinline>
                        <source src="./static/images/seq2.mp4" type="video/mp4">
                    </video>
                </div>

                <hr class="divider">

                <h2 class="title is-3">Zero-Shot Sim-to-Real Deployment</h2>
                <!-- Sim-to-real clip (video_fast.mp4); autoplays muted, loops, plays inline,
                     and keeps the native controls visible. -->
                <div class="video-card mt-3" style="max-width: 80%; margin-left: auto; margin-right: auto;">
                    <video autoplay controls loop muted playsinline>
                        <source src="./static/images/video_fast.mp4" type="video/mp4">
                    </video>
                </div>

                <hr class="divider">

                <h2 class="title is-3">Video</h2>
                <!-- Embedded YouTube player; the iframe's title attribute gives the frame an
                     accessible name for assistive technology. -->
                <div class="video-card mt-3" style="max-width: 80%; margin-left: auto; margin-right: auto;">
                    <div class="youtube-wrapper">
                        <iframe src="https://www.youtube.com/embed/wJpM9yQ_Iuw" title="Seeing the Bigger Picture (SBP) video on YouTube" allowfullscreen></iframe>
                    </div>
                </div>

                <hr class="divider">

                <h2 class="title is-4">Acknowledgements</h2>
                <p style="font-size: 0.95em">
                    We gratefully acknowledge support from NSF CCF-2402689 (ExpandAI), ONR N00014-23-1-2353, and the Technology Innovation Program (20018112, Development of autonomous manipulation and gripping technology using imitation learning based on visual and tactile sensing) funded by the Ministry of Trade, Industry &amp; Energy (MOTIE), Korea.
                </p>

                <hr class="divider">

                <!-- Citation section -->
                <h3 id="BibTeX" class="title is-4">Citation</h3>
                <div class="is-relative">
                    <!-- Conference paper, so the entry type is @inproceedings (the type that uses
                         booktitle). The text of this <pre> is what copyCitation() copies. -->
                    <pre class="textarea is-family-code" readonly id="citation-text">@inproceedings{kim2025seeing,
  title={Seeing the Bigger Picture: 3D Latent Mapping for Mobile Manipulation Policy Learning},
  author={Kim, Sunghwan and Chung, Woojeh and Dai, Zhirui and Bhatt, Dwait and Shukla, Arth and Su, Hao and Tian, Yulun and Atanasov, Nikolay},
  booktitle={IEEE International Conference on Robotics and Automation (ICRA)},
  year={2026}
}</pre>
                    <!-- Icon-only button: aria-label provides its accessible name. -->
                    <button type="button" id="citation-copy-button" class="button is-small" onclick="copyCitation()" aria-label="Copy citation to clipboard">
                        <span class="icon">
                            <i class="fas fa-copy"></i>
                        </span>
                    </button>
                    <!-- Confirmation message; copyCitation() removes and re-adds is-hidden. -->
                    <div id="toast" class="notification is-success is-light is-hidden">
                        Copied to clipboard!
                    </div>
                </div>

                <hr class="divider">
                <div class="footer-info has-text-centered">
                    <p class="has-text-weight-light" style="font-size: 0.9em;">
                        Template inspired by <a href="https://cutamp.github.io/">cutamp.github.io</a>
                    </p>
                </div>

            </div>
        </div>
    </section>

    <script>
        // Handle copying citation text.
        // Copies the text of #citation-text to the clipboard and shows the
        // #toast confirmation for 2 seconds. The pending hide timer is kept so
        // that a repeated click restarts the 2-second window instead of letting
        // the earlier click's timer hide the toast early.
        let citationToastTimer;
        const copyCitation = async () => {
            try {
                const citationText = document.getElementById('citation-text').innerText;
                await navigator.clipboard.writeText(citationText);

                const toast = document.getElementById('toast');
                clearTimeout(citationToastTimer); // no-op when no timer is pending
                toast.classList.remove('is-hidden');
                citationToastTimer = setTimeout(() => toast.classList.add('is-hidden'), 2000);
            } catch (err) {
                // Any failure above (lookup, clipboard write) is logged; the toast is not shown.
                console.error('Failed to copy citation:', err);
            }
        };

        // Apply a theme: write data-theme on <html> and swap the toggle icon
        // (sun while dark is active, moon while light is active).
        const updateTheme = (isDark) => {
            const [themeName, iconClass] = isDark ? ['dark', 'fa-sun'] : ['light', 'fa-moon'];

            document.documentElement.setAttribute('data-theme', themeName);

            const { classList: iconClasses } = document.getElementById('theme-icon');
            iconClasses.remove('fa-moon', 'fa-sun');
            iconClasses.add(iconClass);
        };

        // Flip the theme: anything other than an active 'dark' theme switches to dark.
        const toggleDarkMode = () => {
            const activeTheme = document.documentElement.getAttribute('data-theme');
            updateTheme(activeTheme !== 'dark');
        };

        // Follow the OS colour-scheme preference whenever it changes.
        {
            const systemDarkQuery = window.matchMedia('(prefers-color-scheme: dark)');
            systemDarkQuery.addEventListener('change', (event) => {
                updateTheme(event.matches);
            });
        }

        // Set initial icon state.
        // data-theme is read back from <html> and passed through updateTheme so
        // the icon class logic lives in one place instead of being repeated
        // here; the attribute is rewritten with the value it already has.
        document.addEventListener('DOMContentLoaded', () => {
            const isDark = document.documentElement.getAttribute('data-theme') === 'dark';
            updateTheme(isDark);
        });
    </script>
</body>
</html>