abstract = {We introduce SPFSplat, an efficient framework for 3D Gaussian splatting from sparse multi-view images, requiring no ground-truth poses during training or inference. It employs a shared feature extraction backbone, enabling simultaneous prediction of 3D Gaussian primitives and camera poses in a canonical space from unposed inputs within a single feed-forward step. Alongside the rendering loss based on estimated novel-view poses, a reprojection loss is integrated to enforce the learning of pixel-aligned Gaussian primitives for enhanced geometric constraints. This pose-free training paradigm and efficient one-step feed-forward design make SPFSplat well-suited for practical applications. Remarkably, despite the absence of pose supervision, SPFSplat achieves state-of-the-art performance in novel view synthesis even under significant viewpoint changes and limited image overlap. It also surpasses recent methods trained with geometry priors in relative pose estimation.},
}
@article{jing2025stereo,
title = {Stereo Any Video: Temporally Consistent Stereo Matching},
abstract = {The rise of vision-language foundation models marks an advancement in bridging the gap between human and machine capabilities in 3D scene reasoning. Existing 3D reasoning benchmarks assume real-time scene accessibility, which is impractical due to the high cost of frequent scene updates. To this end, we introduce Hypothetical 3D Reasoning, namely Hypo3D, a benchmark designed to evaluate models' ability to reason without access to real-time scene data. Models need to imagine the scene state based on a provided change description before reasoning. Hypo3D is formulated as a 3D Visual Question Answering (VQA) benchmark, comprising 7,727 context changes across 700 indoor scenes, resulting in 14,885 question-answer pairs. An anchor-based world frame is established for all scenes, ensuring consistent reference to a global frame for directional terms in context changes and QAs. Extensive experiments show that state-of-the-art foundation models struggle to reason in hypothetically changed scenes. This reveals a substantial performance gap compared to humans, particularly in scenarios involving movement changes and directional reasoning. Even when the context change is irrelevant to the question, models often incorrectly adjust their answers.},
journal = {International Conference on Machine Learning},