diff --git a/cosmos_framework/__init__.py b/cosmos_framework/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/__init__.py
+++ b/cosmos_framework/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/auxiliary/guardrail/common/presets.py b/cosmos_framework/auxiliary/guardrail/common/presets.py
index d320b5e..8536685 100644
--- a/cosmos_framework/auxiliary/guardrail/common/presets.py
+++ b/cosmos_framework/auxiliary/guardrail/common/presets.py
@@ -7,9 +7,6 @@
 from cosmos_framework.auxiliary.guardrail.common.core import GuardrailRunner
 from cosmos_framework.auxiliary.guardrail.face_blur_filter.face_blur_filter import RetinaFaceFilter
 from cosmos_framework.auxiliary.guardrail.qwen3guard.qwen3guard import Qwen3Guard
-from cosmos_framework.auxiliary.guardrail.video_content_safety_filter.video_content_safety_filter import (
-    VideoContentSafetyFilter,
-)
 from cosmos_framework.utils import log
 
 
@@ -27,7 +24,8 @@ def create_video_guardrail_runner(offload_model_to_cpu: bool = False) -> Guardra
     """Create the video guardrail runner."""
     return GuardrailRunner(
         safety_models=[
-            # VideoContentSafetyFilter(offload_model_to_cpu=offload_model_to_cpu), # Too many false positives
+            # VideoContentSafetyFilter(offload_model_to_cpu=offload_model_to_cpu)
+            # Disabled: too many false positives. Re-enable once the filter is fixed.
         ],
         postprocessors=[RetinaFaceFilter(offload_model_to_cpu=offload_model_to_cpu)],
     )
diff --git a/cosmos_framework/auxiliary/guardrail/face_blur_filter/retinaface_utils.py b/cosmos_framework/auxiliary/guardrail/face_blur_filter/retinaface_utils.py
index cffebc2..805ecd5 100644
--- a/cosmos_framework/auxiliary/guardrail/face_blur_filter/retinaface_utils.py
+++ b/cosmos_framework/auxiliary/guardrail/face_blur_filter/retinaface_utils.py
@@ -1,4 +1,3 @@
-# Copyright (c) 2019
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
diff --git a/cosmos_framework/auxiliary/guardrail/qwen3guard/__init__.py b/cosmos_framework/auxiliary/guardrail/qwen3guard/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/auxiliary/guardrail/qwen3guard/__init__.py
+++ b/cosmos_framework/auxiliary/guardrail/qwen3guard/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/callbacks/compile_tokenizer.py b/cosmos_framework/callbacks/compile_tokenizer.py
index 84efa16..3ee15d2 100644
--- a/cosmos_framework/callbacks/compile_tokenizer.py
+++ b/cosmos_framework/callbacks/compile_tokenizer.py
@@ -4,7 +4,7 @@
 """Training callback that defers AOT compilation of the VAE tokenizer.
 
 The actual compilation logic lives in
-:meth:`~projects.cosmos3.vfm.tokenizers.wan2pt2_vae_4x16x16.Wan2pt2VAEInterface.compile_encode`.
+:meth:`~cosmos_framework.model.vfm.tokenizers.wan2pt2_vae_4x16x16.Wan2pt2VAEInterface.compile_encode`.
 This module provides a :class:`CompileTokenizer` callback that invokes it
 at the right point during training (after ``compile_after_iterations``
 steps, to avoid NCCL timeouts during CUDA/cuDNN warm-up).
@@ -21,6 +21,7 @@
 """
 
 from collections.abc import Sequence
+from typing import Literal
 
 import torch
 
@@ -43,6 +44,10 @@ def __init__(
         enabled: bool = False,
         compile_after_iterations: int = 3,
         warmup_resolutions: Sequence[str] | None = None,
+        backend: Literal["cudagraphs", "inductor"] = "inductor",
+        mode: Literal["reduce-overhead", "max-autotune"] | None = "reduce-overhead",
+        fullgraph: bool = False,
+        dynamic: bool = False,
     ):
         """
         Args:
@@ -60,6 +65,10 @@ def __init__(
         self.compile_after_iterations: int = compile_after_iterations
         self.skip_counter: int = 0
         self.warmup_resolutions: Sequence[str] | None = warmup_resolutions
+        self.backend: Literal["cudagraphs", "inductor"] = backend
+        self.mode: Literal["reduce-overhead", "max-autotune"] | None = mode
+        self.fullgraph: bool = fullgraph
+        self.dynamic: bool = dynamic
 
         if self.enabled:
             if self.warmup_resolutions is None:
@@ -101,6 +110,10 @@ def on_training_step_start(
                 tokenizer.compile_encode(
                     self.warmup_resolutions,
                     output_dir=self.config.job.path_local,
+                    backend=self.backend,
+                    mode=self.mode,
+                    fullgraph=self.fullgraph,
+                    dynamic=self.dynamic,
                 )
 
         self.skip_counter += 1
diff --git a/cosmos_framework/callbacks/data_stats.py b/cosmos_framework/callbacks/data_stats.py
index 20e3161..2914981 100644
--- a/cosmos_framework/callbacks/data_stats.py
+++ b/cosmos_framework/callbacks/data_stats.py
@@ -51,7 +51,6 @@ def on_training_step_end(
 
         # Handle case where dataset_name gets batched into a list
         if isinstance(dataset_name, list):
-
             assert len(dataset_name) == 1, "dataset_name should be a list of 1"
             dataset_name = dataset_name[0]
 
diff --git a/cosmos_framework/callbacks/dataloader_state.py b/cosmos_framework/callbacks/dataloader_state.py
index ec20eea..bee9e43 100644
--- a/cosmos_framework/callbacks/dataloader_state.py
+++ b/cosmos_framework/callbacks/dataloader_state.py
@@ -26,18 +26,14 @@ class DataLoaderStateCallback(Callback):
     def __init__(
         self,
         distributor_type: str | None = None,
-        name: str = "",
     ) -> None:
         super().__init__()
         self.distributor_type = distributor_type
-        self.name = name
         self.config: Any = None
         self.state: dict[int, NoReplaceShardlistState] = {}
         self.verbose = True
 
     def _update_state_from_batch(self, data_batch: dict[str, torch.Tensor]) -> None:
-        if "sample_worker_id" not in data_batch:
-            return  # batch has no position metadata (shuffle=False or iterable data_source)
         worker_ids = data_batch["sample_worker_id"].tolist()  # [B]
         epochs = data_batch["sample_epoch"].tolist()  # [B]
         indices = data_batch["sample_index"].tolist()  # [B]
@@ -50,8 +46,6 @@ def _update_state_from_batch(self, data_batch: dict[str, torch.Tensor]) -> None:
             ):
                 self.state[worker_id] = NoReplaceShardlistState(epoch=epoch, index=index)
 
-    _ACTIVE_DISTRIBUTOR_TYPES = ("no_replace",)
-
     def on_training_step_batch_end(
         self,
         model: ImaginaireModel,
@@ -60,7 +54,7 @@ def on_training_step_batch_end(
         loss: torch.Tensor,
         iteration: int = 0,
     ) -> None:
-        if self.distributor_type in self._ACTIVE_DISTRIBUTOR_TYPES:
+        if self.distributor_type == "no_replace":
             self._update_state_from_batch(data_batch)
 
     def on_training_step_end(
@@ -71,7 +65,7 @@ def on_training_step_end(
         loss: torch.Tensor,
         iteration: int = 0,
     ) -> None:
-        if self.distributor_type in self._ACTIVE_DISTRIBUTOR_TYPES:
+        if self.distributor_type == "no_replace":
             if self.verbose:
                 if iteration % self.config.trainer.logging_iter == 0:
                     msg = "\n"
@@ -80,10 +74,10 @@ def on_training_step_end(
                     log.info(msg)
 
     def has_checkpoint_state(self) -> bool:
-        return self.distributor_type in self._ACTIVE_DISTRIBUTOR_TYPES
+        return self.distributor_type == "no_replace"
 
     def state_dict(self) -> dict[int, dict[str, int]]:
-        if self.distributor_type not in self._ACTIVE_DISTRIBUTOR_TYPES:
+        if self.distributor_type != "no_replace":
             return {}
 
         state_dict: dict[int, dict[str, int]] = {}
@@ -96,7 +90,7 @@ def state_dict(self) -> dict[int, dict[str, int]]:
         return state_dict
 
     def load_state_dict(self, state_dict: dict[int, dict[str, int]]) -> None:
-        if self.distributor_type not in self._ACTIVE_DISTRIBUTOR_TYPES:
+        if self.distributor_type != "no_replace":
             return
 
         if not state_dict:
@@ -110,4 +104,4 @@ def load_state_dict(self, state_dict: dict[int, dict[str, int]]) -> None:
             self.state[worker_id] = NoReplaceShardlistState(epoch=epoch, index=index)
             os.environ[f"NSL_STATE_WORKER_{worker_id}_EPOCH"] = str(epoch)
             os.environ[f"NSL_STATE_WORKER_{worker_id}_INDEX"] = str(index)
-            log.info(f"Loaded no_replace dataloader state for worker {worker_id}: epoch={epoch}, index={index}")
+            log.info(f"Loaded no replace dataloader state for worker {worker_id}: epoch={epoch}, index={index}")
diff --git a/cosmos_framework/callbacks/every_n_draw_sample.py b/cosmos_framework/callbacks/every_n_draw_sample.py
index baf1ffc..9aa96fa 100644
--- a/cosmos_framework/callbacks/every_n_draw_sample.py
+++ b/cosmos_framework/callbacks/every_n_draw_sample.py
@@ -154,8 +154,6 @@ def x0_pred(self, trainer, model, data_batch, output_batch, loss, iteration):
         tag = "ema" if self.is_ema else "reg"
 
         log.debug("starting data and condition model", rank0_only=False)
-
-
         data_clean = model.get_data_and_condition(data_batch)
         raw_data = data_clean.raw_state_vision
         x0 = data_clean.x0_tokens_vision
@@ -185,7 +183,6 @@ def x0_pred(self, trainer, model, data_batch, output_batch, loss, iteration):
             log.debug(f"done denoising {sigma}", rank0_only=False)
             mse_loss = distributed.dist_reduce_tensor(F.mse_loss(sample, x0))
             mse_loss_list.append(mse_loss)
-
             if hasattr(model, "decode"):
                 sample = model.decode(sample)
             to_show.append(sample.float().cpu())
@@ -316,7 +313,6 @@ def sample(self, trainer, model, data_batch, output_batch, loss, iteration):
             for sample_idx in range(data_clean.batch_size):
                 n_vis = num_items[sample_idx]
                 # First item(s) are condition, last item is generation target
-
                 # but we need to support multiple conditions per sample in the future. Current code
                 # can handle this without throwing an error.
                 condition_images.append(raw_data[vis_offset])  # source image (1, C, 1, H, W)
diff --git a/cosmos_framework/callbacks/grad_clip.py b/cosmos_framework/callbacks/grad_clip.py
index 151c1bc..f3cb4fa 100644
--- a/cosmos_framework/callbacks/grad_clip.py
+++ b/cosmos_framework/callbacks/grad_clip.py
@@ -132,7 +132,7 @@ def _clip_grad(
         # `torch.distributed._tensor.ops.math_ops._NormPartial`.
         # We can simply reduce the DTensor to get the total norm in this
         # tensor's process group and then convert it to a local tensor.
-
+        # NOTE: This reduction serves two purposes:
         # 1. to make sure the total norm is computed correctly when PP is used (see below)
         # 2. to return a reduced mesh_norm tensor whose .item() would return the correct value
         if isinstance(mesh_norm, DTensor):
diff --git a/cosmos_framework/callbacks/hf_export.py b/cosmos_framework/callbacks/hf_export.py
index 8bcba5a..6d23568 100644
--- a/cosmos_framework/callbacks/hf_export.py
+++ b/cosmos_framework/callbacks/hf_export.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """HFExportCallback: export VLM DCP checkpoints to HuggingFace safetensors format.
 
 Design notes
@@ -137,11 +138,11 @@ def on_save_checkpoint(self, model: Any, state_dict: dict[str, Any]) -> None:
         if not isinstance(model, VLMModel):
             # The legacy vlm/train.py path passes model_parts: list[nn.Module] (raw HF
             # models without the VLMModel attribute structure).  HF export requires the
-            # VLMModel wrapper, which is only available via the unified cosmos_framework/scripts/train.py path.
+            # VLMModel wrapper, which is only available via the unified scripts/train.py path.
             if isinstance(model, list):
                 log.warning(
                     "[HFExportCallback] Received model_parts (list) instead of VLMModel. "
-                    "HF export requires the unified training path (cosmos_framework/scripts/train.py). Skipping."
+                    "HF export requires the unified training path (scripts/train.py). Skipping."
                 )
             else:
                 log.warning(
diff --git a/cosmos_framework/callbacks/mfu.py b/cosmos_framework/callbacks/mfu.py
index 3035437..4a6c792 100644
--- a/cosmos_framework/callbacks/mfu.py
+++ b/cosmos_framework/callbacks/mfu.py
@@ -138,7 +138,6 @@ def _ensure_initialised(self, model: ImaginaireModel) -> None:
         ac_cfg = getattr(model_cfg, "activation_checkpointing", None)
         ac_mode = getattr(ac_cfg, "mode", "none")
 
-
         # Some activations don't need to be recomputed under selective AC, so
         # we need to remove them from the FLOP computation.
         self._use_activation_checkpointing = ac_mode != "none"
diff --git a/cosmos_framework/callbacks/wandb_log_eval.py b/cosmos_framework/callbacks/wandb_log_eval.py
index ac6911f..abea93f 100644
--- a/cosmos_framework/callbacks/wandb_log_eval.py
+++ b/cosmos_framework/callbacks/wandb_log_eval.py
@@ -88,7 +88,6 @@ def on_validation_step_end(
 
         # Handle case where dataset_name gets batched into a list
         if isinstance(dataset_name, list):
-
             assert len(dataset_name) == 1, "dataset_name should be a list of 1"
             dataset_name = dataset_name[0]
 
diff --git a/cosmos_framework/checkpoint/dcp.py b/cosmos_framework/checkpoint/dcp.py
index 4e036c9..7318e4a 100644
--- a/cosmos_framework/checkpoint/dcp.py
+++ b/cosmos_framework/checkpoint/dcp.py
@@ -63,6 +63,7 @@
     set_model_state_dict,
 )
 from torch.distributed.checkpoint.stateful import Stateful
+from torch.nn.modules.module import _IncompatibleKeys
 
 from cosmos_framework.checkpoint.base import AbstractCheckpointer
 from cosmos_framework.checkpoint.s3_filesystem import S3StorageReader, S3StorageWriter
@@ -85,11 +86,11 @@ def __init__(self, model: nn.Module) -> None:
     def state_dict(self) -> dict[str, Any]:
         return get_model_state_dict(self.model)
 
-    def load_state_dict(self, state_dict: dict[str, Any]) -> None:
-        set_model_state_dict(
+    def load_state_dict(self, state_dict: dict[str, Any]) -> _IncompatibleKeys:
+        return set_model_state_dict(
             self.model,
             model_state_dict=state_dict,
-            options=StateDictOptions(strict=True),
+            options=StateDictOptions(strict=False),
         )
 
 
@@ -539,28 +540,13 @@ def load(
                                     "Ensure the model has net_ema submodule."
                                 )
                                 _state_dict[sd_key] = _state_dict[key_ema]
-                    elif warm_start and any(str(s).startswith("net_ema") for s in self.keys_to_skip_loading):
-                        # Only when net_ema.* is explicitly skipped on load (e.g. an HF->DCP
-                        # init from convert_model_to_dcp that has only net.*): the skipped
-                        # net_ema.* keep build_net() construction values (random init when
-                        # vlm_config.pretrained_weights.enabled=False), which would seed EMA
-                        # from random weights -> copy net.* -> net_ema.* so EMA starts from the
-                        # freshly-loaded init. When net_ema.* IS loaded (e.g. a training DCP
-                        # that carries a trained EMA), do NOT clobber it.
-                        log.info("Warm start: net_ema. skipped on load -> resetting net_ema = net.")
-                        for sd_key in list(_state_dict.keys()):
-                            if sd_key.startswith("net."):
-                                key_ema = "net_ema." + sd_key.removeprefix("net.")
-                                if key_ema in _state_dict:
-                                    _state_dict[key_ema] = _state_dict[sd_key]
                     results = _model_wrapper.load_state_dict(_state_dict)
-                    if results is not None:
-                        if len(results.missing_keys) > 0:
-                            raise ValueError(f"Missing keys (not found in checkpoint): {results.missing_keys}")
-                        if len(results.unexpected_keys) > 0:
-                            raise ValueError(
-                                f"Unexpected keys (found in checkpoint but not in model): {results.unexpected_keys}"
-                            )
+                    if len(results.missing_keys) > 0:
+                        raise ValueError(f"Missing keys (not found in checkpoint): {results.missing_keys}")
+                    if len(results.unexpected_keys) > 0:
+                        raise ValueError(
+                            f"Unexpected keys (found in checkpoint but not in model): {results.unexpected_keys}"
+                        )
 
                 elif key == "optim":
                     log.info("- Loading the optimizer...")
diff --git a/cosmos_framework/checkpoint/s3_filesystem.py b/cosmos_framework/checkpoint/s3_filesystem.py
index e47219e..029570e 100644
--- a/cosmos_framework/checkpoint/s3_filesystem.py
+++ b/cosmos_framework/checkpoint/s3_filesystem.py
@@ -3,32 +3,89 @@
 
 import io
 import os
+import threading
 import time
 from contextlib import contextmanager
 from typing import Generator, Union
 from urllib.parse import urlparse
 
+import boto3
+from botocore.config import Config as S3Config
 from botocore.exceptions import ClientError
 from torch.distributed.checkpoint import FileSystemReader, FileSystemWriter
 from torch.distributed.checkpoint.filesystem import FileSystemBase
 
 from cosmos_framework.utils import log
 from cosmos_framework.utils.easy_io import easy_io
+from cosmos_framework.utils.easy_io.backends import auto_auth
 
 
-class S3Stream(io.BytesIO):
+class _CancellableReader:
+    """Pipe-reader wrapper whose ``read`` raises once a cancel event is set.
+
+    Lets us abort an in-flight ``client.upload_fileobj`` on producer error: a
+    read exception makes boto3 abort the multipart upload, whereas just
+    closing the pipe writer would signal EOF and finalize a truncated file.
     """
-    Workaround for PyTorch manually closing the stream before we can upload it to S3. We override the close() as noop
-    and instead call our own _true_close() method to close the stream after we are done using it.
-    The commit at fault is https://github.com/pytorch/pytorch/commit/9c909bf3bb122db2cce95e2eb7459bbe50dfa15a
+
+    def __init__(self, f, cancel_event: threading.Event) -> None:
+        self._f = f
+        self._cancel = cancel_event
+
+    def read(self, n: int = -1) -> bytes:
+        if self._cancel.is_set():
+            raise IOError("S3 upload cancelled by caller")
+        return self._f.read(n)
+
+    def readable(self) -> bool:
+        return True
+
+    def close(self) -> None:
+        self._f.close()
+
+
+class _CountingPipeWriter(io.RawIOBase):
+    """Write-only pipe wrapper that fakes ``tell()`` by counting bytes written.
+
+    DCP calls ``stream.tell()`` to record per-tensor byte offsets in the
+    checkpoint metadata, but kernel pipes aren't seekable. We maintain the
+    byte count ourselves; nothing actually seeks.
     """
 
-    def close(self):
-        self.flush()
-        # No close
+    def __init__(self, write_file) -> None:
+        super().__init__()
+        self._f = write_file
+        self._pos = 0
+
+    def write(self, b) -> int:
+        n = self._f.write(b)
+        if n is None:
+            raise OSError("_CountingPipeWriter: underlying pipe write returned None; expected a blocking write.")
+        self._pos += n
+        return n
+
+    def writable(self) -> bool:
+        return True
+
+    def seekable(self) -> bool:
+        return False  # pipes can't seek; consumers (zipfile, etc.) check this
+
+    def tell(self) -> int:
+        return self._pos
 
-    def _true_close(self):
-        super().close()
+    def fileno(self) -> int:
+        return self._f.fileno()
+
+    def flush(self) -> None:
+        self._f.flush()
+
+    def close(self) -> None:
+        if self.closed:
+            return
+        try:
+            super().close()  # invokes self.flush(), then sets self.closed = True
+        finally:
+            self._f.close()
 
 
 class S3FileSystem(FileSystemBase):
@@ -69,6 +126,33 @@ def __init__(
         if enable_gcs_patch_in_boto3:
             log.info("enable_gcs_patch_in_boto3: True")
 
+        # Direct boto3 client for streaming-multipart uploads (``upload_fileobj``
+        # via boto3's TransferManager). We can't reuse ``self.easy_io_backend``'s
+        # client: easy_io abstracts the transport (could be ``Boto3Backend`` or
+        # ``MSCBackend``) and intentionally doesn't expose a raw boto3 client.
+        # Built lazily so read-only callers don't pay for it.
+        self._credential_path = credential_path
+        self._boto3_client = None
+
+    def _get_boto3_client(self):
+        """Lazily build a boto3 S3 client configured for our endpoint.
+
+        Config mirrors cosmos_framework/utils/easy_io/backends/boto3_client.py:289 to
+        preserve GCS-via-S3 signature/checksum compatibility.
+        """
+        if self._boto3_client is None:
+            with auto_auth.open_auth(self._credential_path, "r") as f:
+                cred_info = auto_auth.json_load_auth(f)
+            cfg = S3Config(
+                signature_version="s3v4",
+                s3={"addressing_style": "virtual"},
+                response_checksum_validation="when_required",
+                request_checksum_calculation="when_required",
+                retries={"max_attempts": 5, "mode": "adaptive"},
+            )
+            self._boto3_client = boto3.client("s3", **cred_info, config=cfg)
+        return self._boto3_client
+
     def _retry_with_backoff(self, operation_func, *args, **kwargs):
         """
         Execute an operation with exponential backoff retry logic.
@@ -135,24 +219,61 @@ def download_operation():
 
                 log.info(f"S3 Filesystem: Downloading {key} from bucket {bucket}", rank0_only=False)
                 self._retry_with_backoff(download_operation)
-                log.info("S3 Filesystem: Download complete", rank0_only=False)
+                log.info(f"S3 Filesystem: Download complete for {key} in bucket {bucket}", rank0_only=False)
                 yield stream
             finally:
                 stream.close()
         elif mode == "wb":
-            stream = S3Stream()
+            # Streaming multipart upload: yield the writer end of a pipe to DCP
+            # and drain the reader end via ``client.upload_fileobj`` in a
+            # background thread. Peak memory is bounded by boto3's TransferConfig
+            # (~80 MiB) regardless of file size; the pipe (~64 KiB) provides
+            # backpressure. See ``_CancellableReader`` for how producer-side
+            # errors abort the multipart upload.
+            client = self._get_boto3_client()
+            r_fd, w_fd = os.pipe()
+            read_file = os.fdopen(r_fd, "rb")
+            write_file = os.fdopen(w_fd, "wb")
+            counting_writer = _CountingPipeWriter(write_file)
+            upload_err: list = [None]
+            cancel_event = threading.Event()
+
+            def _upload_thread():
+                try:
+                    client.upload_fileobj(
+                        _CancellableReader(read_file, cancel_event),
+                        Bucket=bucket,
+                        Key=key,
+                    )
+                except Exception as e:  # noqa: BLE001 — capture and re-raise on main thread
+                    upload_err[0] = e
+                finally:
+                    try:
+                        read_file.close()
+                    except Exception:
+                        pass
+
+            log.info(f"S3 Filesystem: Streaming upload {key} to bucket {bucket}", rank0_only=False)
+            uploader = threading.Thread(target=_upload_thread, daemon=True, name=f"s3-upload-{key[-32:]}")
+            uploader.start()
+
+            caller_raised = False
             try:
-                yield stream
-
-                def upload_operation():
-                    stream.seek(0)
-                    self.easy_io_backend.put(obj=stream, filepath=path_str)
-
-                log.info(f"S3 Filesystem: Uploading {key} to bucket {bucket}", rank0_only=False)
-                self._retry_with_backoff(upload_operation)
-                log.info("S3 Filesystem: Upload complete", rank0_only=False)
+                yield counting_writer
+            except Exception:
+                caller_raised = True
+                cancel_event.set()
+                raise
             finally:
-                stream._true_close()
+                try:
+                    counting_writer.close()  # closes the pipe write end → EOF for the reader
+                except Exception:
+                    pass
+                uploader.join()
+                if upload_err[0] is not None and not caller_raised:
+                    # Upload thread failed; surface that to the caller.
+                    raise upload_err[0]
+            log.info(f"S3 Filesystem: Upload complete for {key}", rank0_only=False)
         else:
             raise ValueError(f"Unsupported mode: {mode}")
 
@@ -285,7 +406,7 @@ def __init__(
         """
         super().__init__(
             path=path,
-            sync_files=False,
+            sync_files=False,  # FIXME: setting this to True makes the run fail (L#333: `os.fsync(stream.fileno())`)
             **kwargs,
         )
         self.fs = S3FileSystem(credential_path, enable_gcs_patch_in_boto3=enable_gcs_patch_in_boto3)  # type: ignore
diff --git a/cosmos_framework/configs/base/__init__.py b/cosmos_framework/configs/base/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/__init__.py
+++ b/cosmos_framework/configs/base/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/base_config_test.py b/cosmos_framework/configs/base/base_config_test.py
deleted file mode 100644
index 3eb2b0d..0000000
--- a/cosmos_framework/configs/base/base_config_test.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-"""
-This file is used to test the config of the cosmos3 vfm project.
-It is used to verify the config is loadable.
-
-To run the test, you can use the following command:
-pytest -s cosmos_framework/configs/base/base_config_test.py
-"""
-
-import importlib
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from cosmos_framework.utils.config_helper import get_config_module, override
-
-
-@pytest.mark.L0
-@pytest.mark.parametrize(
-    "experiment_name",
-    [
-        "vision_sft_nano",
-    ],
-)
-def test_config_init_experiment_mot(experiment_name, monkeypatch):
-    """
-    Parameterized test to verify config initialization for multiple experiments.
-    PYTHONPATH=. torchrun --nproc_per_node=8 -m pytest -s cosmos_framework/configs/base/config_test_mot.py --L1
-    """
-    # The SFT experiments interpolate the dataset location from ${oc.env:DATASET_PATH};
-    # config construction only needs the variable defined, not a real dataset on disk.
-    monkeypatch.setenv("DATASET_PATH", "/tmp/dataset")
-    config_file = "cosmos_framework/configs/base/config.py"
-    config_module = get_config_module(config_file)
-    config = importlib.import_module(config_module).make_config()
-    config = override(
-        config,
-        [
-            "--",
-            f"experiment={experiment_name}",
-        ],
-    )
-
-
-def _make_self_mock(*, pretrained_enabled: bool, load_weights_from_pretrained: bool) -> MagicMock:
-    """Mock the OmniMoTModel attributes that load_pretrained_model_if_needed reads."""
-    self_mock = MagicMock()
-    self_mock.vlm_config.pretrained_weights.enabled = pretrained_enabled
-    self_mock.config.diffusion_expert_config.load_weights_from_pretrained = load_weights_from_pretrained
-    self_mock.config.ema.enabled = False
-    return self_mock
-
-
-@pytest.mark.L0
-class TestLoadPretrainedGate:
-    """Decision matrix for ``OmniMoTModel.load_pretrained_model_if_needed``.
-
-    Replaces the previous ``OmniMoTModelConfig.validate`` tests now that
-    LoadPretrained callback probes ``latest_checkpoint.txt`` / ``load_path`` at
-    ``on_train_start`` and forwards the two booleans, instead of mutating the
-    config during validation.
-    """
-
-    _LOADER_TARGET = "cosmos_framework.model.vfm.omni_mot_model.load_language_model_safetensors"
-
-    def _call(self, self_mock: MagicMock, *, has_resumable_checkpoint: bool, has_load_path: bool) -> MagicMock:
-        from cosmos_framework.model.vfm.omni_mot_model import OmniMoTModel
-
-        with patch(self._LOADER_TARGET) as loader:
-            OmniMoTModel.load_pretrained_model_if_needed(
-                self_mock,
-                has_resumable_checkpoint=has_resumable_checkpoint,
-                has_load_path=has_load_path,
-            )
-        return loader
-
-    def test_fresh_init_loads_and_copies(self):
-        """No checkpoint, no load_path: HF load AND understanding→generation copy."""
-        self_mock = _make_self_mock(pretrained_enabled=True, load_weights_from_pretrained=True)
-        loader = self._call(self_mock, has_resumable_checkpoint=False, has_load_path=False)
-        loader.assert_called_once()
-        self_mock.net.language_model.init_moe.assert_called_once()
-
-    def test_resume_skips_everything(self):
-        """Resumable checkpoint exists: neither HF load nor copy."""
-        self_mock = _make_self_mock(pretrained_enabled=True, load_weights_from_pretrained=True)
-        loader = self._call(self_mock, has_resumable_checkpoint=True, has_load_path=False)
-        loader.assert_not_called()
-        self_mock.net.language_model.init_moe.assert_not_called()
-
-    def test_warm_start_loads_but_skips_copy(self):
-        """load_path set, no checkpoint: HF load but skip understanding→generation copy."""
-        self_mock = _make_self_mock(pretrained_enabled=True, load_weights_from_pretrained=True)
-        loader = self._call(self_mock, has_resumable_checkpoint=False, has_load_path=True)
-        loader.assert_called_once()
-        self_mock.net.language_model.init_moe.assert_not_called()
-
-    def test_pretrained_disabled_short_circuits(self):
-        """pretrained_weights.enabled=False: early return regardless of other flags."""
-        self_mock = _make_self_mock(pretrained_enabled=False, load_weights_from_pretrained=True)
-        loader = self._call(self_mock, has_resumable_checkpoint=False, has_load_path=False)
-        loader.assert_not_called()
-        self_mock.net.language_model.init_moe.assert_not_called()
diff --git a/cosmos_framework/configs/base/config.py b/cosmos_framework/configs/base/config.py
index d1f975d..e766c5c 100644
--- a/cosmos_framework/configs/base/config.py
+++ b/cosmos_framework/configs/base/config.py
@@ -38,7 +38,6 @@ class Config(config.Config):
             {"ema": "power"},
             {"tokenizer": "wan2pt2_tokenizer"},
             {"sound_tokenizer": None},  # Optional: for audio-video generation
-            {"cluster": "default"},
             {"vlm_config": None},
             {"ckpt_type": "dcp"},
             {"experiment": None},
@@ -72,7 +71,6 @@ def make_config() -> Config:
 
     from cosmos_framework.configs.base.defaults.callbacks import register_callbacks
     from cosmos_framework.configs.base.defaults.checkpointer import register_checkpoint, register_ckpt_type
-    from cosmos_framework.configs.base.defaults.cluster import register_cluster
     from cosmos_framework.configs.base.defaults.ema import register_ema
 
     # from cosmos_framework.configs.base.defaults.data import register_data
@@ -92,7 +90,6 @@ def make_config() -> Config:
     register_tokenizer()
     register_sound_tokenizer()
     register_ema()
-    register_cluster()
     register_vlm()
 
     # Register shipped experiments explicitly. (vision_sft_nano also defines
diff --git a/cosmos_framework/configs/base/defaults/__init__.py b/cosmos_framework/configs/base/defaults/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/defaults/__init__.py
+++ b/cosmos_framework/configs/base/defaults/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/defaults/callbacks.py b/cosmos_framework/configs/base/defaults/callbacks.py
index 46c85ed..805ffb5 100644
--- a/cosmos_framework/configs/base/defaults/callbacks.py
+++ b/cosmos_framework/configs/base/defaults/callbacks.py
@@ -10,7 +10,6 @@
 from cosmos_framework.utils.lazy_config import LazyCall as L
 from cosmos_framework.utils.callback import LowPrecisionCallback, WandBCallback
 from cosmos_framework.callbacks.compile_tokenizer import CompileTokenizer
-
 from cosmos_framework.callbacks.device_monitor import DeviceMonitor
 from cosmos_framework.callbacks.every_n_draw_sample import EveryNDrawSample
 from cosmos_framework.callbacks.expert_heatmap import ExpertHeatmap
diff --git a/cosmos_framework/configs/base/defaults/cluster.py b/cosmos_framework/configs/base/defaults/cluster.py
deleted file mode 100644
index 23b49dd..0000000
--- a/cosmos_framework/configs/base/defaults/cluster.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-import attrs
-from hydra.core.config_store import ConfigStore
-
-
-@attrs.define(slots=False)
-class ClusterConfig:
-    """
-    Config for the cluster specific information.
-    Everything cluster specific should be here.
-    """
-
-    object_store_bucket_data: str
-    object_store_bucket_checkpoint: str
-    object_store_bucket_pretrained: str
-
-    object_store_credential_data: str
-    object_store_credential_checkpoint: str
-    object_store_credential_pretrained: str
-
-
-DefaultClusterConfig: ClusterConfig = ClusterConfig(
-    object_store_bucket_data="",
-    object_store_bucket_checkpoint="bucket-checkpoint",
-    object_store_bucket_pretrained="bucket-pretrained",
-    object_store_credential_data="credentials/data.secret",
-    object_store_credential_checkpoint="credentials/checkpoint.secret",
-    object_store_credential_pretrained="credentials/pretrained.secret",
-)
-
-
-def register_cluster():
-    cs = ConfigStore.instance()
-    cs.store(group="cluster", package="job.cluster", name="default", node=DefaultClusterConfig)
diff --git a/cosmos_framework/configs/base/defaults/compile.py b/cosmos_framework/configs/base/defaults/compile.py
index b0e1c88..3d5ebf7 100644
--- a/cosmos_framework/configs/base/defaults/compile.py
+++ b/cosmos_framework/configs/base/defaults/compile.py
@@ -24,7 +24,7 @@ class CompileConfig:
     # (maps to ``torch.compile(dynamic=...)``).  Defaults to True for training,
     # which sees varying shapes across batches (sequence length, CP sharding, ...);
     # specializing would recompile continuously.  See ParallelismOverrides in
-    # cosmos_framework/inference/common/args.py for the inference-side rationale
+    # packages/cosmos3/cosmos3/common/args.py for the inference-side rationale
     # (where dynamic=False is preferred for stable AR shapes).
     compile_dynamic: bool = True
 
diff --git a/cosmos_framework/configs/base/defaults/model_config.py b/cosmos_framework/configs/base/defaults/model_config.py
index c7b9a5c..f7e7c8d 100644
--- a/cosmos_framework/configs/base/defaults/model_config.py
+++ b/cosmos_framework/configs/base/defaults/model_config.py
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-
 from typing import Any
 
 import attrs
@@ -42,6 +41,14 @@ class DiffusionExpertConfig:
     rope_t_extrapolation_ratio: float = 1.0
     enable_fps_modulation: bool = False
     base_fps: int = 24
+    # Base temporal compression factor for SOUND m-RoPE. None keeps the default behavior
+    # (sound advances at base_fps positions/sec). Set it to the vision temporal compression
+    # factor (4) to put sound on the same latent-frame temporal grid as vision/action.
+    sound_base_temporal_compression_factor: int | None = None
+    # Temporal coordinates used for unified_3d_mrope vision tokens.
+    # - "latent_index": legacy behavior, positions are 0, 1, ..., T_latent-1.
+    # - "uniae_source_right_edge": use UniAE padded-patch right-edge source-frame coordinates.
+    vision_temporal_position_mode: str = "latent_index"
     # For unified_3d_mrope: whether spatial (H, W) indices reset to 0 for each vision segment
     unified_3d_mrope_reset_spatial_ids: bool = True
     # Setting the temporal gap on the boundary of the different modalities, default is 0, using a value greater than 0 will add an additional offset on the accumulated temporal offset.
@@ -273,3 +280,12 @@ class OmniMoTModelConfig:
     sound_latent_fps: int = 25  # Sound tokenizer's latent rate (e.g., 48kHz / 1920 hop = 25 Hz)
 
     log_enc_time_every_n: int = 100  # Frequency of logging encoding time to W&B
+
+    # When True, ``OmniMoTModel.state_dict`` / ``load_state_dict`` skip the
+    # reasoner (und) pathway weights under ``language_model`` — i.e. every key
+    # WITHOUT a ``_moe_gen`` suffix (including ``visual`` / ``lm_head`` /
+    # ``embed_tokens``).  These are not written to checkpoints and are left
+    # untouched on load (typically already populated from the HF pretrained
+    # backbone).  Generation-pathway (``_moe_gen``) and VFM heads are saved /
+    # loaded as usual.
+    exclude_reasoner_weights_from_checkpoint: bool = False
diff --git a/cosmos_framework/configs/base/defaults/multiview_dataloader.py b/cosmos_framework/configs/base/defaults/multiview_dataloader.py
deleted file mode 100644
index a646ac6..0000000
--- a/cosmos_framework/configs/base/defaults/multiview_dataloader.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-"""
-Hydra ConfigStore registration for multiview dataloaders.
-
-Registers named dataloader configs that can be referenced via Hydra overrides
-(e.g. ``{override /data_train: video_control_mads_multiview_0823_gcs_720p_10fps_93frames_7views}``)
-or used as templates for inline ``L(get_multiview_video_loader)(...)`` in
-experiment configs.
-
-Two naming conventions:
-
-  **Transfer** (with control signal):
-    ``video_control_{dataset}_{store}_{res}_{fps}_{frames}_{views}``
-
-  **Predict** (no control signal):
-    ``video_{dataset}_{store}_{res}_{fps}_{frames}_{views}``
-"""
-
-from hydra.core.config_store import ConfigStore
-
-from cosmos_framework.utils.lazy_config import LazyCall as L
-from cosmos_framework.data.vfm.multiview.multiview_data_source import (
-    DEFAULT_CAMERAS,
-    INDEX_TO_CAMERA_MAPPING,
-    TRANSFER_CAPTION_KEY_MAPPING,
-    TRANSFER_CONTROL_KEY_MAPPING,
-    TRANSFER_VIDEO_KEY_MAPPING,
-)
-from cosmos_framework.data.vfm.multiview.multiview_dataset import (
-    MultiviewAugmentationConfig,
-    get_multiview_video_loader,
-)
-
-# ---------------------------------------------------------------------------
-# Camera view subsets
-# ---------------------------------------------------------------------------
-
-CAMERA_VIEW_CONFIGS: dict[str, tuple[str, ...]] = {
-    "7views": DEFAULT_CAMERAS,
-    "1view_front": ("camera_front_wide_120fov",),
-    "4views": (
-        "camera_front_wide_120fov",
-        "camera_cross_right_120fov",
-        "camera_rear_tele_30fov",
-        "camera_cross_left_120fov",
-    ),
-}
-
-# ---------------------------------------------------------------------------
-# Grid dimensions
-# ---------------------------------------------------------------------------
-
-_TRANSFER_DATASETS = ["mads_multiview_0823"]
-_OBJECT_STORES = ["gcs"]
-
-_RESOLUTIONS: list[tuple[str, tuple[int, int]]] = [
-    ("720p", (720, 1280)),
-]
-
-_FPS: list[tuple[str, int]] = [
-    ("10fps", 1),  # MADS transfer data is already at 10 fps
-]
-
-_NUM_VIDEO_FRAMES: list[tuple[str, int]] = [
-    ("29frames", 29),
-    ("61frames", 61),
-    ("93frames", 93),
-]
-
-
-def register_multiview_dataloaders() -> None:
-    """Register all multiview dataloader configs with Hydra ConfigStore."""
-
-    cs = ConfigStore.instance()
-
-    # ----- Transfer dataloaders (with control signals) -----
-    for dataset in _TRANSFER_DATASETS:
-        for object_store in _OBJECT_STORES:
-            for resolution_str, resolution_hw in _RESOLUTIONS:
-                for fps_str, downsample_factor in _FPS:
-                    for num_frames_str, num_frames in _NUM_VIDEO_FRAMES:
-                        for views_str, camera_keys in CAMERA_VIEW_CONFIGS.items():
-                            name = (
-                                f"video_control_{dataset}_{object_store}_{resolution_str}_"
-                                f"{fps_str}_{num_frames_str}_{views_str}"
-                            )
-                            cs.store(
-                                group="data_train",
-                                package="dataloader_train",
-                                name=name,
-                                node=L(get_multiview_video_loader)(
-                                    dataset_name=dataset,
-                                    is_train=True,
-                                    augmentation_config=L(MultiviewAugmentationConfig)(
-                                        resolution_hw=resolution_hw,
-                                        fps_downsample_factor=downsample_factor,
-                                        num_video_frames=num_frames,
-                                        camera_keys=camera_keys,
-                                        camera_video_key_mapping=TRANSFER_VIDEO_KEY_MAPPING,
-                                        camera_caption_key_mapping=TRANSFER_CAPTION_KEY_MAPPING,
-                                        camera_control_key_mapping=TRANSFER_CONTROL_KEY_MAPPING,
-                                        position_to_camera_mapping=INDEX_TO_CAMERA_MAPPING,
-                                        single_caption_camera_name="camera_front_wide_120fov",
-                                    ),
-                                ),
-                            )
-
-    # ----- Predict dataloaders (no control signals, for future use) -----
-    # These use named keys (video_camera_front_wide_120fov, etc.) and need
-    # different datasets (e.g. alpamayo_dec2024) with 30 fps native data.
-    # Uncomment and add predict datasets to the catalog when needed.
-    #
-    # _PREDICT_DATASETS = ["alpamayo_dec2024"]
-    # _PREDICT_FPS = [("10fps", 3), ("15fps", 2)]  # 30 fps native → downsample
-    # for dataset in _PREDICT_DATASETS:
-    #     for object_store in _OBJECT_STORES:
-    #         for resolution_str, resolution_hw in _RESOLUTIONS:
-    #             for fps_str, downsample_factor in _PREDICT_FPS:
-    #                 for num_frames_str, num_frames in _NUM_VIDEO_FRAMES:
-    #                     for views_str, camera_keys in CAMERA_VIEW_CONFIGS.items():
-    #                         name = (
-    #                             f"video_{dataset}_{object_store}_{resolution_str}_"
-    #                             f"{fps_str}_{num_frames_str}_{views_str}"
-    #                         )
-    #                         cs.store(
-    #                             group="data_train",
-    #                             package="dataloader_train",
-    #                             name=name,
-    #                             node=L(get_multiview_video_loader)(
-    #                                 dataset_name=dataset,
-    #                                 is_train=True,
-    #                                 augmentation_config=L(MultiviewAugmentationConfig)(
-    #                                     resolution_hw=resolution_hw,
-    #                                     fps_downsample_factor=downsample_factor,
-    #                                     num_video_frames=num_frames,
-    #                                     camera_keys=camera_keys,
-    #                                     camera_video_key_mapping=PREDICT_VIDEO_KEY_MAPPING,
-    #                                     camera_caption_key_mapping=PREDICT_CAPTION_KEY_MAPPING,
-    #                                     camera_control_key_mapping=None,
-    #                                     position_to_camera_mapping=None,
-    #                                     single_caption_camera_name=None,
-    #                                 ),
-    #                             ),
-    #                         )
-
-
-# Auto-register on import
-register_multiview_dataloaders()
diff --git a/cosmos_framework/configs/base/defaults/tokenizer.py b/cosmos_framework/configs/base/defaults/tokenizer.py
index 55cb01c..526d579 100644
--- a/cosmos_framework/configs/base/defaults/tokenizer.py
+++ b/cosmos_framework/configs/base/defaults/tokenizer.py
@@ -17,10 +17,12 @@
 PRETRAINED_TOKENIZER_FLUX_VAE_PTH = "pretrained/tokenizers/image/flux/ae.safetensors"
 
 # UniAE checkpoint paths
-PRETRAINED_TOKENIZER_UNIAE_4X16X16_C48_T8TO24_64TO512P_FPS_ALL_ENCODER_NONCAUSAL_DECODER_NONCAUSAL_NOGAN_BEST_S1_VAE_PTH = "pretrained/tokenizers/video/cosmos/uniae4x16x16_c48_t8to24_64to512p_fps_all_encoder_noncausal_decoder_noncausal_nogan_best_s1.pt"
+PRETRAINED_TOKENIZER_UNIAE_4X16X16_C48_T16TO160_MIXP_FPS_MIX_ENCODER_NONCAUSAL_DECODER_NONCAUSAL_NOGAN_S3_NEMOTRON2B_VAE_PTH = (
+    "s3://bucket1/uniae/tok_experiments/"
+    "s3_siglip2_so400m_singledec_l48_textdec_nemotron2b_32node_bucketed_256480_v45i32c23_t16-160_exp009/checkpoints/iter_000050000.pt"
+)
 
 # DCAE checkpoint paths
-PRETRAINED_TOKENIZER_DCAE_PTH = "pretrained/tokenizers/video/cosmos/dc-ae-v-1.0-f32t4c64-cosmos-encoder-causal-decoder-chunk-causal-4-frame-120-pad-7-no-gan.pt"
 PRETRAINED_TOKENIZER_DCAE_4X32X32_C64_T120_256P_FPS_ALL_ENCODER_CAUSAL_DECODER_CHUNKCAUSAL4_NOGAN_COSMOS_PAD_7_V0PT2_PTH = "pretrained/tokenizers/video/cosmos/dcae4x32x32_c64_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2.pt"
 PRETRAINED_TOKENIZER_DCAE_4X32X32_C96_T120_256P_FPS_ALL_ENCODER_CAUSAL_DECODER_CHUNKCAUSAL4_NOGAN_COSMOS_PAD_7_V0PT2_PTH = "pretrained/tokenizers/video/cosmos/dcae4x32x32_c96_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2.pt"
 PRETRAINED_TOKENIZER_DCAE_4X32X32_C128_T120_256P_FPS_ALL_ENCODER_CAUSAL_DECODER_CHUNKCAUSAL4_NOGAN_COSMOS_PAD_7_V0PT2_PTH = "pretrained/tokenizers/video/cosmos/dcae4x32x32_c128_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2.pt"
@@ -44,6 +46,7 @@
     chunk_duration=1,
     spatial_compression_factor=8,
     temporal_compression_factor=1,
+    causal=True,
 )
 
 Wan2pt1VAEConfig: LazyDict = L(Wan2pt1VAEInterface)(
@@ -53,6 +56,7 @@
     vae_path=PRETRAINED_TOKENIZER_WAN2PT1_VAE_PTH,
     spatial_compression_factor=8,
     temporal_compression_factor=4,
+    causal=True,
 )
 
 Wan2pt2VAEConfig: LazyDict = L(Wan2pt2VAEInterface)(
@@ -61,14 +65,7 @@
     vae_path=PRETRAINED_TOKENIZER_WAN2PT2_VAE_PTH,
     spatial_compression_factor=16,
     temporal_compression_factor=4,
-)
-
-DCAE4x32x32Config: LazyDict = L(DCAE4x32x32Interface)(
-    bucket_name=PLACEHOLDER,
-    object_store_credential_path_pretrained=PLACEHOLDER,
-    vae_path=PRETRAINED_TOKENIZER_DCAE_PTH,
-    spatial_compression_factor=32,
-    temporal_compression_factor=4,
+    causal=True,
 )
 
 DCAE4x32x32C64T120_256pFpsAllEncoderCausalDecoderChunkCausal4NoganCosmosPad7V0pt2Config: LazyDict = L(
@@ -80,6 +77,7 @@
     model_name="dcae4x32x32_c64_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2",
     spatial_compression_factor=32,
     temporal_compression_factor=4,
+    causal=True,
 )
 
 DCAE4x32x32C96T120_256pFpsAllEncoderCausalDecoderChunkCausal4NoganCosmosPad7V0pt2Config: LazyDict = L(
@@ -104,12 +102,17 @@
     temporal_compression_factor=4,
 )
 
-UniAE4x16x16C48T8to24_64to512pFpsAllEncoderNoncausalDecoderNoncausalNoganBestS1Config: LazyDict = L(UniAEVAEInterface)(
+
+UniAE4x16x16C48T16to160MixpFpsMixEncoderNoncausalDecoderNoncausalNoganS3Nemotron2bVAEConfig: LazyDict = L(
+    UniAEVAEInterface
+)(
     bucket_name=PLACEHOLDER,
     object_store_credential_path_pretrained=PLACEHOLDER,
-    vae_path=PRETRAINED_TOKENIZER_UNIAE_4X16X16_C48_T8TO24_64TO512P_FPS_ALL_ENCODER_NONCAUSAL_DECODER_NONCAUSAL_NOGAN_BEST_S1_VAE_PTH,
+    vae_path=PRETRAINED_TOKENIZER_UNIAE_4X16X16_C48_T16TO160_MIXP_FPS_MIX_ENCODER_NONCAUSAL_DECODER_NONCAUSAL_NOGAN_S3_NEMOTRON2B_VAE_PTH,
     spatial_compression_factor=16,
     temporal_compression_factor=4,
+    pixel_trim=True,
+    causal=False,
 )
 
 # =============================================================================
@@ -173,8 +176,8 @@ def register_tokenizer():
     cs.store(
         group="tokenizer",
         package="model.config.tokenizer",
-        name="uniae_4x16x16_c48_t8to24_64to512p_fps_all_encoder_noncausal_decoder_noncausal_nogan_best_s1_tokenizer",
-        node=UniAE4x16x16C48T8to24_64to512pFpsAllEncoderNoncausalDecoderNoncausalNoganBestS1Config,
+        name="uniae_4x16x16_c48_t16to160_mixp_fps_mix_encoder_noncausal_decoder_noncausal_nogan_s3_nemotron2b_tokenizer",
+        node=UniAE4x16x16C48T16to160MixpFpsMixEncoderNoncausalDecoderNoncausalNoganS3Nemotron2bVAEConfig,
     )
     # Flux tokenizer
     cs.store(group="tokenizer", package="model.config.tokenizer", name="flux_tokenizer", node=FluxVAEConfig)
@@ -182,25 +185,19 @@ def register_tokenizer():
     cs.store(
         group="tokenizer",
         package="model.config.tokenizer",
-        name="dc_ae_4x32x32_tokenizer",
-        node=DCAE4x32x32Config,
-    )
-    cs.store(
-        group="tokenizer",
-        package="model.config.tokenizer",
-        name="dc_ae_4x32x32_c64_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2_tokenizer",
+        name="dcae4x32x32_c64_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2_tokenizer",
         node=DCAE4x32x32C64T120_256pFpsAllEncoderCausalDecoderChunkCausal4NoganCosmosPad7V0pt2Config,
     )
     cs.store(
         group="tokenizer",
         package="model.config.tokenizer",
-        name="dc_ae_4x32x32_c96_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2_tokenizer",
+        name="dcae4x32x32_c96_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2_tokenizer",
         node=DCAE4x32x32C96T120_256pFpsAllEncoderCausalDecoderChunkCausal4NoganCosmosPad7V0pt2Config,
     )
     cs.store(
         group="tokenizer",
         package="model.config.tokenizer",
-        name="dc_ae_4x32x32_c128_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2_tokenizer",
+        name="dcae4x32x32_c128_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2_tokenizer",
         node=DCAE4x32x32C128T120_256pFpsAllEncoderCausalDecoderChunkCausal4NoganCosmosPad7V0pt2Config,
     )
 
diff --git a/cosmos_framework/configs/base/defaults/unittest.py b/cosmos_framework/configs/base/defaults/unittest.py
deleted file mode 100644
index 5af5bb2..0000000
--- a/cosmos_framework/configs/base/defaults/unittest.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-import attrs
-
-# We are hardcoding the unittest assets in this file.
-
-# add codeowner for cosmos_framework/model/vfm/tokenizers
-
-
-@attrs.define(slots=False)
-class SwfitStackPDXrConfig:
-    """
-    Config for the cluster specific information.
-    Everything cluster specific should be here.
-    """
-
-    object_store_bucket_data: str
-    object_store_credential_data: str
-
-
-UNITTEST_CONFIG = SwfitStackPDXrConfig(
-    object_store_bucket_data="unittest",
-    object_store_credential_data="credentials/pdx_dir.secret",
-)
-
-TOKENIZER_RECONSTRUCTION_VIDEO_PATH = "tokenizer/video/panda70m_test_0000039_00000.mp4"
-AVAE_RECONSTRUCTION_AUDIO_PATH = "tokenizer/audio/test_audio.wav"
diff --git a/cosmos_framework/configs/base/defaults/vlm.py b/cosmos_framework/configs/base/defaults/vlm.py
index fa4d7c8..32718f4 100644
--- a/cosmos_framework/configs/base/defaults/vlm.py
+++ b/cosmos_framework/configs/base/defaults/vlm.py
@@ -95,11 +95,7 @@ def download_tokenizer_files(model_name: str, config_variant: str) -> str:
     return destination_dir
 
 
-def create_qwen2_tokenizer_with_download(pretrained_model_name: str, config_variant: str, **_unused_kwargs):
-    # **_unused_kwargs absorbs extras (e.g. tokenizer_type) that OmegaConf
-    # merges in from a vlm_config preset's tokenizer block when an experiment
-    # overrides the tokenizer with this function but doesn't fully replace
-    # the preset's kwarg dict.
+def create_qwen2_tokenizer_with_download(pretrained_model_name: str, config_variant: str):
     destination_dir = download_tokenizer_files(pretrained_model_name, config_variant)
     return LLMTokenizerProcessor(Qwen2Tokenizer.from_pretrained(destination_dir))
 
@@ -140,7 +136,7 @@ class VLMConfig:
     # HuggingFace model identifier or local path. Drives AutoConfig + AutoModel selection.
     model_name: str = ""
 
-    # Safetensor path for model
+    # Safetensor path for the model, for loading a safetensor from a different folder
     safetensors_path: str = ""
 
     # Optional pretrained-weights overlay (separate from the AutoModel structural
@@ -285,29 +281,6 @@ class VLMConfig:
     ),
 )
 
-CosmosReason2_VLM_30b_a3b_Private_GCP_Config: VLMConfig = VLMConfig(
-    model_name="nvidia/Cosmos-Reason2-30B-A3B-Private",
-    model_instance=L(Qwen3VLMoeTextForCausalLM)(
-        config=L(create_vlm_config)(
-            base_config=L(Qwen3VLMoeMoTConfig.from_json_file)(
-                json_file="cosmos_framework/model/vfm/vlm/qwen3_vl_moe/configs/Qwen3-VL-30B-A3B-Instruct.json"
-            ),
-            layer_module="Qwen3VLMoeTextMoTDecoderLayer",
-            qk_norm_for_text=True,
-        ),
-    ),
-    tokenizer=L(build_processor_lazy)(
-        tokenizer_type="Qwen/Qwen3-VL-30B-A3B-Instruct",
-        config_variant="gcp",
-    ),
-    layer_module="Qwen3VLMoeTextMoTDecoderLayer",
-    pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos-Reason2-30B-A3B-Private/",
-        credentials_path="credentials/gcp_checkpoint.secret",
-        enable_gcs_patch_in_boto3=True,
-    ),
-)
-
 # Config for Qwen3VL 235B A22B Instruct model
 # Qwen3VLMoE uses Qwen2Tokenizer
 Qwen3VLMoT_VLM_235b_a22b_Instruct_GCP_Config: VLMConfig = VLMConfig(
@@ -458,48 +431,6 @@ class VLMConfig:
     ),
 )
 
-CosmosReason2_VLM_2b_Private_GCP_Config: VLMConfig = VLMConfig(
-    model_name="nvidia/Cosmos-Reason2-2B-Private",
-    model_instance=L(Qwen3VLTextForCausalLM)(
-        config=L(create_vlm_config)(
-            base_config=L(Qwen3VLMoTConfig.from_json_file)(
-                json_file="cosmos_framework/model/vfm/vlm/qwen3_vl/configs/Qwen3-VL-2B-Instruct.json"
-            ),
-            qk_norm_for_text=True,
-        ),
-    ),
-    tokenizer=L(build_processor_lazy)(
-        tokenizer_type="Qwen/Qwen3-VL-2B-Instruct",
-        config_variant="gcp",
-    ),
-    pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos-Reason2-2B-Private/",
-        credentials_path="credentials/gcp_checkpoint.secret",
-        enable_gcs_patch_in_boto3=True,
-    ),
-)
-
-Cosmos3Reasoner_VLM_2b_Private_GCP_Config: VLMConfig = VLMConfig(
-    model_name="nvidia/Cosmos3-Reasoner-2B-Private",
-    model_instance=L(Qwen3VLTextForCausalLM)(
-        config=L(create_vlm_config)(
-            base_config=L(Qwen3VLMoTConfig.from_json_file)(
-                json_file="cosmos_framework/model/vfm/vlm/qwen3_vl/configs/Qwen3-VL-2B-Instruct.json"
-            ),
-            qk_norm_for_text=True,
-        ),
-    ),
-    tokenizer=L(build_processor_lazy)(
-        tokenizer_type="Qwen/Qwen3-VL-2B-Instruct",
-        config_variant="gcp",
-    ),
-    pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Reasoner-2B-Private/",
-        credentials_path="credentials/gcp_checkpoint.secret",
-        enable_gcs_patch_in_boto3=True,
-    ),
-)
-
 # Config for Qwen3VL 4B Instruct model
 # Qwen3VL uses Qwen2Tokenizer
 Qwen3VLMoT_VLM_4b_Instruct_Config: VLMConfig = VLMConfig(
@@ -586,8 +517,8 @@ class VLMConfig:
     ),
 )
 
-CosmosReason2_VLM_8b_Private_GCP_Config: VLMConfig = VLMConfig(
-    model_name="nvidia/Cosmos-Reason2-8B-Private",
+Cosmos3Reasoner_VLM_8b_Private_GCP_Config: VLMConfig = VLMConfig(
+    model_name="nvidia/Cosmos3-Reasoner-8B-Private",
     model_instance=L(Qwen3VLTextForCausalLM)(
         config=L(create_vlm_config)(
             base_config=L(Qwen3VLMoTConfig.from_json_file)(
@@ -601,14 +532,14 @@ class VLMConfig:
         config_variant="gcp",
     ),
     pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos-Reason2-8B-Private/",
+        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Reasoner-8B-Private/",
         credentials_path="credentials/gcp_checkpoint.secret",
         enable_gcs_patch_in_boto3=True,
     ),
 )
 
-Cosmos3Reasoner_VLM_8b_Private_GCP_Config: VLMConfig = VLMConfig(
-    model_name="nvidia/Cosmos3-Reasoner-8B-Private",
+Cosmos3NanoReasoner_VLM_GCP_Config: VLMConfig = VLMConfig(
+    model_name="nvidia/Cosmos3-Nano-Reasoner",
     model_instance=L(Qwen3VLTextForCausalLM)(
         config=L(create_vlm_config)(
             base_config=L(Qwen3VLMoTConfig.from_json_file)(
@@ -617,18 +548,18 @@ class VLMConfig:
             qk_norm_for_text=True,
         ),
     ),
-    tokenizer=L(build_processor_lazy)(
-        tokenizer_type="Qwen/Qwen3-VL-8B-Instruct",
+    tokenizer=L(create_qwen2_tokenizer_with_download)(
+        pretrained_model_name="Qwen/Qwen3-VL-8B-Instruct",
         config_variant="gcp",
     ),
     pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Reasoner-8B-Private/",
+        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Nano-Reasoner/",
         credentials_path="credentials/gcp_checkpoint.secret",
         enable_gcs_patch_in_boto3=True,
     ),
 )
 
-Cosmos3NanoReasoner_VLM_GCP_Config: VLMConfig = VLMConfig(
+Cosmos3NanoReasoner_VLM_GCP_Config_0517: VLMConfig = VLMConfig(
     model_name="nvidia/Cosmos3-Nano-Reasoner",
     model_instance=L(Qwen3VLTextForCausalLM)(
         config=L(create_vlm_config)(
@@ -643,13 +574,12 @@ class VLMConfig:
         config_variant="gcp",
     ),
     pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Nano-Reasoner/",
+        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Nano-Reasoner-bb9c6f5/",
         credentials_path="credentials/gcp_checkpoint.secret",
         enable_gcs_patch_in_boto3=True,
     ),
 )
 
-
 # Config for Qwen3VL 32B Instruct model
 # Qwen3VL uses Qwen2Tokenizer
 Qwen3VLMoT_VLM_32b_Instruct_Config: VLMConfig = VLMConfig(
@@ -693,8 +623,8 @@ class VLMConfig:
     ),
 )
 
-CosmosReason2_VLM_32b_Private_GCP_Config: VLMConfig = VLMConfig(
-    model_name="nvidia/Cosmos-Reason2-32B-Private",
+Cosmos3Reasoner_VLM_32b_Private_GCP_Config: VLMConfig = VLMConfig(
+    model_name="nvidia/Cosmos3-Reasoner-32B-Private",
     model_instance=L(Qwen3VLTextForCausalLM)(
         config=L(create_vlm_config)(
             base_config=L(Qwen3VLMoTConfig.from_json_file)(
@@ -708,14 +638,14 @@ class VLMConfig:
         config_variant="gcp",
     ),
     pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos-Reason2-32B-Private/",
+        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Reasoner-32B-Private/",
         credentials_path="credentials/gcp_checkpoint.secret",
         enable_gcs_patch_in_boto3=True,
     ),
 )
 
-Cosmos3Reasoner_VLM_32b_Private_GCP_Config: VLMConfig = VLMConfig(
-    model_name="nvidia/Cosmos3-Reasoner-32B-Private",
+Cosmos3SuperReasoner_VLM_GCP_Config: VLMConfig = VLMConfig(
+    model_name="nvidia/Cosmos3-Super-Reasoner",
     model_instance=L(Qwen3VLTextForCausalLM)(
         config=L(create_vlm_config)(
             base_config=L(Qwen3VLMoTConfig.from_json_file)(
@@ -724,18 +654,18 @@ class VLMConfig:
             qk_norm_for_text=True,
         ),
     ),
-    tokenizer=L(build_processor_lazy)(
-        tokenizer_type="Qwen/Qwen3-VL-32B-Instruct",
+    tokenizer=L(create_qwen2_tokenizer_with_download)(
+        pretrained_model_name="Qwen/Qwen3-VL-32B-Instruct",
         config_variant="gcp",
     ),
     pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Reasoner-32B-Private/",
+        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Super-Reasoner/",
         credentials_path="credentials/gcp_checkpoint.secret",
         enable_gcs_patch_in_boto3=True,
     ),
 )
 
-Cosmos3SuperReasoner_VLM_GCP_Config: VLMConfig = VLMConfig(
+Cosmos3SuperReasoner_VLM_GCP_Config_0517: VLMConfig = VLMConfig(
     model_name="nvidia/Cosmos3-Super-Reasoner",
     model_instance=L(Qwen3VLTextForCausalLM)(
         config=L(create_vlm_config)(
@@ -750,12 +680,63 @@ class VLMConfig:
         config_variant="gcp",
     ),
     pretrained_weights=PretrainedWeightsConfig(
-        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Super-Reasoner/",
+        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/Cosmos-Reason/Cosmos3-Super-Reasoner-b6df0d1/",
         credentials_path="credentials/gcp_checkpoint.secret",
         enable_gcs_patch_in_boto3=True,
     ),
 )
 
+# Cosmos3-Edge-Reasoner at commit 4acb717.
+# nemotron_siglip2 architecture: Nemotron text backbone (56-block hybrid layout, 2048 hidden)
+# + SigLIP2 vision encoder.  The text transformer is identical in shape to
+# Nemotron-3-Dense-VL-2B (hidden_size=2048, 56 alternating attn/MLP blocks → 28
+# effective MoT layers after _transform_text_dict).  Uses the same
+# nemotron_3_dense_vl weight remapping and config JSON.
+Cosmos3EdgeReasoner_VLM_GCP_Config_4acb717: VLMConfig = VLMConfig(
+    model_name="nvidia/Cosmos3-Edge-Reasoner",
+    model_instance=L(Nemotron3DenseVLTextForCausalLM)(
+        config=L(create_vlm_config)(
+            base_config=L(Nemotron3DenseVLMoTConfig.from_json_file)(
+                json_file="cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/configs/Nemotron-2B-Dense-VL.json"
+            ),
+            qk_norm_for_text=False,
+        ),
+    ),
+    tokenizer=L(build_processor_lazy)(
+        tokenizer_type="nvidia/Cosmos3-Edge-Reasoner",
+        config_variant="gcp",
+    ),
+    pretrained_weights=PretrainedWeightsConfig(
+        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/nvidia/Cosmos3-Edge-Reasoner-4acb717/",
+        credentials_path="credentials/gcp_checkpoint.secret",
+        enable_gcs_patch_in_boto3=True,
+        checkpoint_format="nemotron_3_dense_vl",
+    ),
+)
+
+# Cosmos3-Edge-Reasoner at commit 9b4c028 (weights uploaded 2026-05-29).
+# Same nemotron_siglip2 architecture and settings as 4acb717; only backbone_path differs.
+Cosmos3EdgeReasoner_VLM_GCP_Config_9b4c028: VLMConfig = VLMConfig(
+    model_name="nvidia/Cosmos3-Edge-Reasoner",
+    model_instance=L(Nemotron3DenseVLTextForCausalLM)(
+        config=L(create_vlm_config)(
+            base_config=L(Nemotron3DenseVLMoTConfig.from_json_file)(
+                json_file="cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/configs/Nemotron-2B-Dense-VL.json"
+            ),
+            qk_norm_for_text=False,
+        ),
+    ),
+    tokenizer=L(build_processor_lazy)(
+        tokenizer_type="nvidia/Cosmos3-Edge-Reasoner",
+        config_variant="gcp",
+    ),
+    pretrained_weights=PretrainedWeightsConfig(
+        backbone_path="s3://bucket0/cosmos3/pretrained/huggingface/nvidia/Cosmos3-Edge-Reasoner-9b4c028/",
+        credentials_path="credentials/gcp_checkpoint.secret",
+        enable_gcs_patch_in_boto3=True,
+        checkpoint_format="nemotron_3_dense_vl",
+    ),
+)
 
 
 def register_vlm():
@@ -832,24 +813,6 @@ def register_vlm():
         name="cosmos_reason2_vlm_2b_gcp",
         node=CosmosReason2_VLM_2b_GCP_Config,
     )
-    cs.store(
-        group="vlm_config",
-        package="model.config.vlm_config",
-        name="cosmos_reason2_vlm_2b_private_gcp",
-        node=CosmosReason2_VLM_2b_Private_GCP_Config,
-    )
-    cs.store(
-        group="vlm_config",
-        package="model.config.vlm_config",
-        name="cosmos3_reasoner_vlm_2b_private_gcp",
-        node=Cosmos3Reasoner_VLM_2b_Private_GCP_Config,
-    )
-    cs.store(
-        group="vlm_config",
-        package="model.config.vlm_config",
-        name="cosmos_reason2_vlm_8b_private_gcp",
-        node=CosmosReason2_VLM_8b_Private_GCP_Config,
-    )
     cs.store(
         group="vlm_config",
         package="model.config.vlm_config",
@@ -865,8 +828,8 @@ def register_vlm():
     cs.store(
         group="vlm_config",
         package="model.config.vlm_config",
-        name="cosmos_reason2_vlm_32b_private_gcp",
-        node=CosmosReason2_VLM_32b_Private_GCP_Config,
+        name="cosmos3_nano_reasoner_vlm_gcp_0517",
+        node=Cosmos3NanoReasoner_VLM_GCP_Config_0517,
     )
     cs.store(
         group="vlm_config",
@@ -883,8 +846,8 @@ def register_vlm():
     cs.store(
         group="vlm_config",
         package="model.config.vlm_config",
-        name="cosmos_reason2_vlm_30b_a3b_private_gcp",
-        node=CosmosReason2_VLM_30b_a3b_Private_GCP_Config,
+        name="cosmos3_super_reasoner_vlm_gcp_0517",
+        node=Cosmos3SuperReasoner_VLM_GCP_Config_0517,
     )
     cs.store(
         group="vlm_config",
@@ -922,3 +885,15 @@ def register_vlm():
         name="qwen3_vl_mot_vlm_32b_instruct_gcp",
         node=Qwen3VLMoT_VLM_32b_Instruct_GCP_Config,
     )
+    cs.store(
+        group="vlm_config",
+        package="model.config.vlm_config",
+        name="cosmos3_edge_reasoner_vlm_gcp_4acb717",
+        node=Cosmos3EdgeReasoner_VLM_GCP_Config_4acb717,
+    )
+    cs.store(
+        group="vlm_config",
+        package="model.config.vlm_config",
+        name="cosmos3_edge_reasoner_vlm_gcp_9b4c028",
+        node=Cosmos3EdgeReasoner_VLM_GCP_Config_9b4c028,
+    )
diff --git a/cosmos_framework/configs/base/experiment/action/__init__.py b/cosmos_framework/configs/base/experiment/action/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/experiment/action/__init__.py
+++ b/cosmos_framework/configs/base/experiment/action/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/__init__.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/experiment/action/posttrain_config/__init__.py
+++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
index 4c3a6fb..4bc0c29 100644
--- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
+++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
@@ -59,7 +59,6 @@
             {"override /ema": "power"},
             {"override /tokenizer": "wan2pt2_tokenizer"},
             {"override /sound_tokenizer": None},
-            {"override /cluster": None},
             {"override /vlm_config": None},
             {"override /ckpt_type": "dcp"},
             "_self_",
diff --git a/cosmos_framework/configs/base/experiment/action/pretrained_config/__init__.py b/cosmos_framework/configs/base/experiment/action/pretrained_config/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/experiment/action/pretrained_config/__init__.py
+++ b/cosmos_framework/configs/base/experiment/action/pretrained_config/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/experiment/posttrain_video/__init__.py b/cosmos_framework/configs/base/experiment/posttrain_video/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/experiment/posttrain_video/__init__.py
+++ b/cosmos_framework/configs/base/experiment/posttrain_video/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/experiment/sft/vision_sft_nano.py b/cosmos_framework/configs/base/experiment/sft/vision_sft_nano.py
index fc78994..46957d1 100644
--- a/cosmos_framework/configs/base/experiment/sft/vision_sft_nano.py
+++ b/cosmos_framework/configs/base/experiment/sft/vision_sft_nano.py
@@ -78,7 +78,6 @@
             {"override /ema": "power"},
             {"override /tokenizer": "wan2pt2_tokenizer"},
             {"override /sound_tokenizer": None},
-            {"override /cluster": None},
             {"override /vlm_config": None},
             {"override /ckpt_type": "dcp"},
             "_self_",
diff --git a/cosmos_framework/configs/base/experiment/sft/vision_sft_super.py b/cosmos_framework/configs/base/experiment/sft/vision_sft_super.py
index a1134c5..a49bb3d 100644
--- a/cosmos_framework/configs/base/experiment/sft/vision_sft_super.py
+++ b/cosmos_framework/configs/base/experiment/sft/vision_sft_super.py
@@ -87,7 +87,6 @@
             {"override /ema": "power"},
             {"override /tokenizer": "wan2pt2_tokenizer"},
             {"override /sound_tokenizer": None},
-            {"override /cluster": None},
             {"override /vlm_config": None},
             {"override /ckpt_type": "dcp"},
             "_self_",
diff --git a/cosmos_framework/configs/base/vlm/__init__.py b/cosmos_framework/configs/base/vlm/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/vlm/__init__.py
+++ b/cosmos_framework/configs/base/vlm/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/vlm/config.py b/cosmos_framework/configs/base/vlm/config.py
index 66dc3ed..4d1de85 100644
--- a/cosmos_framework/configs/base/vlm/config.py
+++ b/cosmos_framework/configs/base/vlm/config.py
@@ -4,10 +4,9 @@
 from cosmos_framework.trainer import ImaginaireTrainer
 from cosmos_framework.utils import log
 from cosmos_framework.utils.config_helper import import_all_modules_from_package
+from cosmos_framework.configs.base.defaults.checkpointer import register_checkpoint, register_ckpt_type
 from cosmos_framework.configs.base.vlm.defaults.callbacks import register_callbacks
-from cosmos_framework.configs.base.vlm.defaults.checkpointer import register_checkpoint, register_ckpt_type
 from cosmos_framework.configs.base.vlm.defaults.config import Config
-
 from cosmos_framework.configs.base.vlm.defaults.model import register_model
 from cosmos_framework.configs.base.vlm.defaults.optimizer import register_optimizer, register_scheduler
 from cosmos_framework.configs.base.vlm.defaults.vlm_policy import register_vlm_policy
diff --git a/cosmos_framework/configs/base/vlm/defaults/__init__.py b/cosmos_framework/configs/base/vlm/defaults/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/vlm/defaults/__init__.py
+++ b/cosmos_framework/configs/base/vlm/defaults/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/vlm/defaults/callbacks.py b/cosmos_framework/configs/base/vlm/defaults/callbacks.py
index 1910b63..3392205 100644
--- a/cosmos_framework/configs/base/vlm/defaults/callbacks.py
+++ b/cosmos_framework/configs/base/vlm/defaults/callbacks.py
@@ -12,7 +12,6 @@
 from cosmos_framework.utils.lazy_config import LazyCall as L
 from cosmos_framework.utils.callback import LowPrecisionCallback, WandBCallback
 from cosmos_framework.callbacks.dataloader_state import DataLoaderStateCallback
-
 from cosmos_framework.callbacks.grad_clip import GradClip
 from cosmos_framework.callbacks.hf_export import HFExportCallback
 from cosmos_framework.callbacks.iter_speed import IterSpeed
@@ -47,7 +46,6 @@ def register_callbacks():
             config=PLACEHOLDER,
             trainer=PLACEHOLDER,
         ),  # reads model.precision; no extra kwarg needed
-
         # nvtx=L(NVTXCallback)(synchronize=True),
     )
 
diff --git a/cosmos_framework/configs/base/vlm/defaults/dataloader.py b/cosmos_framework/configs/base/vlm/defaults/dataloader.py
deleted file mode 100644
index 36b878d..0000000
--- a/cosmos_framework/configs/base/vlm/defaults/dataloader.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-from torch.utils.data import DataLoader
-
-from cosmos_framework.utils.lazy_config import LazyCall as L
-from cosmos_framework.utils.config_helper import ConfigStore
-from cosmos_framework.data.vfm.vlm.collate_fn import custom_collate
-from cosmos_framework.data.vfm.vlm.debug_data_qwen import DebugQwenDataset
-from cosmos_framework.data.vfm.vlm.dummy_data_qwen import DummyQwenDataset
-from cosmos_framework.data.vfm.processors import build_processor_lazy
-
-
-# Debug dataset
-def create_debug_dataloader_config_qwen(
-    num_images, loss_on_completion_only: bool = True, use_dummy_image: bool = False
-):
-    return L(DataLoader)(
-        dataset=L(DebugQwenDataset)(
-            tokenizer=L(build_processor_lazy)(
-                tokenizer_type="${model.config.policy.backbone.model_name}",
-                credentials="${checkpoint.load_from_object_store.credentials}",
-                bucket="${checkpoint.load_from_object_store.bucket}",
-            ),
-            num_images=num_images,
-            seq_len="${model.config.policy.model_max_length}",
-            image_token_len="${model.config.policy.qwen_max_video_token_length}",
-            # use_dummy_image=use_dummy_image,
-        ),
-        num_workers=8,
-        prefetch_factor=4,
-        batch_size=1,
-        sampler=None,
-        persistent_workers=False,
-        pin_memory=True,
-        collate_fn=custom_collate,
-    )
-
-
-def create_dummy_dataloader_config_qwen():
-    return L(DataLoader)(
-        dataset=L(DummyQwenDataset)(
-            tokenizer=L(build_processor_lazy)(
-                tokenizer_type="${model.config.policy.backbone.model_name}",
-                credentials="${checkpoint.load_from_object_store.credentials}",
-                bucket="${checkpoint.load_from_object_store.bucket}",
-            ),
-            num_visual_tokens="${model.config.policy.qwen_max_video_token_length}",
-            total_tokens="${model.config.policy.model_max_length}",
-            batch_size="${dataloader_train.batch_size}",
-        ),
-        num_workers=8,
-        prefetch_factor=4,
-        batch_size=1,
-        sampler=None,
-        persistent_workers=False,
-        pin_memory=True,
-        collate_fn=custom_collate,
-    )
-
-
-def register_data_debug():
-    cs = ConfigStore.instance()
-    for split in ["train", "val"]:
-        cs.store(
-            group=f"data_{split}",
-            package=f"dataloader_{split}",
-            name="debug_image_data_qwen",  # This data is from pixtral model output, expected to have low loss ~1.4
-            node=create_debug_dataloader_config_qwen(1),
-        )
-        cs.store(
-            group=f"data_{split}",
-            package=f"dataloader_{split}",
-            name="dummy_image_data_qwen",
-            node=create_dummy_dataloader_config_qwen(),
-        )
-
-
-def register_data():
-    register_data_debug()
diff --git a/cosmos_framework/configs/base/vlm/defaults/optimizer.py b/cosmos_framework/configs/base/vlm/defaults/optimizer.py
index 6632dc8..538ab7d 100644
--- a/cosmos_framework/configs/base/vlm/defaults/optimizer.py
+++ b/cosmos_framework/configs/base/vlm/defaults/optimizer.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """Hydra config registrations for VLM optimizer + LR scheduler."""
 
 from typing import Any
diff --git a/cosmos_framework/configs/base/vlm/defaults/policy_config.py b/cosmos_framework/configs/base/vlm/defaults/policy_config.py
index 307eb92..7df9987 100644
--- a/cosmos_framework/configs/base/vlm/defaults/policy_config.py
+++ b/cosmos_framework/configs/base/vlm/defaults/policy_config.py
@@ -29,7 +29,7 @@ class PolicyConfig:
     trainable_map: Union[str, None] = None
     monkey_patch_for_text_only_data: bool = False
 
-    # HF attention impl. Default "cosmos" routes through imaginaire.attention
+    # HF attention impl. Default "cosmos" routes through cosmos_framework.model.attention
     # (NATTEN/blackwell-fmha on GB200). Override to "flash_attention_2",
     # "sdpa", or "eager" for fallback.
     attn_implementation: str = "cosmos"
@@ -53,5 +53,6 @@ class VLMModelConfig:
     ema: EMAConfig = EMAConfig(enabled=False)
 
     # Force deterministic kernels in Flash-Attention init (slower; required for
-    # parity bit-exactness)
+    # parity bit-exactness). VLM-only knob — consumed by VLMModel.__init__ via
+    # init_flash_attn_meta.
     deterministic: bool = False
diff --git a/cosmos_framework/configs/base/vlm/experiment/__init__.py b/cosmos_framework/configs/base/vlm/experiment/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/configs/base/vlm/experiment/__init__.py
+++ b/cosmos_framework/configs/base/vlm/experiment/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/configs/base/vlm/freeze_config.py b/cosmos_framework/configs/base/vlm/freeze_config.py
index 9629782..d161fe8 100644
--- a/cosmos_framework/configs/base/vlm/freeze_config.py
+++ b/cosmos_framework/configs/base/vlm/freeze_config.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """VLM freeze config (read by ``vlm_model._apply_freeze_config``)."""
 
 import attrs
diff --git a/cosmos_framework/data/imaginaire/webdataset/augmentors/image/__init__.py b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/__init__.py
new file mode 100644
index 0000000..28a81be
--- /dev/null
+++ b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
diff --git a/cosmos_framework/data/imaginaire/webdataset/augmentors/image/cropping.py b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/cropping.py
new file mode 100644
index 0000000..b34cb81
--- /dev/null
+++ b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/cropping.py
@@ -0,0 +1,150 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+from typing import Optional
+
+import torch
+import torchvision.transforms.functional as transforms_F
+from loguru import logger as logging
+
+from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
+from cosmos_framework.data.imaginaire.webdataset.augmentors.image.misc import obtain_augmentation_size, obtain_image_size
+
+
+class CenterCrop(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Center-crops every input key to the augmentation size resolved for the sample.
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict where images are center cropped.
+            The cropping parameters are also saved under data_dict["aug_params"]["cropping"]
+            so that they can be used by other transforms.
+        """
+        assert (self.args is not None) and ("size" in self.args), "Please specify size in args"
+
+        img_size = obtain_augmentation_size(data_dict, self.args)
+        width, height = img_size  # the resolved size is ordered (width, height)
+
+        orig_w, orig_h = obtain_image_size(data_dict, self.input_keys)
+        for key in self.input_keys:
+            data_dict[key] = transforms_F.center_crop(data_dict[key], [height, width])
+
+        # We also add the aug params we use. This will be useful for other transforms
+        crop_x0 = (orig_w - width) // 2
+        crop_y0 = (orig_h - height) // 2
+        cropping_params = {
+            "resize_w": orig_w,  # size before cropping
+            "resize_h": orig_h,
+            "crop_x0": crop_x0,
+            "crop_y0": crop_y0,
+            "crop_w": width,
+            "crop_h": height,
+        }
+
+        if "aug_params" not in data_dict:
+            data_dict["aug_params"] = dict()
+
+        data_dict["aug_params"]["cropping"] = cropping_params
+        return data_dict
+
+
+class BottomCrop(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Crops rows from the bottom of the image/video to reach ``target_height``.
+
+        The top of the frame is preserved (content is top-anchored). Width is unchanged.
+        Works for 3-D ``[C, H, W]`` images and 4-D ``[C, T, H, W]`` or ``[T, C, H, W]``
+        videos — the last two dims are always treated as (H, W).
+
+        Args:
+            data_dict (dict): Input data dict. ``self.args["target_height"]`` is the
+                desired output height. Source height must be ``>= target_height``.
+
+        Returns:
+            data_dict (dict): Output dict where images are bottom-cropped and
+            ``image_size`` is updated to ``[target_h, w, orig_h, orig_w]`` to mirror
+            :class:`ReflectionPadding`'s contract.
+        """
+        assert (self.args is not None) and ("target_height" in self.args), "Please specify target_height in args"
+        if self.output_keys is None:
+            self.output_keys = self.input_keys  # no output keys given: results overwrite the inputs
+
+        target_h = int(self.args["target_height"])
+        orig_w, orig_h = obtain_image_size(data_dict, self.input_keys)  # measured on the first input key
+        assert orig_h >= target_h, (
+            f"BottomCrop requires source height >= target_height: got orig_h={orig_h}, target_h={target_h}"
+        )
+
+        for inp_key, out_key in zip(self.input_keys, self.output_keys):
+            tensor = data_dict[inp_key]
+            # Slice the last 2 dims; the second-to-last dim is height regardless of
+            # whether the tensor is CHW, CTHW, or TCHW.
+            data_dict[out_key] = tensor[..., :target_h, :]
+
+            if out_key != inp_key:
+                del data_dict[inp_key]  # result lives under a new key; drop the source entry
+
+        data_dict["image_size"] = torch.tensor([target_h, orig_w, orig_h, orig_w], dtype=torch.float)
+
+        return data_dict
+
+
+class RandomCrop(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Performs random crop, falling back to a center crop if random offsets cannot be drawn.
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict where images are randomly cropped.
+            We also save the cropping parameters in the aug_params dict
+            so that it will be used by other transforms.
+        """
+
+        img_size = obtain_augmentation_size(data_dict, self.args)
+        width, height = img_size
+
+        orig_w, orig_h = obtain_image_size(data_dict, self.input_keys)
+        # Obtaining random crop coords
+        try:
+            crop_x0 = int(torch.randint(0, orig_w - width + 1, size=(1,)).item())
+            crop_y0 = int(torch.randint(0, orig_h - height + 1, size=(1,)).item())
+        except Exception:  # e.g. randint rejects the empty range when the crop exceeds the image
+            logging.warning(
+                f"Random crop failed. Performing center crop, original_size(wxh): {orig_w}x{orig_h}, random_size(wxh): {width}x{height}"
+            )
+            for key in self.input_keys:  # the center crop is the final result (no second crop)
+                data_dict[key] = transforms_F.center_crop(data_dict[key], [height, width])
+            crop_x0 = (orig_w - width) // 2
+            crop_y0 = (orig_h - height) // 2
+        else:
+            # We must perform same random cropping for all input keys
+            for key in self.input_keys:
+                data_dict[key] = transforms_F.crop(data_dict[key], crop_y0, crop_x0, height, width)
+
+        # We also add the aug params we use. This will be useful for other transforms
+        cropping_params = {
+            "resize_w": orig_w,
+            "resize_h": orig_h,
+            "crop_x0": crop_x0,
+            "crop_y0": crop_y0,
+            "crop_w": width,
+            "crop_h": height,
+        }
+
+        if "aug_params" not in data_dict:
+            data_dict["aug_params"] = dict()
+
+        data_dict["aug_params"]["cropping"] = cropping_params
+        return data_dict
diff --git a/cosmos_framework/data/imaginaire/webdataset/augmentors/image/flip.py b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/flip.py
new file mode 100644
index 0000000..8f0bb7d
--- /dev/null
+++ b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/flip.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+from typing import Optional
+
+import torch
+import torchvision.transforms.functional as transforms_F
+
+from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
+
+
+class HorizontalFlip(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Flips all input keys horizontally with probability ``prob`` (default 0.5).
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict where images are either all flipped or all untouched.
+        """
+        args = self.args if self.args is not None else {}  # args is optional; read it as a mapping
+        if args.get("enabled", True):
+            # A single draw is shared by every key so that all inputs get the same decision.
+            should_flip = torch.rand(1).item() < args.get("prob", 0.5)
+            for key in self.input_keys:
+                if should_flip:
+                    data_dict[key] = transforms_F.hflip(data_dict[key])
+
+        return data_dict
diff --git a/cosmos_framework/data/imaginaire/webdataset/augmentors/image/misc.py b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/misc.py
new file mode 100644
index 0000000..d3e5216
--- /dev/null
+++ b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/misc.py
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+from typing import Union
+
+import torch
+from PIL import Image
+
+
+def obtain_image_size(data_dict: dict, input_keys: list) -> tuple[int, int]:
+    r"""Function for obtaining the image size from the first input key of the data dict.
+
+    Args:
+        data_dict (dict): Input data dict
+        input_keys (list): List of input keys; only ``input_keys[0]`` is inspected
+    Returns:
+        width (int): Width of the input image
+        height (int): Height of the input image
+    """
+
+    data1 = data_dict[input_keys[0]]
+    if isinstance(data1, Image.Image):
+        width, height = data1.size
+    elif isinstance(data1, torch.Tensor):
+        height, width = data1.size()[-2:]  # the last two dims are treated as (H, W)
+    else:  # neither a PIL image nor a tensor
+        raise ValueError("data to random crop should be PIL Image or tensor")
+
+    return width, height
+
+
+def obtain_augmentation_size(data_dict: dict, augmentor_cfg: dict) -> Union[int, tuple]:
+    r"""Function for obtaining size of the augmentation.
+    When dealing with multi-aspect ratio dataloaders, we need to
+    find the augmentation size from the aspect ratio of the data.
+    If data_dict contains "_res_size_map" (e.g. from resolution sampling),
+    that map is used instead of augmentor_cfg["size"].
+
+    Args:
+        data_dict (dict): Input data dict
+        augmentor_cfg (dict): Augmentor config; ``augmentor_cfg["size"]`` is indexed by aspect ratio
+    Returns:
+        aug_size (int or tuple): Size of augmentation registered for the sample's aspect ratio
+    """
+    if "__url__" in data_dict and "aspect_ratio" in data_dict["__url__"].meta.opts:
+        aspect_ratio = data_dict["__url__"].meta.opts["aspect_ratio"]
+    else:  # Non-webdataset format, or no "aspect_ratio" entry in the url opts
+        aspect_ratio = data_dict["aspect_ratio"]
+    if "_res_size_map" in data_dict:
+        return data_dict["_res_size_map"][aspect_ratio]
+    return augmentor_cfg["size"][aspect_ratio]
diff --git a/cosmos_framework/data/imaginaire/webdataset/augmentors/image/normalize.py b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/normalize.py
new file mode 100644
index 0000000..a949230
--- /dev/null
+++ b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/normalize.py
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+from typing import Optional
+
+import torch
+import torchvision.transforms.functional as transforms_F
+
+from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
+
+
+class Normalize(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Converts inputs to tensors scaled by 1/255 and normalizes them with ``args["mean"]`` / ``args["std"]``.
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict where every input key holds a normalized tensor.
+        """
+        assert self.args is not None, "Please specify args"
+
+        mean = self.args["mean"]
+        std = self.args["std"]
+
+        for key in self.input_keys:
+            if isinstance(data_dict[key], torch.Tensor):
+                data_dict[key] = data_dict[key].to(dtype=torch.get_default_dtype()).div(255)
+            else:
+                data_dict[key] = transforms_F.to_tensor(data_dict[key])  # division by 255 is applied in to_tensor()
+
+            data_dict[key] = transforms_F.normalize(tensor=data_dict[key], mean=mean, std=std)
+        return data_dict
diff --git a/cosmos_framework/data/imaginaire/webdataset/augmentors/image/padding.py b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/padding.py
new file mode 100644
index 0000000..e14d66f
--- /dev/null
+++ b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/padding.py
@@ -0,0 +1,60 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+from typing import Optional
+
+import omegaconf
+import torch
+import torchvision.transforms.functional as transforms_F
+
+from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
+from cosmos_framework.data.imaginaire.webdataset.augmentors.image.misc import obtain_augmentation_size, obtain_image_size
+
+
+class ReflectionPadding(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Pads inputs on the right and bottom up to the target size and records ``image_size``.
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict with padded images and ``image_size`` [target_h, target_w, orig_h, orig_w].
+        """
+
+        assert self.args is not None, "Please specify args in augmentation"
+        if self.output_keys is None:
+            self.output_keys = self.input_keys
+
+        # Obtain image and augmentation sizes
+        orig_w, orig_h = obtain_image_size(data_dict, self.input_keys)
+        target_size = obtain_augmentation_size(data_dict, self.args)
+
+        assert isinstance(target_size, (tuple, omegaconf.listconfig.ListConfig)), "Please specify target size as tuple"
+        target_w, target_h = target_size
+
+        target_w = int(target_w)
+        target_h = int(target_h)
+
+        # One-sided padding (bottom and right only, content stays at top-left)
+        padding_right = target_w - orig_w
+        padding_bottom = target_h - orig_h
+        padding_vals = [0, 0, padding_right, padding_bottom]
+
+        for inp_key, out_key in zip(self.input_keys, self.output_keys):
+            if max(padding_vals[0], padding_vals[2]) >= orig_w or max(padding_vals[1], padding_vals[3]) >= orig_h:
+                # In this case, we can't perform reflection padding. This is because padding values
+                # are at least as large as the image size. So, perform edge padding instead.
+                data_dict[out_key] = transforms_F.pad(data_dict[inp_key], padding_vals, padding_mode="edge")
+            else:
+                # Perform reflection padding
+                data_dict[out_key] = transforms_F.pad(data_dict[inp_key], padding_vals, padding_mode="reflect")
+
+            if out_key != inp_key:
+                del data_dict[inp_key]
+
+        data_dict["image_size"] = torch.tensor([target_h, target_w, orig_h, orig_w], dtype=torch.float)
+
+        return data_dict
diff --git a/cosmos_framework/data/imaginaire/webdataset/augmentors/image/resize.py b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/resize.py
new file mode 100644
index 0000000..82cdea9
--- /dev/null
+++ b/cosmos_framework/data/imaginaire/webdataset/augmentors/image/resize.py
@@ -0,0 +1,175 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+from typing import Optional
+
+import omegaconf
+import torchvision.transforms.functional as transforms_F
+
+from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
+from cosmos_framework.data.imaginaire.webdataset.augmentors.image.misc import obtain_augmentation_size, obtain_image_size
+
+
+class ResizeSmallestSide(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Performs resizing to smaller side: every input image is resized with one integer target size.
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict where images are resized (renamed input keys are removed)
+        """
+        # ``args`` is dict-like, so "interpolation" is read as a key: getattr() never sees dict keys.
+        if self.output_keys is None:
+            self.output_keys = self.input_keys
+        assert self.args is not None, "Please specify args in augmentations"
+
+        for inp_key, out_key in zip(self.input_keys, self.output_keys):
+            out_size = obtain_augmentation_size(data_dict, self.args)
+            assert isinstance(out_size, int), "Arg size in resize should be an integer"
+            data_dict[out_key] = transforms_F.resize(
+                data_dict[inp_key],
+                size=out_size,  # type: ignore
+                interpolation=self.args.get("interpolation", transforms_F.InterpolationMode.BICUBIC),
+                antialias=True,
+            )
+            if out_key != inp_key:
+                del data_dict[inp_key]
+        return data_dict
+
+
+class ResizeLargestSide(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Performs aspect-preserving resizing that scales the larger side to (about) the integer target size.
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict where images are resized (renamed input keys are removed)
+        """
+        # ``args`` is dict-like, so "interpolation" is read as a key: getattr() never sees dict keys.
+        if self.output_keys is None:
+            self.output_keys = self.input_keys
+        assert self.args is not None, "Please specify args in augmentations"
+
+        for inp_key, out_key in zip(self.input_keys, self.output_keys):
+            out_size = obtain_augmentation_size(data_dict, self.args)
+            assert isinstance(out_size, int), "Arg size in resize should be an integer"
+            orig_w, orig_h = obtain_image_size(data_dict, self.input_keys)
+
+            scaling_ratio = min(out_size / orig_w, out_size / orig_h)
+            target_size = [int(scaling_ratio * orig_h), int(scaling_ratio * orig_w)]  # (h, w); int() truncates
+
+            data_dict[out_key] = transforms_F.resize(
+                data_dict[inp_key],
+                size=target_size,
+                interpolation=self.args.get("interpolation", transforms_F.InterpolationMode.BICUBIC),
+                antialias=True,
+            )
+            if out_key != inp_key:
+                del data_dict[inp_key]
+        return data_dict
+
+
+class ResizeSmallestSideAspectPreserving(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Aspect-ratio preserving resize that fully covers the requested (width, height).
+        The per-axis factors (w_target / w_img) and (h_target / h_img) are computed and the
+        larger one is applied to both axes, so neither resized side falls below its target
+        (the result may exceed the target along one axis).
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict where images are resized
+        """
+
+        if self.output_keys is None:
+            self.output_keys = self.input_keys
+        assert self.args is not None, "Please specify args in augmentations"
+
+        img_size = obtain_augmentation_size(data_dict, self.args)
+        assert isinstance(img_size, (tuple, omegaconf.listconfig.ListConfig)), (
+            f"Arg size in resize should be a tuple, get {type(img_size)}, {img_size}"
+        )
+        want_w, want_h = img_size
+
+        orig_w, orig_h = obtain_image_size(data_dict, self.input_keys)
+        factor = max(want_w / orig_w, want_h / orig_h)
+        target_size = (int(factor * orig_h + 0.5), int(factor * orig_w + 0.5))
+
+        assert target_size[0] >= want_h and target_size[1] >= want_w, (
+            f"Resize error. orig {(orig_w, orig_h)} desire {img_size} compute {target_size}"
+        )
+
+        # Use the configured interpolation when the key is present; bicubic otherwise.
+        if "interpolation" in self.args:
+            interp_mode = self.args["interpolation"]
+        else:
+            interp_mode = transforms_F.InterpolationMode.BICUBIC
+        for src_key, dst_key in zip(self.input_keys, self.output_keys):
+            data_dict[dst_key] = transforms_F.resize(
+                data_dict[src_key],
+                size=target_size,  # type: ignore
+                interpolation=interp_mode,
+                antialias=True,
+            )
+            if dst_key != src_key:
+                del data_dict[src_key]
+        return data_dict
+
+
+class ResizeLargestSideAspectPreserving(Augmentor):
+    def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
+        super().__init__(input_keys, output_keys, args)
+
+    def __call__(self, data_dict: dict) -> dict:
+        r"""Performs aspect-ratio preserving resizing that fits inside the target (width, height).
+        First we compute (w_target / w_img) and (h_target / h_img), then scale both sides by
+        the smaller of these ratios, so neither resized side exceeds its target
+        (one side may end up shorter than its target).
+
+        Args:
+            data_dict (dict): Input data dict
+        Returns:
+            data_dict (dict): Output dict where images are resized
+        """
+        # ``args`` is dict-like, so "interpolation" is read as a key: getattr() never sees dict keys.
+        if self.output_keys is None:
+            self.output_keys = self.input_keys
+        assert self.args is not None, "Please specify args in augmentations"
+
+        img_size = obtain_augmentation_size(data_dict, self.args)
+        assert isinstance(img_size, (tuple, omegaconf.listconfig.ListConfig)), (
+            f"Arg size in resize should be a tuple, get {type(img_size)}, {img_size}"
+        )
+        img_w, img_h = img_size
+
+        orig_w, orig_h = obtain_image_size(data_dict, self.input_keys)
+        scaling_ratio = min((img_w / orig_w), (img_h / orig_h))
+        target_size = (int(scaling_ratio * orig_h + 0.5), int(scaling_ratio * orig_w + 0.5))
+
+        assert target_size[0] <= img_h and target_size[1] <= img_w, (
+            f"Resize error. orig {(orig_w, orig_h)} desire {img_size} compute {target_size}"
+        )
+
+        for inp_key, out_key in zip(self.input_keys, self.output_keys):
+            data_dict[out_key] = transforms_F.resize(
+                data_dict[inp_key],
+                size=target_size,  # type: ignore
+                interpolation=self.args.get("interpolation", transforms_F.InterpolationMode.BICUBIC),
+                antialias=True,
+            )
+
+            if out_key != inp_key:
+                del data_dict[inp_key]
+        return data_dict
diff --git a/cosmos_framework/data/vfm/action/__init__.py b/cosmos_framework/data/vfm/action/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/data/vfm/action/__init__.py
+++ b/cosmos_framework/data/vfm/action/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/data/vfm/action/action_processing.py b/cosmos_framework/data/vfm/action/action_processing.py
new file mode 100644
index 0000000..1a09170
--- /dev/null
+++ b/cosmos_framework/data/vfm/action/action_processing.py
@@ -0,0 +1,257 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""Shared Action processing records and normalization helpers."""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Literal, Protocol
+
+import numpy as np
+import torch
+
+from cosmos_framework.utils import log
+
+ActionNormalizationMethod = Literal["quantile", "quantile_rot", "meanstd", "minmax"]
+
+
+class ActionNormalizer(Protocol):
+    """Structural (duck-typed) interface for the tensor-level action normalizers used by ActionProcessor."""
+
+    def normalize_action(self, action: torch.Tensor) -> torch.Tensor:  # action: [...,D], returns [...,D]
+        """Map raw action values into model-space values (ActionProcessor rejects a changed last-dim width)."""
+        ...
+
+    def denormalize_action(self, action: torch.Tensor) -> torch.Tensor:  # action: [...,D], returns [...,D]
+        """Invert model-space action values back into raw action values."""
+        ...
+
+
+@dataclass(frozen=True)
+class ActionAffineNormalization:
+    """Per-channel affine action normalizer with already-resolved parameters.
+
+    The forward map is ``(raw - offset) / scale``; the inverse map is
+    ``normalized * scale + offset``.
+
+    A non-``None`` ``forward_clamp`` limits the forward output to ``(lo, hi)``,
+    which makes the forward map lossy.  If ``forward_clamp_mask`` is also set,
+    the clamp applies only to channels whose mask entry is ``True`` and the
+    remaining channels stay plain affine (e.g. mixed UMI normalizers).
+    """
+
+    offset: torch.Tensor
+    scale: torch.Tensor
+    forward_clamp: tuple[float, float] | None = None
+    forward_clamp_mask: torch.Tensor | None = None
+
+    def normalize_action(self, action: torch.Tensor) -> torch.Tensor:  # action: [...,D], returns [...,D]
+        """Apply ``(action - offset) / scale``, then the optional (masked) forward clamp."""
+        shift = self.offset.to(device=action.device, dtype=action.dtype)  # [D]
+        gain = self.scale.to(device=action.device, dtype=action.dtype)  # [D]
+        affine = (action - shift) / gain  # [...,D]
+        if self.forward_clamp is None:
+            return affine  # [...,D]
+        lower, upper = self.forward_clamp
+        bounded = affine.clamp(lower, upper)  # [...,D]
+        if self.forward_clamp_mask is None:
+            return bounded  # [...,D]
+        # Mixed case: keep the clamped value only where the channel mask is True.
+        channel_mask = self.forward_clamp_mask.to(device=action.device, dtype=torch.bool)  # [D]
+        return torch.where(channel_mask, bounded, affine)  # [...,D]
+
+    def denormalize_action(self, action: torch.Tensor) -> torch.Tensor:  # action: [...,D], returns [...,D]
+        """Map model-space values back to raw values via ``action * scale + offset``."""
+        gain = self.scale.to(device=action.device, dtype=action.dtype)  # [D]
+        shift = self.offset.to(device=action.device, dtype=action.dtype)  # [D]
+        return action * gain + shift  # [...,D]
+
+
+def load_action_stats(stats_path: str, stats_key: str = "global") -> dict[str, np.ndarray]:
+    """Load the ``stats_key`` block (or flat top-level stats) of a JSON stats file as float32 arrays."""
+    path = Path(stats_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Action normalization stats not found at {stats_path}.")
+    log.info(f"Loading action normalization stats from {stats_path}")
+    # JSON is UTF-8 by specification; do not depend on the platform's default text encoding.
+    raw = json.loads(path.read_text(encoding="utf-8"))
+    stat_keys = {"mean", "std", "min", "max", "q01", "q99"}
+    if stats_key in raw:
+        raw = raw[stats_key]
+        if not isinstance(raw, dict):
+            raise TypeError(f"Action normalization stats block {stats_key!r} in {stats_path} must be a dict.")
+    elif stats_key != "global" and not any(key in raw for key in stat_keys):
+        raise KeyError(f"Action normalization stats block {stats_key!r} not found in {stats_path}.")
+    return {k: np.array(v, dtype=np.float32) for k, v in raw.items() if k in stat_keys}
+
+
+def resolve_action_normalization(
+    method: ActionNormalizationMethod,
+    stats: dict[str, torch.Tensor],
+) -> ActionAffineNormalization:
+    """Turn named action statistics into the affine normalizer selected by ``method``."""
+    if method == "meanstd":
+        # Standardization: centre on the mean, divide by a std floored at 1e-8; no clamp.
+        mean = stats["mean"]  # [D]
+        return ActionAffineNormalization(offset=mean, scale=stats["std"].clamp(min=1e-8))
+
+    # Range-style methods map [lower, upper] onto [-1, 1] and clamp the forward output.
+    if method == "minmax":
+        bound_keys = ("min", "max")
+    elif method in ("quantile", "quantile_rot"):
+        bound_keys = ("q01", "q99")
+    else:
+        raise ValueError(f"Unknown normalization method: {method!r}")
+    lower = stats[bound_keys[0]]  # [D]
+    upper = stats[bound_keys[1]]  # [D]
+    midpoint = (upper + lower) / 2.0  # [D]
+    half_range = (upper - lower).clamp(min=1e-8) / 2.0  # [D]
+    return ActionAffineNormalization(
+        offset=midpoint,
+        scale=half_range,
+        forward_clamp=(-1.0, 1.0),
+    )
+
+
+def make_pose_action_scale_normalizer(
+    action_dim: int,
+    *,
+    translation_scale: float = 1.0,
+    rotation_scale: float = 1.0,
+) -> ActionAffineNormalization:
+    """Create the affine normalizer that applies per-block pose scaling.
+
+    The action layout is ``[translation(3), rotation(...)]``.  Normalizing
+    multiplies the three translation channels by ``translation_scale`` and any
+    remaining rotation channels by ``rotation_scale`` (stored as reciprocal
+    ``scale`` entries with zero ``offset``); denormalizing undoes both factors.
+    """
+    if action_dim < 3:
+        raise ValueError(f"Pose action_dim must be at least 3, got {action_dim}")
+    if translation_scale == 0:
+        raise ValueError("translation_scale must be non-zero")
+    if rotation_scale == 0:
+        raise ValueError("rotation_scale must be non-zero")
+
+    zero_offset = torch.zeros(action_dim, dtype=torch.float32)  # [D]
+    inv_translation = 1.0 / float(translation_scale)
+    scale_blocks = [torch.full((3,), inv_translation, dtype=torch.float32)]  # translation channels, [3]
+    if action_dim > 3:
+        scale_blocks.append(torch.full((action_dim - 3,), 1.0 / float(rotation_scale), dtype=torch.float32))
+    return ActionAffineNormalization(offset=zero_offset, scale=torch.cat(scale_blocks))  # scale: [D]
+
+
+@dataclass(frozen=True)
+class ActionProcessingRecord:
+    """Per-sample metadata needed to invert Action model-space preprocessing."""
+
+    raw_action_dim: int  # action width (last-dim size) before padding; used to strip padded channels
+    action_normalizer: ActionNormalizer | None  # normalizer applied during preprocessing; None if none was applied
+
+
+def pad_action_to_max_dim(
+    action: torch.Tensor, max_action_dim: int
+) -> torch.Tensor:  # action: [T,D], returns [T,D_model]
+    """Zero-pad the trailing (channel) axis of ``action`` up to ``max_action_dim``.
+
+    Args:
+        action: Action tensor of shape (T, D); D is its current action width.
+        max_action_dim: Channel count the result must have; a smaller value than D raises ValueError.
+
+    Returns:
+        ``action`` itself when D already equals ``max_action_dim``, else a new (T, max_action_dim) tensor.
+    """
+    missing_channels = max_action_dim - action.shape[-1]
+    if missing_channels < 0:
+        raise ValueError(f"Action dimension {action.shape[-1]} is greater than max_action_dim {max_action_dim}")
+    if missing_channels == 0:
+        return action  # [T,D_model]
+    filler = action.new_zeros((*action.shape[:-1], missing_channels))  # [T,D_pad], same dtype/device
+    return torch.cat((action, filler), dim=-1)  # [T,D_model]
+
+
+def make_batched_action_processing_fields(
+    record: ActionProcessingRecord,
+    batch_size: int,
+    *,
+    action_channel_masking: bool = True,
+) -> dict[str, list[torch.Tensor | ActionProcessingRecord | None]]:
+    """Replicate one record into per-sample batch lists so the action width and inverse record stay paired."""
+    width_entry = None  # stays None when channel masking is disabled
+    if action_channel_masking:
+        width_entry = torch.tensor(record.raw_action_dim, dtype=torch.long)  # []
+    # Every list slot references the same tensor / record object.
+    return {"raw_action_dim": [width_entry] * batch_size, "action_processing_record": [record] * batch_size}
+
+
+class ActionProcessor:
+    """Per-sample forward (normalize + pad) and inverse (unpad + denormalize) Action tensor processing."""
+
+    def __init__(self, max_action_dim: int, action_channel_masking: bool = True) -> None:
+        self.max_action_dim = int(max_action_dim)
+        self.action_channel_masking = bool(action_channel_masking)
+
+    def preprocess_action(
+        self,
+        data_dict: dict[str, Any],
+        action: torch.Tensor,
+        *,
+        action_normalizer: ActionNormalizer | None,
+    ) -> dict[str, Any]:
+        """Normalize and pad ``action``; return a shallow copy of ``data_dict`` carrying the results."""
+
+        raw_action_dim = int(action.shape[-1])
+        if action_normalizer is not None:
+            action = action_normalizer.normalize_action(action)  # [T,D]
+            normalized_dim = int(action.shape[-1])
+            if normalized_dim != raw_action_dim:
+                raise ValueError(
+                    f"Action normalizer changed action width from {raw_action_dim} to {int(action.shape[-1])}"
+                )
+
+        padded_action = pad_action_to_max_dim(action, self.max_action_dim)  # [T,D_model]
+        record = ActionProcessingRecord(raw_action_dim=raw_action_dim, action_normalizer=action_normalizer)
+        # ``raw_action_dim`` is exposed as a 0-d long tensor only when channel masking is enabled.
+        masking_dim = torch.tensor(record.raw_action_dim, dtype=torch.long) if self.action_channel_masking else None
+        return {
+            **data_dict,
+            "action": padded_action,
+            "raw_action_dim": masking_dim,
+            "action_processing_record": record,
+        }
+
+    @staticmethod
+    def _unpad_action(action: torch.Tensor, raw_action_dim: int) -> torch.Tensor:
+        """Keep the first ``raw_action_dim`` channels, discarding model-only padding."""
+        if raw_action_dim > action.shape[-1]:
+            raise ValueError(f"invalid raw_action_dim={raw_action_dim} for action with shape {tuple(action.shape)}")
+        return action[..., :raw_action_dim].contiguous()  # [...,D_raw]
+
+    @staticmethod
+    def postprocess_action(
+        action: torch.Tensor,
+        record: ActionProcessingRecord,
+    ) -> torch.Tensor:
+        """Strip padded channels, then undo normalization when the record carries a normalizer."""
+        unpadded = ActionProcessor._unpad_action(action, record.raw_action_dim)  # [...,D_raw]
+        if record.action_normalizer is None:
+            return unpadded  # [...,D_raw]
+        return record.action_normalizer.denormalize_action(unpadded)  # [...,D_raw]
+
+
+def get_action_processing_records(data_batch: dict[str, Any]) -> list[ActionProcessingRecord | None]:
+    """Return the batch's processing records as a list (empty when absent, one-element for a bare record)."""
+    records = data_batch.get("action_processing_record")
+    if records is None:
+        return []
+    if isinstance(records, ActionProcessingRecord):
+        return [records]
+    if not isinstance(records, list):
+        raise TypeError(f"Unexpected action_processing_record type: {type(records).__name__}")
+    for record in records:  # entries may be None or records; anything else is rejected
+        if not (record is None or isinstance(record, ActionProcessingRecord)):
+            raise TypeError(f"Unexpected action_processing_record entry type: {type(record).__name__}")
+    return records
diff --git a/cosmos_framework/data/vfm/action/domain_utils.py b/cosmos_framework/data/vfm/action/domain_utils.py
index 910cc39..6f433f7 100644
--- a/cosmos_framework/data/vfm/action/domain_utils.py
+++ b/cosmos_framework/data/vfm/action/domain_utils.py
@@ -14,9 +14,12 @@
     "bridge_orig_lerobot": 7,
     "droid_lerobot": 8,
     "robomind-franka": 8,  # Both Droid and RoboMIND-Franka are using robotiq and franka
+    "embodiment_b": 9,
     "robomind-franka-dual": 12,
     "robomind-ur": 13,
     "agibotworld": 15,
+    "embodiment_c_gripper": 15,
+    "embodiment_c_gripper_ext": 15,
     "fractal": 20,
 }
 
@@ -24,7 +27,6 @@
 EMBODIMENT_TO_RAW_ACTION_DIM: dict[str, int] = {
     "av": 9,
     "camera_pose": 9,
-    "hand_pose": 57,
     "pusht": 2,
     "umi": 10,
     "bridge_orig_lerobot": 10,
@@ -32,8 +34,16 @@
     "robomind-franka": 10,
     "robomind-franka-dual": 20,
     "robomind-ur": 10,
+    "embodiment_b": 30,
     "agibotworld": 29,
+    "embodiment_c_gripper": 29,
+    "embodiment_c_gripper_ext": 29,
     "fractal": 10,
+    # NOTE: ``libero`` (7/10/13 depending on ``rotation_space``) and ``hand_pose``
+    # (variable with ``keypoint_option`` and ``rotation_format``) are absent
+    # because their raw width is set per-dataset at construction time. Inference
+    # in inverse_dynamics/policy modes is not supported for these domains until
+    # canonical widths are added here.
 }
 
 
@@ -46,3 +56,20 @@ def get_domain_id(embodiment_type: str) -> int:
             f"Available embodiments: {sorted(EMBODIMENT_TO_DOMAIN_ID.keys())}"
         )
     return EMBODIMENT_TO_DOMAIN_ID[key]
+
+
+def get_action_dim(embodiment_type: str) -> int:
+    """Look up the raw action width registered for ``embodiment_type`` (lower-cased and stripped first)."""
+    normalized_name = embodiment_type.lower().strip()
+    if normalized_name in EMBODIMENT_TO_RAW_ACTION_DIM:
+        return EMBODIMENT_TO_RAW_ACTION_DIM[normalized_name]
+    raise KeyError(
+        f"Unknown embodiment type: {embodiment_type!r}. "
+        f"Available embodiments: {sorted(EMBODIMENT_TO_RAW_ACTION_DIM.keys())}"
+    )
+
+
+def is_valid_domain_name(embodiment_type: str) -> bool:
+    """Report whether ``embodiment_type`` (lower-cased, stripped) has a registered raw action width."""
+    registered_names = EMBODIMENT_TO_RAW_ACTION_DIM.keys()
+    return embodiment_type.lower().strip() in registered_names
diff --git a/cosmos_framework/data/vfm/action/json_formatter.py b/cosmos_framework/data/vfm/action/json_formatter.py
index b511e93..201a76e 100644
--- a/cosmos_framework/data/vfm/action/json_formatter.py
+++ b/cosmos_framework/data/vfm/action/json_formatter.py
@@ -7,9 +7,9 @@
 
 import torch
 
+from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.action.viewpoint_utils import DEFAULT_VIEWPOINT_TEMPLATES
 from cosmos_framework.data.vfm.utils import VIDEO_RES_SIZE_INFO
-from cosmos_framework.utils import log
 
 
 def _should_append_idle_frame_info(mode: object) -> bool:
diff --git a/cosmos_framework/data/vfm/action/pose_utils.py b/cosmos_framework/data/vfm/action/pose_utils.py
index 12ea51b..d6d26a4 100644
--- a/cosmos_framework/data/vfm/action/pose_utils.py
+++ b/cosmos_framework/data/vfm/action/pose_utils.py
@@ -298,24 +298,15 @@ def build_abs_pose_from_components(
 def _delta_transform_to_pose_vector(
     delta_T: np.ndarray,
     rotation_output_format: RotationConvention,
-    translation_scale: float = 1.0,
-    rotation_scale: float = 1.0,
 ) -> np.ndarray:
     """Encode a relative transform as an action vector.
 
     The shared action-vector layout is always ``[translation(3), rotation(...)]``.
-    The translation block is multiplied by ``translation_scale`` before concatenation,
-    and the rotation block is multiplied by ``rotation_scale``.
 
     Args:
         delta_T: Relative transform of shape ``(4, 4)``.
         rotation_output_format: Concrete convention used for the output rotation
             block.
-        translation_scale: Scalar multiplier applied to the translation block.
-        rotation_scale: Scalar multiplier applied to the rotation block. Used to
-            match the loss scale of the rotation block to the translation block.
-            The decoder must divide by the same factor before reconstructing the
-            rotation matrix.
 
     Returns:
         A ``float32`` action vector whose first three values are translation and
@@ -325,12 +316,11 @@ def _delta_transform_to_pose_vector(
     if delta_np.shape != (4, 4):
         raise ValueError(f"delta_T must have shape (4, 4), got {delta_np.shape}")
 
-    translation = delta_np[:3, 3] * translation_scale
+    translation = delta_np[:3, 3]
     rotation = np.asarray(
         convert_rotation(delta_np[:3, :3], input_format="matrix", output_format=rotation_output_format),
         dtype=np.float32,
     )
-    rotation = rotation * rotation_scale
     return np.concatenate([translation, rotation]).astype(np.float32)
 
 
@@ -344,19 +334,19 @@ def _pose_vector_to_delta_transform(
     """Decode an action vector back into a relative homogeneous transform.
 
     This is the inverse of `_delta_transform_to_pose_vector()` when the same
-    rotation convention and scale are used.
+    rotation convention is used. Scale arguments are provided for callers that
+    need to decode model-space pose actions before action-normalizer
+    denormalization has been applied.
 
     Args:
         pose_vector: Relative-pose action vector with layout
             ``[translation(3), rotation(...)]``.
         rotation_input_format: Concrete convention used by the rotation block.
-        translation_scale: Scalar used to undo the translation scaling applied during
-            encoding.
+        translation_scale: Scalar used to undo translation scaling in the input
+            vector.
         normalize_rotation: Whether to project the decoded rotation to a valid
             matrix before assembling the transform.
-        rotation_scale: Scalar used to undo the rotation scaling applied during
-            encoding. Must match the value used by
-            `_delta_transform_to_pose_vector()`.
+        rotation_scale: Scalar used to undo rotation scaling in the input vector.
 
     Returns:
         A relative homogeneous transform with shape ``(4, 4)`` and dtype
@@ -440,8 +430,6 @@ def pose_abs_to_rel(
     poses_abs: np.ndarray,
     rotation_format: RotationConvention = "rot9d",
     pose_convention: PoseConvention = "backward_framewise",
-    translation_scale: float = 1.0,
-    rotation_scale: float = 1.0,
 ) -> np.ndarray:
     """Convert an absolute-pose trajectory into relative-pose action vectors.
 
@@ -454,12 +442,6 @@ def pose_abs_to_rel(
         pose_convention: Pose convention:
             - ``backward_framewise``: ``delta_T = T_i^{-1} @ T_{i+1}``
             - ``backward_anchored``: ``delta_T = T_0^{-1} @ T_{i+1}``
-        translation_scale: Scalar multiplier applied to the translation block of each
-            encoded action vector.
-        rotation_scale: Scalar multiplier applied to the rotation block of each
-            encoded action vector. Use this to match the loss scale of rotation
-            and translation. `pose_rel_to_abs()` must be called with the same
-            value to invert the scaling.
 
     Returns:
         An array of shape ``(T - 1, D)`` where ``D = 3 + rotation_dim``.
@@ -481,8 +463,6 @@ def pose_abs_to_rel(
             _delta_transform_to_pose_vector(
                 delta_T,
                 rotation_output_format=rotation_format,
-                translation_scale=translation_scale,
-                rotation_scale=rotation_scale,
             )
         )
 
@@ -510,10 +490,12 @@ def pose_rel_to_abs(
             identity transform is used.
         normalize_rotation: Whether to project decoded rotations onto ``SO(3)``
             before composing them back into the trajectory.
-        translation_scale: Scalar used to undo the translation scaling applied during
-            `pose_abs_to_rel()`.
-        rotation_scale: Scalar used to undo the rotation scaling applied during
-            `pose_abs_to_rel()`. Must match the value passed there.
+        translation_scale: Scalar used to undo translation scaling in
+            ``poses_rel``. Prefer denormalizing with the dataset action
+            normalizer before calling this function.
+        rotation_scale: Scalar used to undo rotation scaling in ``poses_rel``.
+            Prefer denormalizing with the dataset action normalizer before
+            calling this function.
 
     Returns:
         Absolute poses with shape ``(T, 4, 4)`` where ``T = len(poses_rel) + 1``.
diff --git a/cosmos_framework/data/vfm/action/pose_utils_test.py b/cosmos_framework/data/vfm/action/pose_utils_test.py
index 93f9791..3cdc20b 100644
--- a/cosmos_framework/data/vfm/action/pose_utils_test.py
+++ b/cosmos_framework/data/vfm/action/pose_utils_test.py
@@ -196,14 +196,12 @@ def test_pose_abs_to_rel_roundtrips_through_pose_rel_to_abs(
         poses_abs,
         rotation_format=rotation_format,
         pose_convention=pose_convention,
-        translation_scale=2.5,
     )
     reconstructed = pose_rel_to_abs(
         poses_rel,
         rotation_format=rotation_format,
         pose_convention=pose_convention,
         initial_pose=poses_abs[0],
-        translation_scale=2.5,
     )
 
     np.testing.assert_allclose(reconstructed, poses_abs, atol=1e-5)
diff --git a/cosmos_framework/data/vfm/action/transforms.py b/cosmos_framework/data/vfm/action/transforms.py
index d17b141..3462f1e 100644
--- a/cosmos_framework/data/vfm/action/transforms.py
+++ b/cosmos_framework/data/vfm/action/transforms.py
@@ -19,6 +19,11 @@
 import torch
 import torchvision.transforms.functional as transforms_F
 
+from cosmos_framework.utils import log
+from cosmos_framework.data.vfm.action.action_processing import (
+    ActionNormalizer,
+    ActionProcessor,
+)
 from cosmos_framework.data.vfm.action.json_formatter import ActionPromptJsonFormatter
 from cosmos_framework.data.vfm.action.viewpoint_utils import ViewpointTextInfo
 from cosmos_framework.data.vfm.augmentors.duration_fps_text_timestamps import DurationFPSTextTimeStamps
@@ -27,7 +32,6 @@
 from cosmos_framework.data.vfm.augmentors.text_tokenizer import TextTokenizerTransform
 from cosmos_framework.data.vfm.sequence_packing import SequencePlan
 from cosmos_framework.data.vfm.utils import VIDEO_RES_SIZE_INFO
-from cosmos_framework.utils import log
 from cosmos_framework.utils.vfm.data_utils import get_vision_data_resolution
 
 
@@ -36,28 +40,6 @@ def _should_append_idle_frame_info(mode: object) -> bool:
     return mode != "inverse_dynamics"
 
 
-def pad_action_to_max_dim(action: torch.Tensor, max_action_dim: int) -> torch.Tensor:
-    """Pad action tensor to max_action_dim along the last dimension.
-
-    Args:
-        action: Action tensor of shape (T, D) where D is the current action dimension.
-        max_action_dim: Target action dimension to pad to.
-
-    Returns:
-        Padded action tensor of shape (T, max_action_dim).
-    """
-    if action.shape[-1] > max_action_dim:
-        raise ValueError(f"Action dimension {action.shape[-1]} is greater than max_action_dim {max_action_dim}")
-    elif action.shape[-1] == max_action_dim:
-        return action
-    else:
-        padding_size = max_action_dim - action.shape[-1]
-        zero_padding = torch.zeros(
-            *action.shape[:-1], padding_size, dtype=action.dtype, device=action.device
-        )  # [T,padding_size]
-        return torch.cat([action, zero_padding], dim=-1)  # [T,max_action_dim]
-
-
 def find_closest_target_size(h: int, w: int, resolution: str | int) -> tuple[int, int]:
     """Find the closest predefined target size for a given input resolution.
 
@@ -205,7 +187,7 @@ def reflection_pad_to_target(
 
 def remove_reflection_padding(
     tensor: torch.Tensor,
-    image_size: torch.Tensor,
+    image_size: torch.Tensor | list[torch.Tensor] | None,
 ) -> torch.Tensor:
     """Remove reflection padding added by :func:`reflection_pad_to_target`.
 
@@ -215,17 +197,30 @@ def remove_reflection_padding(
         tensor: Tensor whose last two dimensions are the padded spatial dims.
             Supports any leading dimensions, e.g. ``(C, T, H, W)`` or
             ``(C, H, W)``.
-        image_size: 1-D tensor of shape ``(4,)`` containing
-            ``[target_h, target_w, orig_h_resized, orig_w_resized]`` where
-            ``orig_h/w_resized`` is the original spatial size after
-            aspect-preserving resize (i.e. the content region before
-            padding) — the same convention stored by
-            :func:`reflection_pad_to_target` and VFM's
-            ``ReflectionPadding``.
+        image_size: Spatial metadata using the convention produced by
+            :func:`reflection_pad_to_target`. Accepted forms are ``None`` (no
+            crop), a tensor with shape ``(4,)`` or ``(1, 4)``, or a non-empty
+            list whose first element has one of those tensor shapes. The four
+            values are ``[target_h, target_w, orig_h_resized,
+            orig_w_resized]``, where ``orig_h/w_resized`` is the original
+            spatial size after aspect-preserving resize (i.e. the content
+            region before padding). This matches the convention stored by
+            :func:`reflection_pad_to_target` and VFM's ``ReflectionPadding``.
 
     Returns:
         Cropped tensor of shape ``(..., orig_h_resized, orig_w_resized)``.
     """
+    if image_size is None:
+        return tensor
+    if isinstance(image_size, list):
+        if not image_size:
+            raise ValueError("Expected at least one image_size entry")
+        image_size = image_size[0]  # [1,4] or [4]
+    if image_size.ndim == 2 and image_size.shape[0] == 1:
+        image_size = image_size[0]  # [4]
+    if image_size.ndim != 1:
+        raise ValueError(f"Expected image_size shape [4] or [1,4], got {tuple(image_size.shape)}")
+
     target_h = int(image_size[0].item())
     target_w = int(image_size[1].item())
     orig_h_resized = int(image_size[2].item())
@@ -309,7 +304,6 @@ def build_sequence_plan_from_mode(
     base_action_length = action_length - num_history_actions
     if mode == "forward_dynamics":
         condition_frame_indexes_action = list(range(action_length))
-
     # This currently assumes that the action length is the same as the video length - 1
     # and if action length is the same as the video length, then the first action is the conditioning action
     elif base_action_length == video_length - 1:
@@ -487,6 +481,10 @@ def __init__(
         self.video_temporal_downsample: int = video_temporal_downsample
         self.max_action_dim: int = max_action_dim
         self.action_channel_masking: bool = action_channel_masking
+        self.action_processor: ActionProcessor = ActionProcessor(
+            max_action_dim=max_action_dim,
+            action_channel_masking=action_channel_masking,
+        )
 
         # --- Spatial resize/padding stage (resolution supplied at call time) ---
         self.video_resize: VideoResize = VideoResize(
@@ -557,7 +555,12 @@ def __init__(
                 },
             )
 
-    def __call__(self, data_dict: dict, resolution: str | None) -> dict:
+    def __call__(
+        self,
+        data_dict: dict,
+        resolution: str | None,
+        action_normalizer: ActionNormalizer | None = None,
+    ) -> dict:
         """Apply the transform pipeline to a single data dictionary.
 
         Resolution is required at call time and is the only source of truth
@@ -576,7 +579,9 @@ def __call__(self, data_dict: dict, resolution: str | None) -> dict:
            sample is in inverse dynamics mode (if enabled).
         7. Tokenize caption text (if enabled).
         8. Build a ``SequencePlan`` from the ``"mode"`` key (if present).
-        9. If action is needed by the plan, pad ``"action"`` to ``max_action_dim``.
+        9. If action is needed by the plan, normalize real channels, pad
+           ``"action"`` to ``max_action_dim``, and attach
+           ``"action_processing_record"``.
         10. Otherwise, nullify ``"action"`` and ``"domain_id"`` (e.g. in
            ``"image2video"`` mode).
 
@@ -584,11 +589,14 @@ def __call__(self, data_dict: dict, resolution: str | None) -> dict:
             data_dict: A sample dictionary as returned by a Action dataset.
             resolution: Resolution tier key (e.g. ``"256"``, ``"480"``, ``"720"``)
                 for this sample. When ``None``, auto-detected from video dimensions.
+            action_normalizer: Optional source-provided action normalizer. When
+                present, only unpadded real action channels are normalized
+                before model-space channel padding.
 
         Returns:
             The same dictionary, mutated in-place with padded tensors,
-            ``image_size``, tokenized text IDs, and a
-            ``"sequence_plan"`` entry added.
+            ``image_size``, tokenized text IDs, a ``"sequence_plan"`` entry,
+            and action processing metadata added.
         """
         mode = data_dict.get("mode")
         assert mode is not None, "mode is required"
@@ -654,13 +662,17 @@ def __call__(self, data_dict: dict, resolution: str | None) -> dict:
 
         if sequence_plan.has_action:
             assert isinstance(action, torch.Tensor), "action tensor is required when sequence plan has action"
-            data_dict["raw_action_dim"] = torch.tensor(action.shape[1]) if self.action_channel_masking else None
-            data_dict["action"] = pad_action_to_max_dim(action, self.max_action_dim)
+            data_dict = self.action_processor.preprocess_action(
+                data_dict,
+                action,
+                action_normalizer=action_normalizer,
+            )
         else:
             # Nullify action-related fields when action is not needed so the
             # collate function can simply stack all non-None actions.
             data_dict["raw_action_dim"] = None
             data_dict["action"] = None
             data_dict["domain_id"] = None
+            data_dict["action_processing_record"] = None
 
         return data_dict
diff --git a/cosmos_framework/data/vfm/action/transforms_test.py b/cosmos_framework/data/vfm/action/transforms_test.py
index 5f024fb..759ec21 100644
--- a/cosmos_framework/data/vfm/action/transforms_test.py
+++ b/cosmos_framework/data/vfm/action/transforms_test.py
@@ -9,7 +9,11 @@
 import torch
 
 from cosmos_framework.data.vfm.action.json_formatter import ActionPromptJsonFormatter
-from cosmos_framework.data.vfm.action.transforms import ActionTransformPipeline
+from cosmos_framework.data.vfm.action.transforms import (
+    ActionTransformPipeline,
+    reflection_pad_to_target,
+    remove_reflection_padding,
+)
 from cosmos_framework.data.vfm.augmentors.duration_fps_text_timestamps import DurationFPSTextTimeStamps
 from cosmos_framework.data.vfm.augmentors.resolution_text_info import ResolutionTextInfo
 
@@ -60,6 +64,24 @@ def test_action_prompt_json_formatter_builds_requested_structure() -> None:
     assert "additional_view_description" not in result
 
 
+@pytest.mark.L0
+def test_video_padding_round_trips_to_unpadded_region() -> None:
+    video = torch.arange(3 * 2 * 4 * 5, dtype=torch.float32).reshape(3, 2, 4, 5)  # [C,T,H,W]
+    data_dict = {"video": video}
+
+    padded = reflection_pad_to_target(
+        data_dict,
+        keys=["video"],
+        keep_aspect_ratio=True,
+        target_w=8,
+        target_h=6,
+    )
+    round_tripped = remove_reflection_padding(padded["video"], padded["image_size"])  # [C,T,H,W]
+
+    assert padded["video"].shape == (3, 2, 6, 8)
+    torch.testing.assert_close(round_tripped, video)
+
+
 @pytest.mark.L0
 def test_action_prompt_json_formatter_drops_empty_fields() -> None:
     formatter = ActionPromptJsonFormatter()
diff --git a/cosmos_framework/data/vfm/augmentor_provider.py b/cosmos_framework/data/vfm/augmentor_provider.py
index 2ace65c..3e3d785 100644
--- a/cosmos_framework/data/vfm/augmentor_provider.py
+++ b/cosmos_framework/data/vfm/augmentor_provider.py
@@ -564,6 +564,9 @@ def get_video_augmentor_v3(
     conditioning_config = kwargs.get("conditioning_config", None)
     uniform_conditioning = kwargs.get("uniform_conditioning", False)
     temporal_compression_factor = kwargs.get("temporal_compression_factor", 4)
+    causal_vae = kwargs.get("causal_vae", True)
+    uniae_pad_frames = kwargs.get("uniae_pad_frames", None)
+    uniae_chunk_frames = kwargs.get("uniae_chunk_frames", None)
 
     print("Running video_basic_augmentor_v3...")
     augmentors = {
@@ -577,6 +580,10 @@ def get_video_augmentor_v3(
                 "min_stride": min_stride,
                 "seek_mode": "exact",  # Change to "approximate"?
                 "dataset_resolution_type": dataset_resolution_type,
+                "resolution": resolution,
+                "causal_vae": causal_vae,
+                "uniae_pad_frames": uniae_pad_frames,
+                "uniae_chunk_frames": uniae_chunk_frames,
             },
         ),
         "merge_datadict": L(merge_datadict.DataDictMerger)(
@@ -599,6 +606,9 @@ def get_video_augmentor_v3(
                 "conditioning_config": conditioning_config,
                 "uniform_conditioning": uniform_conditioning,
                 "temporal_compression_factor": temporal_compression_factor,
+                "resolution": resolution,
+                "uniae_pad_frames": uniae_pad_frames,
+                "uniae_chunk_frames": uniae_chunk_frames,
             },
         )
     augmentors.update(
@@ -670,7 +680,6 @@ def get_video_augmentor_v3(
     return augmentors
 
 
-
 # Use video_basic_augmentor_v3_json_caption instead.
 @augmentor_register("video_basic_augmentor_v3_with_audio")
 def get_video_augmentor_v3_with_audio(
@@ -829,6 +838,9 @@ def get_video_augmentor_v3_json_caption(
     conditioning_config = kwargs.get("conditioning_config", None)
     uniform_conditioning = kwargs.get("uniform_conditioning", False)
     temporal_compression_factor = kwargs.get("temporal_compression_factor", 4)
+    causal_vae = kwargs.get("causal_vae", True)
+    uniae_pad_frames = kwargs.get("uniae_pad_frames", None)
+    uniae_chunk_frames = kwargs.get("uniae_chunk_frames", None)
 
     print("Running video_augmentor_v3_json_caption...")
     augmentors = {
@@ -853,9 +865,13 @@ def get_video_augmentor_v3_json_caption(
                 "min_stride": min_stride,
                 "seek_mode": "exact",
                 "dataset_resolution_type": dataset_resolution_type,
+                "resolution": resolution,
                 "extract_audio": extract_audio,
                 "audio_sample_rate": audio_sample_rate,
                 "emit_placeholder_sound": not extract_audio,
+                "causal_vae": causal_vae,
+                "uniae_pad_frames": uniae_pad_frames,
+                "uniae_chunk_frames": uniae_chunk_frames,
             },
         ),
         "merge_datadict": L(merge_datadict.DataDictMerger)(
@@ -881,6 +897,9 @@ def get_video_augmentor_v3_json_caption(
                 "conditioning_config": conditioning_config,
                 "uniform_conditioning": uniform_conditioning,
                 "temporal_compression_factor": temporal_compression_factor,
+                "resolution": resolution,
+                "uniae_pad_frames": uniae_pad_frames,
+                "uniae_chunk_frames": uniae_chunk_frames,
             },
         )
     augmentors.update(
diff --git a/cosmos_framework/data/vfm/augmentors/__init__.py b/cosmos_framework/data/vfm/augmentors/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/data/vfm/augmentors/__init__.py
+++ b/cosmos_framework/data/vfm/augmentors/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py b/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py
index 302715c..34f6116 100644
--- a/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py
+++ b/cosmos_framework/data/vfm/augmentors/idle_frames_text_info.py
@@ -8,7 +8,7 @@
 frames (i.e. the relative-pose delta is close to identity and the gripper
 command does not change). The upstream dataset is responsible for populating
 ``data_dict[idle_frames_key]`` via
-:func:`projects.cosmos3.vfm.datasets.action.pose_utils.compute_idle_frames`.
+:func:`cosmos_framework.data.vfm.action.pose_utils.compute_idle_frames`.
 
 Per-field dropout (default 5%) is applied here, matching Pi0.7's approach of
 independently dropping each metadata component. This is complementary to the
diff --git a/cosmos_framework/data/vfm/augmentors/image_editing_transform.py b/cosmos_framework/data/vfm/augmentors/image_editing_transform.py
index fdaaa40..4af344b 100644
--- a/cosmos_framework/data/vfm/augmentors/image_editing_transform.py
+++ b/cosmos_framework/data/vfm/augmentors/image_editing_transform.py
@@ -18,6 +18,7 @@
 
 from __future__ import annotations
 
+import json
 import random
 
 import torch
@@ -26,13 +27,12 @@
 
 from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
 from cosmos_framework.utils import log
-from cosmos_framework.data.vfm.sequence_packing import SequencePlan
 
 
 class ExtractImageEditingConversation(Augmentor):
     """Extract and validate image editing conversation from standard annotation format.
 
-    This augmentor processes the cosmos-interleaved conversation format for image editing:
+    This augmentor processes cosmos-interleaved conversation data for image editing:
     - Validates that the conversation has exactly one round (user + assistant)
     - User message must contain at least one image and text instruction
     - Assistant message must contain exactly one image (the edited result)
@@ -42,6 +42,9 @@ class ExtractImageEditingConversation(Augmentor):
         - texts: Dict containing "content" with conversation data
         - mllm_media_list: Dict mapping image keys to PIL images (for understanding)
         - diffusion_media_list: Dict mapping image keys to PIL images (for diffusion/VAE)
+        - optional payload at ``instruction_key``: dict, JSON string, or JSON bytes holding a conversation
+          (e.g. text_json.content) and a rewrite (e.g. gemini_rewrite). If ``structured_instruction_field`` is
+          set, the rewrite's edit_type/structured_instruction form the prompt; the conversation only locates images.
 
     Output Format (added to data_dict):
         - source_image: PIL.Image (the input image for editing)
@@ -53,10 +56,124 @@ def __init__(
         self,
         input_keys: list | None = None,
         max_round: int = 1,
+        instruction_key: str = "texts",
+        conversation_key: str = "texts",
+        structured_instruction_field: str | None = None,
         args: dict | None = None,
     ) -> None:
         super().__init__(input_keys or [], None, args)
-        self.max_round = max_round
+        self.max_round: int = max_round
+        self.instruction_key: str = instruction_key
+        self.conversation_key: str = conversation_key
+        self.structured_instruction_field: str | None = structured_instruction_field
+
+    def _decode_json_text(self, text: str, payload_name: str, sample_key: str) -> dict | None:
+        try:
+            payload = json.loads(text)
+        except json.JSONDecodeError as e:
+            log.warning(
+                f"Error decoding {payload_name} JSON: {sample_key}, {str(e)}",
+                rank0_only=False,
+            )
+            return None
+
+        if not isinstance(payload, dict):
+            log.warning(
+                f"Decoded {payload_name} is not a dict: {sample_key}, got {type(payload)}",
+                rank0_only=False,
+            )
+            return None
+        return payload
+
+    def _decode_payload(self, payload: object, payload_name: str, sample_key: str) -> dict | None:
+        if isinstance(payload, dict):
+            return payload
+
+        if isinstance(payload, str):
+            return self._decode_json_text(payload, payload_name, sample_key)
+
+        if isinstance(payload, (bytes, bytearray)):
+            try:
+                text = bytes(payload).decode("utf-8")
+            except UnicodeDecodeError as e:
+                log.warning(
+                    f"Error decoding {payload_name} bytes as UTF-8: {sample_key}, {str(e)}",
+                    rank0_only=False,
+                )
+                return None
+            return self._decode_json_text(text, payload_name, sample_key)
+
+        log.warning(
+            f"Unsupported {payload_name} payload type: {sample_key}, got {type(payload)}",
+            rank0_only=False,
+        )
+        return None
+
+    def _get_instruction_payload(self, data_dict: dict, sample_key: str) -> dict | None:
+        payload = data_dict.get(self.instruction_key)
+        if payload is None:
+            log.warning(
+                f"{self.instruction_key} not found in data_dict: {sample_key}",
+                rank0_only=False,
+            )
+            return None
+        return self._decode_payload(payload, self.instruction_key, sample_key)
+
+    def _get_conversation_payload(
+        self,
+        data_dict: dict,
+        instruction_payload: dict,
+        sample_key: str,
+    ) -> dict | None:
+        if self.conversation_key == self.instruction_key:
+            return instruction_payload
+
+        if self.conversation_key in data_dict:
+            return self._decode_payload(data_dict[self.conversation_key], self.conversation_key, sample_key)
+
+        nested_payload = instruction_payload.get(self.conversation_key)
+        if nested_payload is None:
+            log.warning(
+                f"{self.conversation_key} not found in {self.instruction_key}: {sample_key}",
+                rank0_only=False,
+            )
+            return None
+        return self._decode_payload(nested_payload, f"{self.instruction_key}.{self.conversation_key}", sample_key)
+
+    def _get_structured_instruction(self, instruction_payload: dict, sample_key: str) -> str | None:
+        if self.structured_instruction_field is None:
+            return None
+
+        rewrite_error = instruction_payload.get("rewrite_error")
+        if rewrite_error is not None:
+            log.warning(
+                f"Structured instruction rewrite_error is non-null: {sample_key}, {rewrite_error}",
+                rank0_only=False,
+            )
+            return None
+
+        structured_payload = instruction_payload.get(self.structured_instruction_field)
+        if not isinstance(structured_payload, dict):
+            log.warning(
+                f"{self.structured_instruction_field} missing or not a dict: {sample_key}",
+                rank0_only=False,
+            )
+            return None
+
+        edit_type = structured_payload.get("edit_type")
+        structured_instruction = structured_payload.get("structured_instruction")
+        if not isinstance(edit_type, str) or not edit_type:
+            log.warning(f"Structured instruction edit_type missing: {sample_key}", rank0_only=False)
+            return None
+        if not isinstance(structured_instruction, dict) or not structured_instruction:
+            log.warning(f"Structured instruction body missing: {sample_key}", rank0_only=False)
+            return None
+
+        prompt = {
+            "edit_type": edit_type,
+            "structured_instruction": structured_instruction,
+        }
+        return json.dumps(prompt, ensure_ascii=False)
 
     def __call__(self, data_dict: dict) -> dict | None:
         """Extract image editing conversation.
@@ -69,23 +186,30 @@ def __call__(self, data_dict: dict) -> dict | None:
             or None if the data is invalid.
         """
         # Validate required keys
-        for required_key in ["mllm_media_list", "diffusion_media_list", "texts"]:
+        sample_key = data_dict.get("__key__", "unknown")
+        for required_key in ["diffusion_media_list", self.instruction_key]:
             if required_key not in data_dict:
                 log.warning(
-                    f"{required_key} not found in data_dict: {data_dict.get('__key__', 'unknown')}",
+                    f"{required_key} not found in data_dict: {sample_key}",
                     rank0_only=False,
                 )
                 return None
 
-        mllm_media_list = data_dict["mllm_media_list"]
         diffusion_media_list = data_dict["diffusion_media_list"]
+        instruction_payload = self._get_instruction_payload(data_dict, sample_key)
+        if instruction_payload is None:
+            return None
+        conversation_payload = self._get_conversation_payload(data_dict, instruction_payload, sample_key)
+        if conversation_payload is None:
+            return None
+        conversation_content_key = f"{self.conversation_key}.content"
 
         # Get conversation content
         try:
-            texts_content = data_dict["texts"].get("content")
+            texts_content = conversation_payload.get("content")
             if texts_content is None:
                 log.warning(
-                    f"texts.content is None: {data_dict.get('__key__', 'unknown')}",
+                    f"{conversation_content_key} is None: {sample_key}",
                     rank0_only=False,
                 )
                 return None
@@ -99,13 +223,13 @@ def __call__(self, data_dict: dict) -> dict | None:
                     selected_conversations = texts_content
             else:
                 log.warning(
-                    f"Unexpected texts.content format: {data_dict.get('__key__', 'unknown')}",
+                    f"Unexpected {conversation_content_key} format: {sample_key}",
                     rank0_only=False,
                 )
                 return None
         except Exception as e:
             log.warning(
-                f"Error accessing texts.content: {data_dict.get('__key__', 'unknown')}, {str(e)}",
+                f"Error accessing {conversation_content_key}: {sample_key}, {str(e)}",
                 rank0_only=False,
             )
             return None
@@ -115,15 +239,14 @@ def __call__(self, data_dict: dict) -> dict | None:
         if len(selected_conversations) > 2:
             log.warning(
                 f"Multi-round conversation found ({len(selected_conversations)} messages), "
-                f"keeping only first round: {data_dict.get('__key__', 'unknown')}",
+                f"keeping only first round: {sample_key}",
                 rank0_only=False,
             )
             selected_conversations = selected_conversations[:2]
 
         if len(selected_conversations) < 2:
             log.warning(
-                f"Expected at least 2 messages (user + assistant), got {len(selected_conversations)}: "
-                f"{data_dict.get('__key__', 'unknown')}",
+                f"Expected at least 2 messages (user + assistant), got {len(selected_conversations)}: {sample_key}",
                 rank0_only=False,
             )
             return None
@@ -134,14 +257,14 @@ def __call__(self, data_dict: dict) -> dict | None:
 
         if user_msg.get("role") != "user":
             log.warning(
-                f"First message role is not 'user': {data_dict.get('__key__', 'unknown')}",
+                f"First message role is not 'user': {sample_key}",
                 rank0_only=False,
             )
             return None
 
         if assistant_msg.get("role") != "assistant":
             log.warning(
-                f"Second message role is not 'assistant': {data_dict.get('__key__', 'unknown')}",
+                f"Second message role is not 'assistant': {sample_key}",
                 rank0_only=False,
             )
             return None
@@ -167,24 +290,29 @@ def __call__(self, data_dict: dict) -> dict | None:
 
         if user_image_key is None:
             log.warning(
-                f"No image found in user message: {data_dict.get('__key__', 'unknown')}",
+                f"No image found in user message: {sample_key}",
                 rank0_only=False,
             )
             return None
 
-        editing_instruction = " ".join(user_text_parts).strip()
-        if not editing_instruction:
-            log.warning(
-                f"No text instruction found in user message: {data_dict.get('__key__', 'unknown')}",
-                rank0_only=False,
-            )
-            return None
+        if self.structured_instruction_field is None:
+            editing_instruction = " ".join(user_text_parts).strip()
+            if not editing_instruction:
+                log.warning(
+                    f"No text instruction found in user message: {sample_key}",
+                    rank0_only=False,
+                )
+                return None
+        else:
+            editing_instruction = self._get_structured_instruction(instruction_payload, sample_key)
+            if editing_instruction is None:
+                return None
 
         # Extract assistant content: must have exactly one image
         assistant_content = assistant_msg.get("content", [])
         if isinstance(assistant_content, str):
             log.warning(
-                f"Assistant content is text-only (no image): {data_dict.get('__key__', 'unknown')}",
+                f"Assistant content is text-only (no image): {sample_key}",
                 rank0_only=False,
             )
             return None
@@ -199,7 +327,7 @@ def __call__(self, data_dict: dict) -> dict | None:
 
         if assistant_image_key is None:
             log.warning(
-                f"No image found in assistant message: {data_dict.get('__key__', 'unknown')}",
+                f"No image found in assistant message: {sample_key}",
                 rank0_only=False,
             )
             return None
@@ -208,7 +336,7 @@ def __call__(self, data_dict: dict) -> dict | None:
         for media_key in [user_image_key, assistant_image_key]:
             if media_key not in diffusion_media_list:
                 log.warning(
-                    f"Image {media_key} not found in diffusion_media_list: {data_dict.get('__key__', 'unknown')}",
+                    f"Image {media_key} not found in diffusion_media_list: {sample_key}",
                     rank0_only=False,
                 )
                 return None
@@ -225,7 +353,7 @@ def __call__(self, data_dict: dict) -> dict | None:
 
         if source_image is None or target_image is None:
             log.warning(
-                f"Source or target image is None: {data_dict.get('__key__', 'unknown')}",
+                f"Source or target image is None: {sample_key}",
                 rank0_only=False,
             )
             return None
@@ -329,6 +457,8 @@ def __call__(self, data_dict: dict) -> dict | None:
             # by GenerationDataClean.num_vision_items_per_sample (set in get_data_and_condition).
             # In pack_input_sequence, all items except the last are fully conditioned;
             # the last item uses condition_frame_indexes_vision ([] = fully generated).
+            from cosmos_framework.data.vfm.sequence_packing import SequencePlan
+
             data_dict["sequence_plan"] = SequencePlan(
                 has_text=True,
                 has_vision=True,
diff --git a/cosmos_framework/data/vfm/augmentors/image_editing_transform_test.py b/cosmos_framework/data/vfm/augmentors/image_editing_transform_test.py
new file mode 100644
index 0000000..849ad00
--- /dev/null
+++ b/cosmos_framework/data/vfm/augmentors/image_editing_transform_test.py
@@ -0,0 +1,159 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+import json
+
+import pytest
+from PIL import Image
+
+from cosmos_framework.data.vfm.augmentors.image_editing_transform import ExtractImageEditingConversation
+
+_STRUCTURED_KEY = "edit_schema_all_inputs_qwen3-vl-235b-a22b-instruct"
+
+
+def _conversation(instruction: str = "Make the cup red") -> list[list[dict]]:
+    return [
+        [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image", "image": "image_0"},
+                    {"type": "text", "text": instruction},
+                ],
+            },
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "image", "image": "image_1"},
+                ],
+            },
+        ]
+    ]
+
+
+def _media_data() -> tuple[Image.Image, Image.Image, dict[str, Image.Image]]:
+    source_image = Image.new("RGB", (16, 16), color="blue")
+    target_image = Image.new("RGB", (16, 16), color="red")
+    media_list = {
+        "image_0": source_image,
+        "image_1": target_image,
+    }
+    return source_image, target_image, media_list
+
+
+def _base_data_dict() -> tuple[dict, Image.Image, Image.Image]:
+    source_image, target_image, media_list = _media_data()
+    data_dict = {
+        "__key__": "sample_000001",
+        "mllm_media_list": media_list,
+        "diffusion_media_list": media_list,
+    }
+    return data_dict, source_image, target_image
+
+
+def _structured_payload() -> dict:
+    return {
+        "rewrite_error": None,
+        "gemini_rewrite": {
+            "edit_type": "adjust",
+            "structured_instruction": {
+                "target_object": "cup",
+                "attribute_type": "color",
+                "desired_value": "red",
+            },
+        },
+        "text_json": {
+            "content": _conversation("Original dense instruction"),
+        },
+        "original_instruction": "Original dense instruction",
+    }
+
+
+@pytest.mark.L0
+@pytest.mark.CPU
+def test_extract_image_editing_conversation_keeps_texts_behavior() -> None:
+    data_dict, source_image, target_image = _base_data_dict()
+    data_dict["texts"] = {"content": _conversation("Make the cup red")}
+
+    result = ExtractImageEditingConversation()(data_dict)
+
+    assert result is not None
+    assert result["source_image"] is source_image
+    assert result["target_image"] is target_image
+    assert result["editing_instruction"] == "Make the cup red"
+
+
+@pytest.mark.L0
+@pytest.mark.CPU
+def test_extract_structured_dict_payload_uses_gemini_rewrite() -> None:
+    data_dict, source_image, target_image = _base_data_dict()
+    payload = _structured_payload()
+    data_dict[_STRUCTURED_KEY] = payload
+
+    result = ExtractImageEditingConversation(
+        instruction_key=_STRUCTURED_KEY,
+        conversation_key="text_json",
+        structured_instruction_field="gemini_rewrite",
+    )(data_dict)
+
+    expected_instruction = json.dumps(
+        {
+            "edit_type": payload["gemini_rewrite"]["edit_type"],
+            "structured_instruction": payload["gemini_rewrite"]["structured_instruction"],
+        },
+        ensure_ascii=False,
+    )
+    assert result is not None
+    assert result["source_image"] is source_image
+    assert result["target_image"] is target_image
+    assert result["editing_instruction"] == expected_instruction
+
+
+@pytest.mark.L0
+@pytest.mark.CPU
+@pytest.mark.parametrize("encode_as_bytes", [False, True])
+def test_extract_structured_json_payload_uses_gemini_rewrite(encode_as_bytes: bool) -> None:
+    data_dict, _, _ = _base_data_dict()
+    payload = _structured_payload()
+    payload_json = json.dumps(payload, ensure_ascii=False)
+    data_dict[_STRUCTURED_KEY] = payload_json.encode("utf-8") if encode_as_bytes else payload_json
+
+    result = ExtractImageEditingConversation(
+        instruction_key=_STRUCTURED_KEY,
+        conversation_key="text_json",
+        structured_instruction_field="gemini_rewrite",
+    )(data_dict)
+
+    expected_instruction = json.dumps(
+        {
+            "edit_type": payload["gemini_rewrite"]["edit_type"],
+            "structured_instruction": payload["gemini_rewrite"]["structured_instruction"],
+        },
+        ensure_ascii=False,
+    )
+    assert result is not None
+    assert result["editing_instruction"] == expected_instruction
+
+
+@pytest.mark.L0
+@pytest.mark.CPU
+@pytest.mark.parametrize(
+    "payload_update",
+    [
+        {"gemini_rewrite": None},
+        {"rewrite_error": "failed to rewrite"},
+    ],
+)
+def test_extract_structured_invalid_payload_returns_none(payload_update: dict) -> None:
+    data_dict, _, _ = _base_data_dict()
+    payload = _structured_payload()
+    payload.update(payload_update)
+    data_dict[_STRUCTURED_KEY] = payload
+
+    result = ExtractImageEditingConversation(
+        instruction_key=_STRUCTURED_KEY,
+        conversation_key="text_json",
+        structured_instruction_field="gemini_rewrite",
+    )(data_dict)
+
+    assert result is None
diff --git a/cosmos_framework/data/vfm/augmentors/interleaved_video_parsing.py b/cosmos_framework/data/vfm/augmentors/interleaved_video_parsing.py
index aea759e..41e4e4c 100644
--- a/cosmos_framework/data/vfm/augmentors/interleaved_video_parsing.py
+++ b/cosmos_framework/data/vfm/augmentors/interleaved_video_parsing.py
@@ -414,7 +414,7 @@ def __call__(self, data_dict: dict) -> dict | None:
             )  # [C,T,H,W]
             num_multiplier = (end_frame - start_frame) / self.num_frames
 
-
+        # NOTE: matches legacy VideoParsing.__call__ output keys exactly. Do NOT add
         # variable-length fields like ``frame_indices`` here -- ``video_flatten_keys`` in
         # ``get_video_transfer_augmentor`` lists ``frame_indices``, and surfacing a
         # per-sample list there would crash ``custom_collate_fn`` (default_collate requires
diff --git a/cosmos_framework/data/vfm/augmentors/pkl_to_media.py b/cosmos_framework/data/vfm/augmentors/pkl_to_media.py
index 54d47b8..aa9eb21 100644
--- a/cosmos_framework/data/vfm/augmentors/pkl_to_media.py
+++ b/cosmos_framework/data/vfm/augmentors/pkl_to_media.py
@@ -27,14 +27,12 @@
 
 def token_to_pixels(token_length: int, patch_size: int = 14, temporal_patch_size: int = 2) -> int:
     """Convert token length to pixels based on patch size and temporal patch size."""
-
     merged_patch_size = patch_size * 2
     return token_length * merged_patch_size**2 * temporal_patch_size
 
 
 def pixels_to_token(pixels: int, patch_size: int = 14, temporal_patch_size: int = 2) -> int:
     """Convert pixels to token length based on patch size and temporal patch size."""
-
     merged_patch_size = patch_size * 2
     return pixels // merged_patch_size**2 // temporal_patch_size
 
diff --git a/cosmos_framework/data/vfm/augmentors/sequence_plan.py b/cosmos_framework/data/vfm/augmentors/sequence_plan.py
index 9f0500a..5a2202f 100644
--- a/cosmos_framework/data/vfm/augmentors/sequence_plan.py
+++ b/cosmos_framework/data/vfm/augmentors/sequence_plan.py
@@ -7,15 +7,22 @@
 - weighted dict (``conditioning_config``): explicit frame-count → probability pairs
 - uniform (``uniform_conditioning=True``): k ~ Uniform{0, T_latent-1}, where T_latent
   is computed from the actual video length using the VAE temporal compression factor
+  or UniAE chunking parameters when provided.
 """
 
 import random
+from collections.abc import Mapping
 from typing import Optional
 
 import torch
 
 from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
 from cosmos_framework.data.vfm.sequence_packing import SequencePlan
+from cosmos_framework.model.vfm.tokenizers.uniae.frame_math import (
+    get_uniae_chunk_frames,
+    get_uniae_latent_num_frames,
+    normalize_uniae_chunk_frames,
+)
 
 
 class SequencePlanAugmentor(Augmentor):
@@ -37,6 +44,11 @@ class SequencePlanAugmentor(Augmentor):
               must be provided.
             - "temporal_compression_factor" (int, default 4): VAE temporal compression
               factor used to convert pixel frame count N to T_latent = 1 + (N-1) // tcf.
+            - "uniae_chunk_frames" / "uniae_pad_frames" (optional): When provided,
+              use UniAE's non-causal first-frame plus padded-chunk latent count.
+              ``uniae_chunk_frames`` may be a scalar or a resolution-keyed mapping.
+            - "resolution" (str, optional): Target dataset resolution key. Preferred over
+              the current tensor shape when selecting a resolution-keyed UniAE chunk.
     """
 
     def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: Optional[dict] = None) -> None:
@@ -48,6 +60,9 @@ def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: O
         self.conditioning_config = args.get("conditioning_config")
         self.uniform_conditioning = args.get("uniform_conditioning", False)
         self.temporal_compression_factor = args.get("temporal_compression_factor", 4)
+        self.target_resolution_key = None if args.get("resolution") is None else str(args["resolution"])
+        self.uniae_pad_frames = None if args.get("uniae_pad_frames") is None else int(args["uniae_pad_frames"])
+        self.uniae_chunk_frames = self._normalize_uniae_chunk_frames(args.get("uniae_chunk_frames"))
 
         if self.conditioning_config is None and not self.uniform_conditioning:
             raise ValueError("args must provide 'conditioning_config' or set 'uniform_conditioning=True'")
@@ -70,6 +85,43 @@ def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: O
         else:
             self.normalized_config = {0: 1.0}
 
+    def _normalize_uniae_chunk_frames(
+        self, uniae_chunk_frames: int | Mapping[str, int] | None
+    ) -> int | dict[str, int] | None:
+        return normalize_uniae_chunk_frames(
+            uniae_chunk_frames,
+            pad_frames=self.uniae_pad_frames,
+            temporal_compression_factor=self.temporal_compression_factor,
+        )
+
+    def _get_uniae_chunk_frames(self, spatial_shape: tuple[int, int] | None = None) -> int:
+        assert self.uniae_chunk_frames is not None
+        return get_uniae_chunk_frames(
+            self.uniae_chunk_frames,
+            spatial_shape=spatial_shape,
+            target_resolution_key=self.target_resolution_key,
+        )
+
+    def _get_latent_frame_count(self, num_frames: int | None, spatial_shape: tuple[int, int] | None = None) -> int:
+        if num_frames is None:
+            return 1
+        if num_frames < 1:
+            raise ValueError(f"video must contain at least one frame, got {num_frames}")
+        if num_frames == 1:
+            return 1
+        if self.uniae_chunk_frames is None:
+            return 1 + (num_frames - 1) // self.temporal_compression_factor
+
+        assert self.uniae_pad_frames is not None
+        return get_uniae_latent_num_frames(
+            num_frames,
+            self.uniae_chunk_frames,
+            pad_frames=self.uniae_pad_frames,
+            temporal_compression_factor=self.temporal_compression_factor,
+            spatial_shape=spatial_shape,
+            target_resolution_key=self.target_resolution_key,
+        )
+
     def __call__(self, data_dict: dict) -> dict:
         """Create a SequencePlan with random conditional frames.
 
@@ -94,15 +146,17 @@ def __call__(self, data_dict: dict) -> dict:
 
         # Determine number of frames
         # Video should be a tensor with shape (C, T, H, W) by this point in the pipeline
+        spatial_shape = None
         if isinstance(video, torch.Tensor):
             assert video.ndim == 4, "video should be a tensor with shape (C, T, H, W)"
-            num_frames = video.shape[1]
+            num_frames = video.shape[1]  # video: [C,T,H,W]
+            spatial_shape = (video.shape[2], video.shape[3])
         else:
             # If video is not a tensor or dict, we can't determine the exact number
             # Use a conservative approach - will be limited by max available frames
             num_frames = None
 
-        T_latent = 1 + (num_frames - 1) // self.temporal_compression_factor if num_frames is not None else 1
+        T_latent = self._get_latent_frame_count(num_frames, spatial_shape)
 
         # Sample number of conditional frames
         if self.uniform_conditioning:
diff --git a/cosmos_framework/data/vfm/augmentors/text_transforms_for_image.py b/cosmos_framework/data/vfm/augmentors/text_transforms_for_image.py
index edaef11..d38fae4 100644
--- a/cosmos_framework/data/vfm/augmentors/text_transforms_for_image.py
+++ b/cosmos_framework/data/vfm/augmentors/text_transforms_for_image.py
@@ -8,13 +8,17 @@
 from cosmos_framework.data.imaginaire.webdataset.augmentors.v3_text_transforms import pad_and_resize
 from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
 from cosmos_framework.utils import log
-from cosmos_framework.data.vfm.data_sources.data_registration import _CAPTION_EMBEDDING_KEY_MAPPING_IMAGES
 
 # For the qwen captions, we have 3 variants: short, medium, long
 # In addition, for synthetic data, we create prompt embeddings as well.
 # There is quite a bit of entropy in the way prompt data is saved.
 # Captions are saved as "prompts", while the corresponding embeddings are saved as "original_prompt"
 # This part will be cleaned after synthetic data is cleaned to be in the same format as real data.
+_CAPTION_EMBEDDING_KEY_MAPPING_IMAGES = {
+    "ai_v3p1": "ai_v3p1",
+    "qwen2p5_7b_v4": "qwen2p5_7b_v4",
+    "prompts": "qwen2p5_7b_v4",
+}
 _AVAILABLE_QWEN_CAPTIONS = ["qwen2p5_7b_short", "qwen2p5_7b_medium", "qwen2p5_7b_long"]
 _AVAILABLE_QWEN3_30B_A3B_CAPTIONS = [
     "qwen3_30b_a3b_short",
diff --git a/cosmos_framework/data/vfm/augmentors/transfer_control_transform.py b/cosmos_framework/data/vfm/augmentors/transfer_control_transform.py
index 74fb523..a18e8fd 100644
--- a/cosmos_framework/data/vfm/augmentors/transfer_control_transform.py
+++ b/cosmos_framework/data/vfm/augmentors/transfer_control_transform.py
@@ -5,7 +5,7 @@
 Augmentors for transfer (control-conditioned) image and video generation in the cosmos3 VFM pipeline.
 
 Transfer training conditions the model on control signals (edge, blur, depth, or segmentation)
-to generate images or videos, aligned with cosmos_framework/transfer2. This module provides:
+to generate images or videos, aligned with cosmos/transfer2. This module provides:
 
 - **TransferToTrainingFormat**: Converts (control_input, target) into the joint dataloader format
   with SequencePlan (condition frame + generated frame), for both image and video outputs.
diff --git a/cosmos_framework/data/vfm/augmentors/video_parsing.py b/cosmos_framework/data/vfm/augmentors/video_parsing.py
index cfaa934..25a5580 100644
--- a/cosmos_framework/data/vfm/augmentors/video_parsing.py
+++ b/cosmos_framework/data/vfm/augmentors/video_parsing.py
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: OpenMDW-1.1
 
 import random
+from collections.abc import Mapping
 from typing import Optional
 
 import numpy as np
@@ -15,12 +16,18 @@
 from cosmos_framework.data.imaginaire.webdataset.augmentors.image.misc import obtain_augmentation_size
 from cosmos_framework.utils import log
 from cosmos_framework.data.vfm.utils import VIDEO_RES_SIZE_INFO
+from cosmos_framework.model.vfm.tokenizers.uniae.frame_math import (
+    align_uniae_num_video_frames,
+    get_uniae_chunk_frames,
+    normalize_uniae_chunk_frames,
+)
 
 # Map dataset_resolution_type to resolution tier key in VIDEO_RES_SIZE_INFO
 _DATASET_RESOLUTION_TIER: dict[str, str] = {"gt480p": "480", "gt720p": "720", "gt1080p": "1080"}
 
 _MIN_FPS = 10
 _MAX_FPS = 60
+_UNIAE_TEMPORAL_COMPRESSION_FACTOR = 4
 
 
 class VideoParsing(Augmentor):
@@ -345,7 +352,7 @@ def __call__(self, data_dict: dict) -> dict | None:
         video_info["video"] = video_frames
         video_info["num_multiplier"] = num_multiplier  # Store the frame skipping multiplier
 
-
+        # NOTE: Explaining the logic of conditioning FPS calculation:
         # 1. Our video parser stores the original video FPS of the video.
         # 2. We have multiple modes of frame selection -- consecutive chunk of frames or subsampled frames.
         # Here's what we do in each case:
@@ -434,6 +441,45 @@ def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: O
         self.dataset_resolution_type = args.get("dataset_resolution_type", "all")
         self.resolution_tier = _DATASET_RESOLUTION_TIER.get(self.dataset_resolution_type)
 
+        # VAE temporal alignment mode.
+        # causal_vae=True  (default): align to 1+4N (causal VAE, e.g. Wan 2.2)
+        # causal_vae=False: UniAE chunk/pad alignment if uniae_chunk_frames is set; otherwise 4N (non-causal VAE)
+        self.causal_vae = args.get("causal_vae", True)
+        self.target_resolution_key = None if args.get("resolution") is None else str(args["resolution"])
+        self.uniae_pad_frames = None if args.get("uniae_pad_frames") is None else int(args["uniae_pad_frames"])
+        self.uniae_chunk_frames = self._normalize_uniae_chunk_frames(args.get("uniae_chunk_frames", None))
+
+    def _normalize_uniae_chunk_frames(
+        self, uniae_chunk_frames: int | Mapping[str, int] | None
+    ) -> int | dict[str, int] | None:
+        return normalize_uniae_chunk_frames(
+            uniae_chunk_frames,
+            pad_frames=self.uniae_pad_frames,
+            temporal_compression_factor=_UNIAE_TEMPORAL_COMPRESSION_FACTOR,
+            missing_pad_message="uniae_pad_frames must be specified if uniae_chunk_frames is specified",
+            temporal_divisibility_name="UniAE temporal compression factor",
+        )
+
+    def _get_uniae_chunk_frames(self, spatial_shape: tuple[int, int] | None = None) -> int:
+        assert self.uniae_chunk_frames is not None
+        return get_uniae_chunk_frames(
+            self.uniae_chunk_frames,
+            spatial_shape=spatial_shape,
+            target_resolution_key=self.target_resolution_key,
+        )
+
+    def _align_uniae_num_video_frames(self, num_video_frames: int, spatial_shape: tuple[int, int] | None = None) -> int:
+        assert self.uniae_pad_frames is not None
+        assert self.uniae_chunk_frames is not None
+        return align_uniae_num_video_frames(
+            num_video_frames,
+            self.uniae_chunk_frames,
+            pad_frames=self.uniae_pad_frames,
+            temporal_compression_factor=_UNIAE_TEMPORAL_COMPRESSION_FACTOR,
+            spatial_shape=spatial_shape,
+            target_resolution_key=self.target_resolution_key,
+        )
+
     def _sample_stride_with_bias(self, max_stride: int, min_stride: int = 1) -> int:
         """Sample a stride from [min_stride, max_stride] with bias controlled by low_fps_bias.
 
@@ -520,7 +566,6 @@ def _validate_and_probe(self, video: Optional[bytes], meta_dict: dict, data_dict
         return True
 
     def __call__(self, data_dict: dict) -> dict | None:
-
         # if in future we need to train with batch size > 1, need to pad frames
         try:
             meta_dict = data_dict[self.meta_key]
@@ -553,8 +598,10 @@ def __call__(self, data_dict: dict) -> dict | None:
                 f"Resize error. orig {(orig_w, orig_h)} desire {img_size} compute {target_size}"
             )
             transform = [Resize(target_size)]
+            output_spatial_shape = target_size
         else:
             transform = None
+            output_spatial_shape = (meta_dict["height"], meta_dict["width"])
 
         # Adding try-expcept because some of the data is bad and video decoding call fail.
         try:
@@ -569,11 +616,33 @@ def __call__(self, data_dict: dict) -> dict | None:
             stride = self._sample_stride_with_bias(self.max_stride, self.min_stride)
             frame_indices = np.arange(0, num_video_frames, stride).tolist()
 
-            # VAE compress temporal by 4x, with 1 as condition
-            # thus the max_video_frames must be 1 + 4N
+            # Align frame count to the active VAE temporal contract.
+            # causal_vae=True: 1+4N (causal VAE, e.g. Wan 2.2).
+            # causal_vae=False: UniAE chunk/pad alignment if configured; otherwise 4N.
             num_video_frames = min(len(frame_indices), self.args.get("max_num_frames", 1000))
-            N = (num_video_frames - 1) // 4
-            num_video_frames = 1 + 4 * N
+            if self.causal_vae:
+                N = (num_video_frames - 1) // 4
+                num_video_frames = 1 + 4 * N
+            else:
+                # If this is UniAE, we need to align the frame count to the chunk size and padding.
+                if self.uniae_chunk_frames is not None:
+                    # T is valid when r = (T-1) % effective_chunk_frames satisfies:
+                    #   r == 0  (exact multiple of chunks)
+                    #   OR r % 4 == target_r  where target_r = (-2*pad_frames) % 4
+                    # Compute minimum trim delta in O(1):
+                    #   delta = steps to nearest r' <= r satisfying the condition.
+                    num_video_frames = self._align_uniae_num_video_frames(num_video_frames, output_spatial_shape)
+
+                    if num_video_frames == 0:
+                        log.warning(
+                            f"VideoParsingWithFullFrames: video too short for UniAE. "
+                            f"url: {data_dict['__url__']}, key: {data_dict['__key__']}",
+                            rank0_only=False,
+                        )
+                        return None
+                else:
+                    N = num_video_frames // 4
+                    num_video_frames = 4 * N
             frame_indices = frame_indices[0:num_video_frames]
 
             frame_batch = video_decoder.get_frames_at(frame_indices)
@@ -698,7 +767,6 @@ def __init__(self, input_keys: list, output_keys: Optional[list] = None, args: O
         super().__init__(input_keys, output_keys, args)
 
     def __call__(self, data_dict: dict) -> dict | None:
-
         # if in future we need to train with batch size > 1, need to pad frames
         try:
             meta_dict = data_dict[self.meta_key]
@@ -743,8 +811,10 @@ def __call__(self, data_dict: dict) -> dict | None:
                 f"Resize error. orig {(orig_w, orig_h)} desire {img_size} compute {target_size}"
             )
             transform = [Resize(target_size)]
+            output_spatial_shape = target_size
         else:
             transform = None
+            output_spatial_shape = (meta_dict["height"], meta_dict["width"])
 
         # Adding try-expcept because some of the data is bad and video decoding call fail.
         try:
@@ -772,11 +842,19 @@ def __call__(self, data_dict: dict) -> dict | None:
             stride = self._sample_stride_with_bias(self.max_stride, self.min_stride)
             frame_indices = np.arange(chunk_start_clamped, chunk_end_clamped, stride).tolist()
 
-            # VAE compress temporal by 4x, with 1 as condition
-            # thus the max_video_frames must be 1 + 4N
+            # Align frame count to the active VAE temporal contract.
+            # causal_vae=True: 1+4N (causal VAE, e.g. Wan 2.2).
+            # causal_vae=False: UniAE chunk/pad alignment if configured; otherwise 4N.
             num_video_frames = min(len(frame_indices), self.args.get("max_num_frames", 1000))
-            N = (num_video_frames - 1) // 4
-            num_video_frames = 1 + 4 * N
+            if self.causal_vae:
+                N = (num_video_frames - 1) // 4
+                num_video_frames = 1 + 4 * N
+            else:
+                if self.uniae_chunk_frames is not None:
+                    num_video_frames = self._align_uniae_num_video_frames(num_video_frames, output_spatial_shape)
+                else:
+                    N = num_video_frames // 4
+                    num_video_frames = 4 * N
             if num_video_frames < 1:
                 log.warning(
                     f"VideoParsingChunkedFrames: chunk too short for stride. "
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/__init__.py b/cosmos_framework/data/vfm/augmentors/vlm/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/data/vfm/augmentors/vlm/__init__.py
+++ b/cosmos_framework/data/vfm/augmentors/vlm/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/nvlm_data_unify.py b/cosmos_framework/data/vfm/augmentors/vlm/nvlm_data_unify.py
deleted file mode 100644
index eb029eb..0000000
--- a/cosmos_framework/data/vfm/augmentors/vlm/nvlm_data_unify.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-"""Visual-Text Transformations or Augmentations."""
-
-import io
-from typing import Dict, Optional
-
-from PIL import Image
-
-from cosmos_framework.data.imaginaire.webdataset.augmentors.augmentor import Augmentor
-from cosmos_framework.utils import log
-from cosmos_framework.data.vfm.augmentors.vlm.nvlm_sample_loaders_and_part_filters import (
-    get_data_class,
-    get_part_filter,
-    get_sample_loader,
-)
-
-
-class NVLMImageDataUnify(Augmentor):
-    """
-    This augmentor is used to unify the data format of the nvlm data.
-    It will take the raw nvlm data tar and convert it to a dictionary with the following keys:
-    {
-        "__url__": str,
-        "__key__": str,
-        "data_class": str,
-        "images": List[PIL.Image.Image],
-        "text": str,
-        "words_boxes": Optional[List[List[int]]],
-        "words_text": Optional[List[str]],
-        "similarity_matrix": Optional[List[List[float]]],
-    }
-    """
-
-    def __init__(
-        self,
-        input_keys: list = ["raw_nvlm"],
-        output_keys: Optional[list] = [],
-        args: Optional[dict] = None,
-        data_path_prefix: list[str] = [
-            "cosmos_framework/ar/v2/nvlm/",
-        ],  # prefix of the data in s3
-    ) -> None:
-        super().__init__(input_keys, output_keys, args)
-        self.data_path_prefix = data_path_prefix
-
-    def convert_image(self, img):
-        try:
-            if isinstance(img, bytes):
-                img = Image.open(io.BytesIO(img)).convert("RGB")
-            elif isinstance(img, Image.Image):
-                img = img.convert("RGB")
-                pass  # Image is already in PIL format
-            elif isinstance(img, list):
-                for i in range(len(img)):
-                    img[i], success = self.convert_image(img[i])
-                    if not success:
-                        return Image.new("RGB", (256, 256), (0, 0, 0)), False
-                return img, True
-            else:
-                raise ValueError(f"Invalid image type: {type(img)}")
-
-            success = True
-        except Exception as e:
-            log.warning(f"Error processing image: {e}. Creating an empty black image.", rank0_only=False)
-            img = Image.new("RGB", (256, 256), (0, 0, 0))  # Creates a 256x256 black image
-            success = False
-        return img, success
-
-    def __call__(self, data_dict: Dict) -> Dict:
-        url = data_dict["__url__"]
-        data_path = "/".join(url.path.split("/")[:-1])  # remove the last part of the path
-        sample_loader = get_sample_loader(data_path)
-        part_filter = get_part_filter(data_path)
-        data_class = get_data_class(data_path)
-        assert sample_loader is not None and part_filter is not None and data_class is not None, (
-            f"sample_loader({sample_loader}) or part_filter({part_filter}) or data_class({data_class}) is not found for {data_path}"
-        )
-
-        raw = {"__url__": url, "__key__": data_dict["__key__"]}
-        output = {"__url__": url, "__key__": data_dict["__key__"]}
-        for k, v in data_dict.items():
-            ext = k.split(".")[-1]
-            if part_filter(ext):
-                raw[ext] = v
-        try:
-            output_converted = sample_loader(raw)
-            # Here output_converted will be a dictionary with the following keys:
-            # {
-            #   "__key__": str,
-            #   "image": PIL.Image.Image,
-            #   "images": List[PIL.Image.Image],
-            #   "text": str,
-            #   "words_boxes": Optional
-            #   "words_text": Optional
-            #   "similarity_matrix": Optional
-            # }
-        except Exception as e:
-            log.warning(
-                f"Error in sample_loader: {e}, sample_loader: {sample_loader}, data_path: {data_path}, raw: {raw.keys()}, original_data_dict: {data_dict.keys()}, __url__: {url}, __key__: {data_dict['__key__']}"
-            )
-            return None
-
-        output.update(output_converted)
-        if "image" not in output_converted and "images" not in output_converted:
-            success = False
-            log.warning(f"image not found in {output_converted.keys()}")
-        if "image" in output_converted:  # Single image case
-            img, success = self.convert_image(output["image"])
-            output["images"] = [img]  # What should be the format for the iamges
-        elif "images" in output_converted:
-            output["images"] = output_converted["images"]
-            output["images"], success = self.convert_image(output["images"])
-        if not success:
-            log.warning(f"image conversion failed for {data_dict['__key__']} url: {url} | Skip this data")
-            return None
-        output["data_class"] = data_class
-
-        return output
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/nvlm_sample_loaders_and_part_filters.py b/cosmos_framework/data/vfm/augmentors/vlm/nvlm_sample_loaders_and_part_filters.py
deleted file mode 100644
index fabe0c3..0000000
--- a/cosmos_framework/data/vfm/augmentors/vlm/nvlm_sample_loaders_and_part_filters.py
+++ /dev/null
@@ -1,2815 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-# Combined Sample Loaders
-# Auto-generated script combining all sample_loader.py files (Dont edit this file! Edit the projects/cosmos/reasoning/v1/scripts/create_sample_loader_and_part_filter_file.py instead)
-
-import io
-
-import torch
-from PIL import Image
-
-from cosmos_framework.utils import log
-from cosmos_framework.data.vfm.data_sources.vlm.nvlm import data_path_mapping
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-# import torch
-
-
-def sample_loader_0(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-
-    if "text" in raw:
-        caption = raw["text"]
-    else:
-        caption = raw["json"]["caption"]
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        caption=caption,  # expected type: str
-    )
-
-
-def part_filter_0(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg", "text")
-
-
-# This file was automatically generated by `energon prepare`.
-
-
-
-def sample_loader_1(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_1(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `energon prepare`.
-
-
-
-def sample_loader_2(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_2(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `energon prepare`.
-
-
-
-def sample_loader_3(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_3(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `energon prepare`.
-
-
-
-def sample_loader_4(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_4(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_5(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_5(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_6(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    key = raw["__key__"]
-    if "docvqa" in key:
-        context = json_item["question"]
-        answers = json_item["answers"]
-        image = raw["jpg"]
-        answer_weights = json_item["answer_weights"]
-    elif "textvqa" in key or "lrv_instruct" in key:
-        context = json_item["question"]
-        answers = json_item["answer"]
-        image = raw["jpg"]
-        answer_weights = None
-    elif "stvqa" in key:
-        context = json_item["question"]
-        answers = json_item["answers"]
-        image = raw["jpg"]
-        answer_weights = [1.0] * len(json_item["answers"])
-    elif "chartqa" in key:
-        context = json_item["query"]
-        answers = json_item["label"]
-        image = raw["png"]
-        answer_weights = None
-    elif "screenqa" in key:
-        image = raw["jpg"]
-        context = json_item["question"]
-        answers = json_item["ground_truth"]
-        answer_weights = [1.0] * len(json_item["ground_truth"])
-    elif "HME100K" in key:
-        image = raw["jpg"]
-        context = "Please write out the expression of the formula in the image using LaTeX format."
-        answers = json_item["latex_formula"]
-        answer_weights = None
-    else:  # scale, textbook
-        image = raw["jpg"]
-        context = json_item["question"]
-        answers = json_item["answer"]
-        answer_weights = None
-
-    return dict(
-        __key__=key,
-        image=image,
-        context=context,
-        answers=answers,
-        answer_weights=answer_weights,
-    )
-
-
-def part_filter_6(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg", "png")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_7(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=j["question_string"],  # expected type: str
-        answers=j["answer"],  # expected type: typing.Union[typing.List[str], NoneType], default: None
-        answer_weights=None,  # expected type: typing.Union[torch.Tensor, NoneType], default: None
-    )
-
-
-def part_filter_7(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_8(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=j["question"],  # expected type: str
-        answers=str(j["answer"]),  # expected type: typing.Optional[typing.List[str]], default: None
-        answer_weights=None,  # expected type: typing.Optional[torch.Tensor], default: None
-    )
-
-
-def part_filter_8(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_9(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=j["question"].strip(),  # expected type: str
-        answers=j["gt_answer"].strip(),  # expected type: typing.Union[typing.List[str], NoneType], default: None
-        answer_weights=None,  # expected type: typing.Union[torch.Tensor, NoneType], default: None
-    )
-
-
-def part_filter_9(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_10(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-    return dict(
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=j["question"],  # expected type: str
-        answers=j["answer"],  # expected type: typing.Optional[typing.List[str]], default: None
-        answer_weights=None,  # expected type: typing.Optional[torch.Tensor], default: None
-    )
-
-
-def part_filter_10(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_11(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        context=j["question"],
-        answers=j["answer"],
-        answer_weights=None,
-    )
-
-
-def part_filter_11(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_12(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=j["question"],  # expected type: str
-        answers=j["answer"],  # expected type: typing.Optional[typing.List[str]], default: None
-        answer_weights=None,  # expected type: typing.Optional[torch.Tensor], default: None
-    )
-
-
-def part_filter_12(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_13(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    key = raw["__key__"]
-
-    if "geoqa_plus" in key or "tqa" in key:
-        return dict(
-            __key__=raw["__key__"],
-            image=raw["jpg"],
-            context=json_item["question"],
-            choices=json_item["choices"],
-            correct_choice_idx=json_item["correct_answer_index"],
-        )
-    elif "geometry3k" in key:
-        return dict(
-            __key__=raw["__key__"],
-            image=raw["jpg"],
-            context=json_item["question"],
-            choices=json_item["choices"],
-            correct_choice_idx=ord(json_item["answer"].lower()) - 97,
-        )
-    else:  # science_qa, ai2d
-        image_key = "png" if "png" in raw else "jpg"
-        if image_key not in raw:
-            log.warning(f"Image key {image_key} not found in with raw keys: {raw.keys()}")
-        return dict(
-            __key__=raw["__key__"],  # science_qa_sample_{idx}
-            image=raw[image_key],  # expected type: torch.Tensor
-            context=json_item["question"],  # expected type: str
-            choices=json_item["choices"],  # expected type: typing.Union[typing.List[str], NoneType], default: None
-            correct_choice_idx=json_item["correct_choice_index"],
-        )
-
-
-def part_filter_13(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "png", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_14(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],  # arxiv_qa_sample_{idx}
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=json_item["question"],  # expected type: str
-        choices=json_item["options"],  # expected type: typing.Union[typing.List[str], NoneType], default: None
-        correct_choice_idx=json_item["correct_choice_index"],
-    )
-
-
-def part_filter_14(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_15(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    if json_item["question_type"] == "multi_choice":
-        correct_choice_idx = json_item["choices"].index(json_item["answer"])
-        return dict(
-            __key__=raw["__key__"],
-            image=raw["jpg"],
-            context=json_item["question"],
-            choices=json_item["choices"],
-            correct_choice_idx=correct_choice_idx,
-        )
-    else:
-        # A temporary hack for non multi-choice samples.
-        # If correct_choice_idx=-1, we should route it to the VQAWebdataset dataloading method.
-        # (74.7% free-text questions, 25.3% multi-choice questions)
-        return dict(
-            __key__=raw["__key__"],
-            image=raw["jpg"],
-            context=json_item["question"],
-            choices=[json_item["answer"]],
-            correct_choice_idx=-1,
-        )
-
-
-def part_filter_15(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_16(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]), images=[raw["jpg"]], texts=j["conversations"], similarity_matrix=None
-    )
-
-
-def part_filter_16(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_17(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_17(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_18(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_18(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_19(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,  # expected type: torch.Tensor
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_19(part: str) -> bool:
-
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_20(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]), images=[raw["jpg"]], texts=j["conversations"], similarity_matrix=None
-    )
-
-
-def part_filter_20(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_21(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]), images=[raw["png"]], texts=j["conversations"], similarity_matrix=None
-    )
-
-
-def part_filter_21(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "png")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_22(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,  # expected type: torch.Tensor
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_22(part: str) -> bool:
-
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_23(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,  # expected type: torch.Tensor
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_23(part: str) -> bool:
-
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_24(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_24(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_25(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]), images=[raw["jpg"]], texts=j["conversations"], similarity_matrix=None
-    )
-
-
-def part_filter_25(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_26(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]), images=[raw["jpg"]], texts=j["conversations"], similarity_matrix=None
-    )
-
-
-def part_filter_26(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_27(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_27(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_28(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_28(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_29(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_29(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_30(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_30(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_31(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_31(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_32(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_32(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_33(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_33(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_34(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_34(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_35(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_35(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_36(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_36(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_37(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_37(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_38(raw: dict) -> dict:
-    j = raw["json"]
-
-    if "ReCTs" in raw["__key__"]:
-        return dict(
-            __key__=raw["__key__"],
-            image=raw["jpg"],
-            text="",
-            words_boxes=j["quads_1k_normalized"],
-            words_text=j["texts"],
-        )
-    else:  # coco-text-multi, textocr-multi
-        return dict(
-            __key__=raw["__key__"],
-            image=raw["jpg"],
-            text="",
-            words_boxes=j["bboxes_1k_normalized"],
-            words_text=j["texts"],
-        )
-
-
-def part_filter_38(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_39(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-    return dict(
-        image=raw["jpg"],  # expected type: torch.Tensor
-        text=" ".join(j["lines"]["text"]),  # expected type: str
-    )
-
-
-def part_filter_39(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_40(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_40(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_41(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_41(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_42(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_42(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_43(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_43(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_44(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_44(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_45(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_45(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_46(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_46(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_47(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_47(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_48(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_48(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_49(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_49(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_50(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_50(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_51(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    for i, turn in enumerate(json_item["conversations"]):
-        if i > 0 and turn["from"] == "human" and "<image>" in turn["value"]:
-            turn["value"] = turn["value"].replace("<image>\n", "")
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_51(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_52(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_52(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_53(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    # for i, turn in enumerate(json_item['conversations']):
-    #     if i > 0 and turn['from'] == 'human' and '<image>' in turn['value']:
-    #         turn['value'] = turn['value'].replace("<image>\n", "")
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_53(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_54(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_54(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_55(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_55(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_56(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_56(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_57(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_57(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_58(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_58(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_59(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_59(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_60(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_60(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_61(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_61(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_62(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_62(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_63(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_63(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_64(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_64(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_65(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_65(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_66(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_66(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_67(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_67(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("img", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_68(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_68(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_69(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_69(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_70(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_70(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_71(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_71(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_72(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-    images = [raw["jpg"]]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_72(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_73(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_73(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "img")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_74(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    img = Image.open(io.BytesIO(raw["img"]))
-    images = [img]
-
-    return dict(
-        __key__="llava-{}".format(raw["__key__"]),
-        images=images,
-        texts=json_item["conversations"],
-        similarity_matrix=None,
-    )
-
-
-def part_filter_74(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "img")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_75(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-
-    if "text" in raw:
-        caption = raw["text"]
-    else:
-        caption = raw["json"]["caption"]
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        caption=caption,  # expected type: str
-    )
-
-
-def part_filter_75(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg", "text")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_76(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-
-    if "text" in raw:
-        caption = raw["text"]
-    else:
-        caption = raw["json"]["caption"]
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        caption=caption,  # expected type: str
-    )
-
-
-def part_filter_76(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg", "text")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_77(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    total = len(json_item["conversations"]) // 2
-    idx = random.randrange(total)  # noqa: F821
-    human = json_item["conversations"][idx * 2]
-    out = json_item["conversations"][idx * 2 + 1]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        context=human["value"].replace("<image>\n", ""),
-        answers=out["value"],
-        answer_weights=None,
-    )
-
-
-def part_filter_77(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_78(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    json_item = raw["json"]
-
-    total = len(json_item["conversations"]) // 2
-    idx = random.randrange(total)  # noqa: F821
-    human = json_item["conversations"][idx * 2]
-    out = json_item["conversations"][idx * 2 + 1]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        context=human["value"].replace("<image>\n", ""),
-        answers=out["value"],
-        answer_weights=None,
-    )
-
-
-def part_filter_78(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-
-
-def sample_loader_79(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-    if "answer" in j:
-        answers = [a[0] for a in j["answer"][0]]
-        answer_weights = torch.Tensor([float(a[1]) for a in j["answer"][0]])
-    else:
-        answers = None
-        answer_weights = None
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=j["question"],  # expected type: str
-        answers=answers,  # expected type: typing.List[str]
-        answer_weights=answer_weights,  # expected type: typing.Union[torch.Tensor, NoneType]
-    )
-
-
-def part_filter_79(part: str) -> bool:
-    # Filter for parts required by the sample_loader
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_80(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=raw["json"]["question"],  # expected type: str
-        answers=raw["json"]["answer"],  # expected type: typing.Union[typing.List[str], NoneType], default: None
-        answer_weights=None,  # expected type: typing.Union[torch.Tensor, NoneType], default: None
-    )
-
-
-def part_filter_80(part: str) -> bool:
-
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_81(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-    return dict(
-        image=raw["jpg"],  # expected type: torch.Tensor
-        text=" ".join(j["lines"]["text"]),  # expected type: str
-    )
-
-
-def part_filter_81(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_82(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        text=j["text"],
-        words_boxes=j["bbox_1k_normalized"],
-    )
-
-
-def part_filter_82(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_83(raw: dict) -> dict:
-    j = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        text=j["text"],
-        words_boxes=j["bbox_1k_normalized"],
-    )
-
-
-def part_filter_83(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_84(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        text=j["text"],
-        words_boxes=j["bbox_1k_normalized"],
-    )
-
-
-def part_filter_84(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_85(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(__key__=raw["__key__"], image=raw["jpg"], text=j["text"], words_boxes=j["quad_1k_normalized"])
-
-
-def part_filter_85(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_86(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        text=j["text"],
-        words_boxes=j["bbox_1k_normalized"],
-    )
-
-
-def part_filter_86(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_87(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-
-    quad = j["quad"]
-    quad = [val for point in quad for val in point]
-
-    return dict(
-        image=raw["jpg"],  # expected type: torch.Tensor
-        text=j["text"],  # expected type: str
-        words_boxes=quad,  # expected type: typing.Optional[torch.Tensor], default: None
-    )
-
-
-def part_filter_87(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_88(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        text="",
-        words_boxes=j["bboxes_1k_normalized"],
-        words_text=j["texts"],
-    )
-
-
-def part_filter_88(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_89(raw: dict) -> dict:
-    j = raw["json"]
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        text="",
-        words_boxes=j["bboxes_1k_normalized"],
-        words_text=j["texts"],
-    )
-
-
-def part_filter_89(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("jpg", "json")
-
-
-# This file was automatically generated by `nvgpt4 data prepare`.
-
-
-
-def sample_loader_90(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],
-        text="",
-        words_boxes=j["quads_1k_normalized"],
-        words_text=j["texts"],
-    )
-
-
-def part_filter_90(part: str) -> bool:
-
-    # E.g. if your dataset contains jpeg, txt and json, but you won't use json,
-    # remove it from the list, such that it is not decoded. If you need all, keep as is
-    return part in ("json", "jpg")
-
-
-
-
-def sample_loader_91(raw: dict) -> dict:  # Note: Images are already decoded to tensors
-    j = raw["json"]
-    if "answer" in j:
-        answers = [a[0] for a in j["answer"][0]]
-        answer_weights = torch.Tensor([float(a[1]) for a in j["answer"][0]])
-    else:
-        answers = None
-        answer_weights = None
-
-    return dict(
-        __key__=raw["__key__"],
-        image=raw["jpg"],  # expected type: torch.Tensor
-        context=j["question"],  # expected type: str
-        answers=answers,  # expected type: typing.List[str]
-        answer_weights=answer_weights,  # expected type: typing.Union[torch.Tensor, NoneType]
-    )
-
-
-def part_filter_91(part: str) -> bool:
-    # Filter for parts required by the sample_loader
-    return part in ("jpg", "json")
-
-
-# Dataset -> Sample Loader Mapping
-dataset_loader_mapping = {
-    "coco_train_val_restval": {
-        "sample_loader": "sample_loader_0",
-        "part_filter": "part_filter_0",
-        "data_class": "CaptioningWebdataset",
-        "data_weight": 0.01,
-    },
-    "extended-sci/data/merged/CoT": {
-        "sample_loader": "sample_loader_1",
-        "part_filter": "part_filter_1",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.006,
-    },
-    "extended-sci/data/merged/single-choice": {
-        "sample_loader": "sample_loader_2",
-        "part_filter": "part_filter_2",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.004,
-    },
-    "extended-sci/data/extended-sci-3/CoT": {
-        "sample_loader": "sample_loader_3",
-        "part_filter": "part_filter_3",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.0006,
-    },
-    "extended-sci/data/extended-sci-3/single-choice": {
-        "sample_loader": "sample_loader_4",
-        "part_filter": "part_filter_4",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.0004,
-    },
-    "nvlm/wdai/data/SceMQA_processed": {
-        "sample_loader": "sample_loader_5",
-        "part_filter": "part_filter_5",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.0006,
-    },
-    "nvlm/wdai/data/vqa_collection_doc_text_st_chart_scale_textbook_LRV_Screen": {
-        "sample_loader": "sample_loader_6",
-        "part_filter": "part_filter_6",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.08,
-    },
-    "nvlm/wdai/data/plotqa/processed": {
-        "sample_loader": "sample_loader_7",
-        "part_filter": "part_filter_7",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.095,
-    },
-    "nvlm/wdai/data/clevr-math/processed": {
-        "sample_loader": "sample_loader_8",
-        "part_filter": "part_filter_8",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.02,
-    },
-    "nvlm/wdai/data/MMC-Instruction/processed": {
-        "sample_loader": "sample_loader_9",
-        "part_filter": "part_filter_9",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.07,
-    },
-    "nvlm/wdai/data/ocrvqa/processed": {
-        "sample_loader": "sample_loader_10",
-        "part_filter": "part_filter_10",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.06,
-    },
-    "nvlm/wdai/data/dude/processed": {
-        "sample_loader": "sample_loader_11",
-        "part_filter": "part_filter_11",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/VisualMRC/processed": {
-        "sample_loader": "sample_loader_12",
-        "part_filter": "part_filter_12",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.015,
-    },
-    "nvlm/wdai/data/mcvqa_collection_scienceqa_ai2d_geoqaplus_geometry3k_tqa": {
-        "sample_loader": "sample_loader_13",
-        "part_filter": "part_filter_13",
-        "data_class": "MultiChoiceVQAWebdataset",
-        "data_weight": 0.025,
-    },
-    "nvlm/wdai/data/arxiv_qa/processed": {
-        "sample_loader": "sample_loader_14",
-        "part_filter": "part_filter_14",
-        "data_class": "MultiChoiceVQAWebdataset",
-        "data_weight": 0.02,
-    },
-    "nvlm/wdai/data/tabmwp/processed": {
-        "sample_loader": "sample_loader_15",
-        "part_filter": "part_filter_15",
-        "data_class": "MultiChoiceVQAWebdataset",
-        "data_weight": 0.015,
-    },
-    "nvlm/wdai/data/ocr_vqa_aug/processed": {
-        "sample_loader": "sample_loader_16",
-        "part_filter": "part_filter_16",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.055,
-    },
-    "nvlm/wdai/data/dvqa_full/processed": {
-        "sample_loader": "sample_loader_17",
-        "part_filter": "part_filter_17",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.055,
-    },
-    "nvlm/wdai/data/LLaVA-v1.5_shuffle/no_refcoco_vg_ocrvqa": {
-        "sample_loader": "sample_loader_18",
-        "part_filter": "part_filter_18",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.085,
-    },
-    "vqa/more_data/infographics_vqa/processed/train": {
-        "sample_loader": "sample_loader_19",
-        "part_filter": "part_filter_19",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/sharegpt4o/processed": {
-        "sample_loader": "sample_loader_20",
-        "part_filter": "part_filter_20",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.02,
-    },
-    "nvlm/wdai/data/sparse_ocr_data/merged": {
-        "sample_loader": "sample_loader_21",
-        "part_filter": "part_filter_21",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.045,
-    },
-    "nvlm/nayeonl/data/blendv4/MetaMathQA/processed/train_text_image": {
-        "sample_loader": "sample_loader_22",
-        "part_filter": "part_filter_22",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.004,
-    },
-    "nvlm/nayeonl/data/blendv4/gsm8k/processed/train_text_image": {
-        "sample_loader": "sample_loader_23",
-        "part_filter": "part_filter_23",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.003,
-    },
-    "nvlm/wdai/data/docmatix/processed": {
-        "sample_loader": "sample_loader_24",
-        "part_filter": "part_filter_24",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.1,
-    },
-    "nvlm/wdai/data/bentham_hw_squad/processed": {
-        "sample_loader": "sample_loader_25",
-        "part_filter": "part_filter_25",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/WikiTableQA/processed": {
-        "sample_loader": "sample_loader_26",
-        "part_filter": "part_filter_26",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.003,
-    },
-    "nvlm/wdai/data/figureqa/processed": {
-        "sample_loader": "sample_loader_27",
-        "part_filter": "part_filter_27",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/llava-onevision/ai2d_combined_processed": {
-        "sample_loader": "sample_loader_28",
-        "part_filter": "part_filter_28",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/llava-onevision/math_combined_processed": {
-        "sample_loader": "sample_loader_29",
-        "part_filter": "part_filter_29",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.035,
-    },
-    "nvlm/wdai/data/llava-onevision/robut_combined_processed": {
-        "sample_loader": "sample_loader_30",
-        "part_filter": "part_filter_30",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/llava-onevision/llavar_20k_processed": {
-        "sample_loader": "sample_loader_31",
-        "part_filter": "part_filter_31",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.007,
-    },
-    "nvlm/wdai/data/llava-onevision/tallyqa_processed": {
-        "sample_loader": "sample_loader_32",
-        "part_filter": "part_filter_32",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.02,
-    },
-    "nvlm/wdai/data/llava-onevision/ureader_ie_processed": {
-        "sample_loader": "sample_loader_33",
-        "part_filter": "part_filter_33",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.007,
-    },
-    "nvlm/wdai/data/llava-onevision/visual7w_processed": {
-        "sample_loader": "sample_loader_34",
-        "part_filter": "part_filter_34",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.006,
-    },
-    "nvlm/wdai/data/llava-onevision/mavis_math_rule_geo_processed": {
-        "sample_loader": "sample_loader_35",
-        "part_filter": "part_filter_35",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/llava-onevision/ureader_kg_processed": {
-        "sample_loader": "sample_loader_36",
-        "part_filter": "part_filter_36",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.005,
-    },
-    "nvlm/wdai/data/llava-onevision/ureader_qa_processed": {
-        "sample_loader": "sample_loader_37",
-        "part_filter": "part_filter_37",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.02,
-    },
-    "nvlm/wdai/data/ocr_multi_collection_cocotext_textocr_ReCTs": {
-        "sample_loader": "sample_loader_38",
-        "part_filter": "part_filter_38",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/pdfa-eng-wds/processed_word_len_500": {
-        "sample_loader": "sample_loader_39",
-        "part_filter": "part_filter_39",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.015,
-    },
-    "nvlm/wdai/data/llava-onevision/super_clevr_processed": {
-        "sample_loader": "sample_loader_40",
-        "part_filter": "part_filter_40",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.007,
-    },
-    "nvlm/wdai/data/llava-onevision/icon_qa_processed": {
-        "sample_loader": "sample_loader_41",
-        "part_filter": "part_filter_41",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.009,
-    },
-    "nvlm/wdai/data/augmentations/chartqa_aug": {
-        "sample_loader": "sample_loader_42",
-        "part_filter": "part_filter_42",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.005,
-    },
-    "nvlm/wdai/data/augmentations/gpt_chartqa": {
-        "sample_loader": "sample_loader_43",
-        "part_filter": "part_filter_43",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.006,
-    },
-    "nvlm/wdai/data/augmentations/gpt_docvqa": {
-        "sample_loader": "sample_loader_44",
-        "part_filter": "part_filter_44",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.006,
-    },
-    "nvlm/wdai/data/augmentations/docvqa_text": {
-        "sample_loader": "sample_loader_45",
-        "part_filter": "part_filter_45",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.006,
-    },
-    "nvlm/wdai/data/augmentations/textvqa_text": {
-        "sample_loader": "sample_loader_46",
-        "part_filter": "part_filter_46",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.008,
-    },
-    "nvlm/wdai/data/augmentations/i2s-musicsheet": {
-        "sample_loader": "sample_loader_47",
-        "part_filter": "part_filter_47",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.0005,
-    },
-    "nvlm/wdai/data/augmentations/music": {
-        "sample_loader": "sample_loader_48",
-        "part_filter": "part_filter_48",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.007,
-    },
-    "nvlm/wdai/data/augmentations/invoice": {
-        "sample_loader": "sample_loader_49",
-        "part_filter": "part_filter_49",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.002,
-    },
-    "nvlm/wdai/data/augmentations/k12": {
-        "sample_loader": "sample_loader_50",
-        "part_filter": "part_filter_50",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.019,
-    },
-    "nvlm/wdai/data/augmentations/MTVQA": {
-        "sample_loader": "sample_loader_51",
-        "part_filter": "part_filter_51",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.007,
-    },
-    "nvlm/wdai/data/augmentations/VisualWebInstruct": {
-        "sample_loader": "sample_loader_52",
-        "part_filter": "part_filter_52",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.028,
-    },
-    "nvlm/wdai/data/augmentations/financeqa": {
-        "sample_loader": "sample_loader_53",
-        "part_filter": "part_filter_53",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.005,
-    },
-    "nvlm/wdai/data/augmentations/docreason": {
-        "sample_loader": "sample_loader_54",
-        "part_filter": "part_filter_54",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.004,
-    },
-    "nvlm/wdai/data/augmentations/gpt_mtwi": {
-        "sample_loader": "sample_loader_55",
-        "part_filter": "part_filter_55",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.005,
-    },
-    "nvlm/wdai/data/augmentations/geos_gpt": {
-        "sample_loader": "sample_loader_56",
-        "part_filter": "part_filter_56",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.0001,
-    },
-    "nvlm/wdai/data/augmentations/cauldron_vistext": {
-        "sample_loader": "sample_loader_57",
-        "part_filter": "part_filter_57",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.005,
-    },
-    "nvlm/wdai/data/augmentations/memes": {
-        "sample_loader": "sample_loader_58",
-        "part_filter": "part_filter_58",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.005,
-    },
-    "nvlm/wdai/data/augmentations/gpt_roadtext": {
-        "sample_loader": "sample_loader_59",
-        "part_filter": "part_filter_59",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.0002,
-    },
-    "nvlm/wdai/data/augmentations/indoor_qa": {
-        "sample_loader": "sample_loader_60",
-        "part_filter": "part_filter_60",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.001,
-    },
-    "nvlm/wdai/data/augmentations/colpali": {
-        "sample_loader": "sample_loader_61",
-        "part_filter": "part_filter_61",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.007,
-    },
-    "nvlm/wdai/data/augmentations/pmc_vqa": {
-        "sample_loader": "sample_loader_62",
-        "part_filter": "part_filter_62",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/augmentations/pathvqa": {
-        "sample_loader": "sample_loader_63",
-        "part_filter": "part_filter_63",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.004,
-    },
-    "nvlm/wdai/data/augmentations/sciqa": {
-        "sample_loader": "sample_loader_64",
-        "part_filter": "part_filter_64",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.027,
-    },
-    "nvlm/wdai/data/augmentations/chinese_meme": {
-        "sample_loader": "sample_loader_65",
-        "part_filter": "part_filter_65",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.001,
-    },
-    "nvlm/wdai/data/augmentations/gpt_hiertext": {
-        "sample_loader": "sample_loader_66",
-        "part_filter": "part_filter_66",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.003,
-    },
-    "nvlm/wdai/data/augmentations/cauldron_cocoqa": {
-        "sample_loader": "sample_loader_67",
-        "part_filter": "part_filter_67",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.007,
-    },
-    "nvlm/wdai/data/cmm-math/processed": {
-        "sample_loader": "sample_loader_68",
-        "part_filter": "part_filter_68",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.001,
-    },
-    "nvlm/wdai/data/mmtab/processed": {
-        "sample_loader": "sample_loader_69",
-        "part_filter": "part_filter_69",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.008,
-    },
-    "nvlm/wdai/data/simchart9k/processed": {
-        "sample_loader": "sample_loader_70",
-        "part_filter": "part_filter_70",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.001,
-    },
-    "nvlm/wdai/data/llava-onevision/mapqa_processed": {
-        "sample_loader": "sample_loader_71",
-        "part_filter": "part_filter_71",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.005,
-    },
-    "nvlm/wdai/data/llava-onevision/vizwiz_processed": {
-        "sample_loader": "sample_loader_72",
-        "part_filter": "part_filter_72",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.002,
-    },
-    "nvlm/wdai/data/augmentations/gpt_infovqa": {
-        "sample_loader": "sample_loader_73",
-        "part_filter": "part_filter_73",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.001,
-    },
-    "nvlm/wdai/data/augmentations/viquae": {
-        "sample_loader": "sample_loader_74",
-        "part_filter": "part_filter_74",
-        "data_class": "SimilarityInterleavedWebdataset",
-        "data_weight": 0.0005,
-    },
-    "captioning/ccs_recaptioned/webdataset": {
-        "sample_loader": "sample_loader_75",
-        "part_filter": "part_filter_75",
-        "data_class": "CaptioningWebdataset",
-        "data_weight": 0.2,
-    },
-    "captioning/laion115m-clean": {
-        "sample_loader": "sample_loader_76",
-        "part_filter": "part_filter_76",
-        "data_class": "CaptioningWebdataset",
-        "data_weight": 0.579,
-    },
-    "nvlm/wdai/data/dvqa_full/processed_pt": {
-        "sample_loader": "sample_loader_77",
-        "part_filter": "part_filter_77",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.02,
-    },
-    "nvlm/wdai/data/docmatix/processed_pt": {
-        "sample_loader": "sample_loader_78",
-        "part_filter": "part_filter_78",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.02,
-    },
-    "vqa/VQAv2/stage1": {
-        "sample_loader": "sample_loader_91",
-        "part_filter": "part_filter_91",
-        "data_class": "VQAWebdataset",
-        "data_weight": 1.0,
-    },
-    "vqa/Visual_Genome": {
-        "sample_loader": "sample_loader_80",
-        "part_filter": "part_filter_80",
-        "data_class": "VQAWebdataset",
-        "data_weight": 0.01,
-    },
-    "nvlm/wdai/data/pdfa-eng-wds/processed_word_len_300": {
-        "sample_loader": "sample_loader_81",
-        "part_filter": "part_filter_81",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.08,
-    },
-    "nvlm/wdai/data/textocr/processed": {
-        "sample_loader": "sample_loader_82",
-        "part_filter": "part_filter_82",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.02,
-    },
-    "nvlm/wdai/data/coco-text/processed": {
-        "sample_loader": "sample_loader_83",
-        "part_filter": "part_filter_83",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.002,
-    },
-    "nvlm/wdai/data/ArT/processed": {
-        "sample_loader": "sample_loader_84",
-        "part_filter": "part_filter_84",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.001,
-    },
-    "nvlm/wdai/data/ReCTs/processed": {
-        "sample_loader": "sample_loader_85",
-        "part_filter": "part_filter_85",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.001,
-    },
-    "nvlm/wdai/data/lsvt/processed": {
-        "sample_loader": "sample_loader_86",
-        "part_filter": "part_filter_86",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.005,
-    },
-    "nvlm/wdai/data/RCTW/processed": {
-        "sample_loader": "sample_loader_87",
-        "part_filter": "part_filter_87",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.001,
-    },
-    "nvlm/wdai/data/coco-text/processed_multi": {
-        "sample_loader": "sample_loader_88",
-        "part_filter": "part_filter_88",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.0003,
-    },
-    "nvlm/wdai/data/textocr/processed_multi": {
-        "sample_loader": "sample_loader_89",
-        "part_filter": "part_filter_89",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.0004,
-    },
-    "nvlm/wdai/data/ReCTs/processed_multi": {
-        "sample_loader": "sample_loader_90",
-        "part_filter": "part_filter_90",
-        "data_class": "OCRWebdataset",
-        "data_weight": 0.0003,
-    },
-}
-
-
-def get_sample_loader(path):
-    """Returns the correct sample_loader function for a dataset."""
-    if path not in dataset_loader_mapping:
-        path = data_path_mapping(path)
-    assert path in dataset_loader_mapping, f"path {path} not in dataset_loader_mapping"
-    return globals().get(dataset_loader_mapping.get(path, {}).get("sample_loader"))
-
-
-def get_part_filter(path):
-    """Returns the correct part_filter function for a dataset."""
-    if path not in dataset_loader_mapping:
-        path = data_path_mapping(path)
-    assert path in dataset_loader_mapping, f"path {path} not in dataset_loader_mapping"
-    return globals().get(dataset_loader_mapping.get(path, {}).get("part_filter"))
-
-
-def get_data_class(path):
-    """Returns the correct data_class for a dataset."""
-    if path not in dataset_loader_mapping:
-        path = data_path_mapping(path)
-
-    assert path in dataset_loader_mapping, f"path {path} not in dataset_loader_mapping"
-    return dataset_loader_mapping[path]["data_class"]
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/prompt_format.py b/cosmos_framework/data/vfm/augmentors/vlm/prompt_format.py
index ec86e66..5b576c4 100644
--- a/cosmos_framework/data/vfm/augmentors/vlm/prompt_format.py
+++ b/cosmos_framework/data/vfm/augmentors/vlm/prompt_format.py
@@ -45,7 +45,6 @@ def __call__(self, data_dict: Dict) -> Dict:
         if isinstance(list_of_conversation[0], list):
             selected_conversation = random.sample(list_of_conversation, 1)[0]
         elif isinstance(list_of_conversation[0], dict):
-
             selected_conversation = list_of_conversation
         else:
             raise ValueError(
@@ -82,7 +81,6 @@ def __call__(self, data_dict: Dict) -> Dict:
 
         del data_dict[conversation_key]
 
-
         # # enforce chat order
         # self._enforce_text_chat_order(selected_conversation)
 
@@ -91,7 +89,7 @@ def __call__(self, data_dict: Dict) -> Dict:
     def _enforce_text_chat_order(self, conversation: list) -> None:
         """
         Reorder text content within user messages based on text_chat_order setting.
-        NOTE: this does NOT work for interleaved data!!!!!!
+        NOTE (maxzhaoshuol): this does NOT work for interleaved data!!!!!!
 
         Args:
             conversation: List of message dictionaries
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/timestamp.py b/cosmos_framework/data/vfm/augmentors/vlm/timestamp.py
index edede0c..88d3ac5 100644
--- a/cosmos_framework/data/vfm/augmentors/vlm/timestamp.py
+++ b/cosmos_framework/data/vfm/augmentors/vlm/timestamp.py
@@ -97,7 +97,7 @@ def overlay_text(
         return images, [compute_timestamps(i, fps, processor) for i in range(len(images))]
 
     # Try to use DejaVu Sans Mono font for better readability
-    font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf", font_size)
+    font = ImageFont.truetype("DejaVuSansMono.ttf", font_size)  # bare name: Pillow searches system font dirs
 
     # Process each image
     processed_images = []
@@ -392,17 +392,15 @@ def augment_user_prompt(
     elif output_format == "temporal_caption":
         event = assistant_message[0]
         if random.random() < 0.333333:
-
             start = round(event["start"])
             end = round(event["end"])
         elif random.random() < 0.666666:
-
             start = round(event["start"] * 2) / 2
             end = round(event["end"] * 2) / 2
         else:
             start = event["start"]
             end = event["end"]
-        if start == end:  # HACK: remove events with start == end
+        if start == end:
             raise ValueError("Start and end time are the same for data.")
         if timestamp_format == "seconds":
             if random.random() < 0.5:
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/timestamp_with_subject_tracking.py b/cosmos_framework/data/vfm/augmentors/vlm/timestamp_with_subject_tracking.py
index 0507109..90cc9ab 100644
--- a/cosmos_framework/data/vfm/augmentors/vlm/timestamp_with_subject_tracking.py
+++ b/cosmos_framework/data/vfm/augmentors/vlm/timestamp_with_subject_tracking.py
@@ -224,17 +224,15 @@ def augment_user_prompt(
     elif output_format == "temporal_caption_subject":
         event = assistant_message[0]
         if random.random() < 0.333333:
-
             start = round(event["start"])
             end = round(event["end"])
         elif random.random() < 0.666666:
-
             start = round(event["start"] * 2) / 2
             end = round(event["end"] * 2) / 2
         else:
             start = event["start"]
             end = event["end"]
-        if start == end:  # HACK: remove events with start == end
+        if start == end:
             log.warning(f"Start and end time are the same for data. {event}")
             return None
 
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/timestamp_without_augment_message.py b/cosmos_framework/data/vfm/augmentors/vlm/timestamp_without_augment_message.py
index 584ca1d..7212510 100644
--- a/cosmos_framework/data/vfm/augmentors/vlm/timestamp_without_augment_message.py
+++ b/cosmos_framework/data/vfm/augmentors/vlm/timestamp_without_augment_message.py
@@ -162,14 +162,12 @@ def augment_user_prompt(
     elif output_format == "temporal_caption":
         event = assistant_message[0]
         if random.random() < 0.5:
-
             start = round(event["start"])
             end = round(event["end"])
         else:
-
             start = round(event["start"] * 2) / 2
             end = round(event["end"] * 2) / 2
-        if start == end:  # HACK: remove events with start == end
+        if start == end:
             raise ValueError("Start and end time are the same for data.")
         user_prompt = random.choice(
             [
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/timestamp_without_end_time.py b/cosmos_framework/data/vfm/augmentors/vlm/timestamp_without_end_time.py
index 8df9dd1..812e113 100644
--- a/cosmos_framework/data/vfm/augmentors/vlm/timestamp_without_end_time.py
+++ b/cosmos_framework/data/vfm/augmentors/vlm/timestamp_without_end_time.py
@@ -205,10 +205,8 @@ def augment_user_prompt(
     elif output_format == "temporal_caption":
         event = assistant_message[0]
         if random.random() < 0.333333:
-
             start = round(event["start"])
         elif random.random() < 0.666666:
-
             start = round(event["start"] * 2) / 2
         else:
             start = event["start"]
diff --git a/cosmos_framework/data/vfm/augmentors/vlm/tokenize_data.py b/cosmos_framework/data/vfm/augmentors/vlm/tokenize_data.py
index a0ce29c..f3a9914 100644
--- a/cosmos_framework/data/vfm/augmentors/vlm/tokenize_data.py
+++ b/cosmos_framework/data/vfm/augmentors/vlm/tokenize_data.py
@@ -158,7 +158,6 @@ def __call__(self, data_dict: Dict) -> Dict:
             if message["role"] == "user" and isinstance(message["content"], list):
                 total_images += len([content for content in message["content"] if content["type"] == "image"])
                 total_videos += len([content for content in message["content"] if content["type"] == "video"])
-
         assert total_videos == 1 or total_videos == 0, "Only one video is supported for now"
 
         # url
@@ -167,7 +166,6 @@ def __call__(self, data_dict: Dict) -> Dict:
         # go through each message in the conversation
         for message in conversation:
             # for user message, we insert the media
-
             if message["role"] == "user" and isinstance(
                 message["content"], list
             ):  # Otherwise it's text and content is a string
@@ -225,7 +223,6 @@ def __call__(self, data_dict: Dict) -> Dict:
                         raw_images.append(image)
 
                     elif content["type"] == "video":
-
                         # as tokenization will NOT upsample the video, we can use a larger value here at the cost of multiple video having 1.5x token length
                         max_total_pixels = token_to_pixels(self.max_video_token_length * 1.5, temporal_patch_size=2)
                         media_key = content["video"]
@@ -248,7 +245,6 @@ def __call__(self, data_dict: Dict) -> Dict:
                             return None
                         videos = data_dict["media"][media_key]["videos"]  # list of PIL images
                         fps = data_dict["media"][media_key]["fps"]
-
                         # this is because videos are decoded to be around "max_video_token_length" tokens
 
                         videos = maybe_subsample_frames(
diff --git a/cosmos_framework/data/vfm/joint_dataloader.py b/cosmos_framework/data/vfm/joint_dataloader.py
index 56b13e7..ba84ceb 100644
--- a/cosmos_framework/data/vfm/joint_dataloader.py
+++ b/cosmos_framework/data/vfm/joint_dataloader.py
@@ -1,7 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
+import math
 from collections import deque
+from collections.abc import Mapping
 from dataclasses import dataclass
 from typing import Any, ClassVar, Dict, Union
 
@@ -12,6 +14,11 @@
 
 from cosmos_framework.utils.lazy_config import instantiate
 from cosmos_framework.utils import log
+from cosmos_framework.model.vfm.tokenizers.uniae.frame_math import (
+    get_uniae_chunk_frames,
+    get_uniae_latent_num_frames,
+    normalize_uniae_chunk_frames,
+)
 
 _TIMING_KEYS = {"_sample_time", "_aug_time", "_pre_aug_time", "_aug_step_times"}
 _BATCH_TIMING_KEYS = {
@@ -38,6 +45,7 @@ def custom_collate_fn(batch):
         "sound",
         "raw_action_dim",
         "image_size",
+        "action_processing_record",
     }
 
     # Data keys where a per-sample value of ``None`` is a meaningful signal
@@ -57,7 +65,6 @@ def custom_collate_fn(batch):
     # Handle standard list of samples
     elem = batch[0]
     if isinstance(elem, dict):
-
         # Some Action datasets add optional metadata keys (for example
         # ``additional_view_description`` for concat-view captions) only for a
         # subset of samples.  PyTorch can batch such samples together when
@@ -72,6 +79,9 @@ def custom_collate_fn(batch):
             if key in _TIMING_KEYS:
                 continue
             values = [d.get(key) for d in batch]
+            if key == "action_processing_record":
+                result[key] = values
+                continue
             if any(value is None for value in values):
                 # Sparse data keys keep their None placeholders to preserve
                 # 1:1 alignment with sequence_plan.  Other (optional metadata)
@@ -165,6 +175,8 @@ def __init__(
         prewarm: bool = True,
         default_lookahead_limit: int = _DEFAULT_LOOKAHEAD_LIMIT,
         lookahead_limits: Dict[str, int] | None = None,
+        uniae_chunk_frames: int | Mapping[str, int] | None = None,
+        uniae_pad_frames: int | None = None,
     ):
         """
         Initialize the JointDataLoader with multiple datasets.
@@ -186,6 +198,8 @@ def __init__(
             default_lookahead_limit: Packing-loop look-ahead fallback for dataloaders not in
                 ``lookahead_limits``.
             lookahead_limits: Optional ``{dataset_name: int}`` per-dataloader override.
+            uniae_chunk_frames: Optional UniAE full chunk size, or resolution-keyed chunk sizes.
+            uniae_pad_frames: Optional UniAE boundary padding frames per chunk.
 
         Example:
             joint_loader = IterativeJointDataLoader(
@@ -211,6 +225,8 @@ def __init__(
         self.sound_latent_fps = sound_latent_fps
         self.audio_sample_rate = audio_sample_rate
         self.default_lookahead_limit = int(default_lookahead_limit)
+        self.uniae_pad_frames = int(uniae_pad_frames) if uniae_pad_frames is not None else None
+        self.uniae_chunk_frames = self._normalize_uniae_chunk_frames(uniae_chunk_frames)
 
         assert (self.max_sequence_length is None) != (self.max_samples_per_batch is None), (
             "Exactly one of max_sequence_length or max_samples_per_batch must be None, but not both."
@@ -221,6 +237,8 @@ def __init__(
         assert not unknown, f"lookahead_limits references unknown dataloaders {unknown}; valid: {sorted(dataloaders)}"
 
         for dataset_name, dataloader_data in dataloaders.items():
+            if dataloader_data is None:
+                continue
             assert set(dataloader_data.keys()) == {"dataloader", "ratio"}, f"Invalid config: {dataloader_data}"
             if dataloader_data["ratio"] <= 0:
                 continue
@@ -255,13 +273,42 @@ def __init__(
                 "JointDataLoader: prewarm DISABLED (debug mode); first iteration may incur per-stream cold-load cost"
             )
 
+    def _normalize_uniae_chunk_frames(
+        self, uniae_chunk_frames: int | Mapping[str, int] | None
+    ) -> int | dict[str, int] | None:
+        return normalize_uniae_chunk_frames(
+            uniae_chunk_frames,
+            pad_frames=self.uniae_pad_frames,
+            temporal_compression_factor=self.tokenizer_temporal_compression_factor,
+            temporal_divisibility_name="tokenizer_temporal_compression_factor",
+        )
+
+    def _get_uniae_chunk_frames(self, spatial_shape: tuple[int, int]) -> int:
+        assert self.uniae_chunk_frames is not None
+        return get_uniae_chunk_frames(self.uniae_chunk_frames, spatial_shape=spatial_shape)
+
+    def _compute_vision_latent_t_shape(self, T: int, H: int, W: int) -> int:
+        if T < 1:
+            raise ValueError(f"Vision media must contain at least one frame, got {T}.")
+        if T == 1 or self.uniae_chunk_frames is None:
+            return 1 + (T - 1) // self.tokenizer_temporal_compression_factor
+
+        assert self.uniae_pad_frames is not None
+        return get_uniae_latent_num_frames(
+            T,
+            self.uniae_chunk_frames,
+            pad_frames=self.uniae_pad_frames,
+            temporal_compression_factor=self.tokenizer_temporal_compression_factor,
+            spatial_shape=(H, W),
+        )
+
     def _prewarm_dataloaders(self) -> None:
         """Force all dataloader iterators to spawn workers and produce one batch.
 
         The first ``next()`` call on an ``InfiniteDataLoader`` iterator triggers
         ``DataLoader.__iter__()`` which spawns worker processes.  For action
         dataloaders using ``multiprocessing_context='spawn'``, each worker must
-        fully initialise heavy datasets (BridgeOrigLeRobotDataset, EMBODIMENT_A, etc.)
+        fully initialise heavy datasets (BridgeOrigLeRobotDataset, embodiment_a, etc.)
         from scratch.  If this happens lazily during training, the resulting
         delay (potentially minutes) causes NCCL collective timeouts when faster
         ranks enter the forward pass while slower ranks are still loading data.
@@ -362,14 +409,13 @@ def _compute_num_tokens_per_sample(self, data_batch: dict) -> int:
             else:
                 _, T, H, W = media.shape
 
-            vae_spatial_downsample = self.tokenizer_spatial_compression_factor * self.patch_spatial
-            vae_temporal_downsample = self.tokenizer_temporal_compression_factor
-
-            latent_h_shape = H // vae_spatial_downsample
-            latent_w_shape = W // vae_spatial_downsample
-            latent_t_shape = 1 + (T - 1) // vae_temporal_downsample
+            latent_h_shape = H // self.tokenizer_spatial_compression_factor
+            latent_w_shape = W // self.tokenizer_spatial_compression_factor
+            patch_h_shape = math.ceil(latent_h_shape / self.patch_spatial)
+            patch_w_shape = math.ceil(latent_w_shape / self.patch_spatial)
+            latent_t_shape = self._compute_vision_latent_t_shape(T, H, W)
 
-            num_vision_tokens = latent_h_shape * latent_w_shape * latent_t_shape + 2
+            num_vision_tokens = patch_h_shape * patch_w_shape * latent_t_shape + 2
             num_tokens += num_vision_tokens
 
         # Action part: each action time step is 1 token.
@@ -534,6 +580,8 @@ def __init__(
         prewarm: bool = True,
         default_lookahead_limit: int = JointDataLoader._DEFAULT_LOOKAHEAD_LIMIT,
         lookahead_limits: Dict[str, int] | None = None,
+        uniae_chunk_frames: int | Mapping[str, int] | None = None,
+        uniae_pad_frames: int | None = None,
     ):
         super().__init__(
             dataloaders,
@@ -547,6 +595,8 @@ def __init__(
             prewarm=prewarm,
             default_lookahead_limit=default_lookahead_limit,
             lookahead_limits=lookahead_limits,
+            uniae_chunk_frames=uniae_chunk_frames,
+            uniae_pad_frames=uniae_pad_frames,
         )
         self.seed = seed
         # Calculate probabilities for random sampling
@@ -787,6 +837,8 @@ def __init__(
         audio_sample_rate: int = 48000,
         dataset_name: str = "default",
         lookahead_limit: int = JointDataLoader._DEFAULT_LOOKAHEAD_LIMIT,
+        uniae_chunk_frames: int | Mapping[str, int] | None = None,
+        uniae_pad_frames: int | None = None,
     ):
         """
         Args:
@@ -802,6 +854,8 @@ def __init__(
             audio_sample_rate: Audio sample rate in Hz.
             dataset_name: Name tag attached to every sample in the output batch.
             lookahead_limit: Packing-loop look-ahead for the wrapped dataloader.
+            uniae_chunk_frames: Optional UniAE full chunk size, or resolution-keyed chunk sizes.
+            uniae_pad_frames: Optional UniAE boundary padding frames per chunk.
         """
         wrapped = {dataset_name: {"dataloader": dataloader, "ratio": 1}}
         super().__init__(
@@ -814,6 +868,8 @@ def __init__(
             sound_latent_fps=sound_latent_fps,
             audio_sample_rate=audio_sample_rate,
             lookahead_limits={dataset_name: int(lookahead_limit)},
+            uniae_chunk_frames=uniae_chunk_frames,
+            uniae_pad_frames=uniae_pad_frames,
         )
 
     def __iter__(self):
@@ -905,6 +961,8 @@ def __init__(
         audio_sample_rate: int = 48000,
         default_lookahead_limit: int = JointDataLoader._DEFAULT_LOOKAHEAD_LIMIT,
         lookahead_limits: Dict[str, int] | None = None,
+        uniae_chunk_frames: int | Mapping[str, int] | None = None,
+        uniae_pad_frames: int | None = None,
     ):
         super().__init__(
             dataloaders,
@@ -917,6 +975,8 @@ def __init__(
             audio_sample_rate=audio_sample_rate,
             default_lookahead_limit=default_lookahead_limit,
             lookahead_limits=lookahead_limits,
+            uniae_chunk_frames=uniae_chunk_frames,
+            uniae_pad_frames=uniae_pad_frames,
         )
 
         # Convert data ratios to probabilities
diff --git a/cosmos_framework/data/vfm/packing_iterable_dataset.py b/cosmos_framework/data/vfm/packing_iterable_dataset.py
new file mode 100644
index 0000000..ac30f0d
--- /dev/null
+++ b/cosmos_framework/data/vfm/packing_iterable_dataset.py
@@ -0,0 +1,271 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""
+Abstract base class for pool-based token-budget bin-packing over multiple datasets.
+
+Extracted from ``cosmos_framework.data.vfm.vlm.joint_dataset_dynamic_batch_webloader``
+so that both the VLM and VFM internal dataloaders can share a single packing implementation.
+
+Usage
+-----
+Subclass and implement ``compute_sample_tokens(sample) -> int``.
+Optionally override ``collate_batch(samples) -> Any`` for custom collation.
+
+    class MyPacker(PackingIterableDataset):
+        def compute_sample_tokens(self, sample):
+            return len(sample["input_ids"])
+"""
+
+from __future__ import annotations
+
+import random
+from abc import ABC, abstractmethod
+from collections import deque
+from enum import Enum
+from typing import Any, Union
+
+import torch
+
+from cosmos_framework.utils.lazy_config import instantiate
+from cosmos_framework.utils import log
+
+
+class Modality(Enum):  # Payload kind of a sample; the packer only batches samples of the same modality.
+    IMAGE = "image"  # sample dict contains a "pixel_values" key
+    VIDEO = "video"  # sample dict contains "pixel_values_videos" (and no "pixel_values")
+    TEXT = "text"  # neither vision key is present
+
+
+class PackingIterableDataset(torch.utils.data.IterableDataset, ABC):
+    """Pool-based greedy bin-packing IterableDataset.
+
+    Maintains a pool of ``pool_size`` samples and assembles batches by
+    greedily selecting same-modality candidates that fit within the token
+    budget ``max_tokens``.  Subclasses supply two hooks:
+
+    * ``compute_sample_tokens(sample)`` — token cost of one sample (abstract).
+    * ``collate_batch(samples)`` — assemble a packed list into a batch
+      (default: identity, returns the list unchanged).
+
+    Parameters
+    ----------
+    datasets_cfg:
+        Mapping ``{name: {"dataset": <iterable>, "ratio": <float>}}``; ratio-0 entries are skipped.
+        The *dataset* value may be a Hydra lazy config, an already-constructed
+        ``IterableDataset``, or a plain ``DataLoader`` (its ``.dataset`` is
+        unwrapped automatically).  A callable ``build_dataset()`` attribute, if present, is invoked.
+    max_tokens:
+        Token budget per batch (padded cost = ``cur_max_len * batch_size``); halved when ``cur_max_len >= 1000``.
+    pool_size:
+        Number of samples to buffer before selecting a batch.
+    max_batch_size:
+        Hard cap on items per batch (0 or None = no cap).
+    long_threshold:
+        Seed samples with token count ``>= long_threshold`` are emitted as
+        singletons regardless of budget.
+    batching_strategy:
+        ``"prefer_closest"`` or ``"prefer_first"`` (required; the signature has no default).
+    """
+
+    def __init__(
+        self,
+        datasets_cfg: dict[str, dict[str, Union[int, object]]],
+        max_tokens: int,
+        pool_size: int,
+        max_batch_size: int,
+        long_threshold: int,
+        batching_strategy: str,
+    ):
+        super().__init__()
+
+        assert batching_strategy in ("prefer_first", "prefer_closest"), (
+            f"batching_strategy must be 'prefer_first' or 'prefer_closest', got {batching_strategy!r}"
+        )
+
+        self.max_tokens = max_tokens
+        self.pool_size = pool_size
+        self.long_threshold = long_threshold
+        self.max_batch_size = max_batch_size
+        self.batching_strategy = batching_strategy
+
+        self._pool: deque[dict] = deque()  # buffered samples awaiting packing, oldest on the left
+        self._dataset_names: list[str] = []  # the three lists below are index-aligned
+        self._ratios: list[float] = []
+        self._datasets: list[torch.utils.data.IterableDataset] = []
+
+        for name, cfg in datasets_cfg.items():
+            assert {"ratio", "dataset"} <= cfg.keys(), (
+                f"Each entry must have 'dataset' and 'ratio' keys: {name} -> {cfg.keys()}"
+            )
+            ratio = cfg["ratio"]
+            if ratio == 0:  # a zero ratio disables the dataset; it is never instantiated
+                log.info(f"Skipping dataset {name} with ratio {ratio}")
+                continue
+            dataset_cfg = cfg["dataset"]
+
+            ds = (
+                instantiate(dataset_cfg)
+                if not isinstance(dataset_cfg, (torch.utils.data.IterableDataset, torch.utils.data.DataLoader))
+                else dataset_cfg
+            )
+            if isinstance(ds, torch.utils.data.DataLoader):  # pack raw samples, not the loader's batches
+                ds = ds.dataset
+            if hasattr(ds, "build_dataset") and callable(getattr(ds, "build_dataset")):
+                ds = ds.build_dataset()  # builder-style object: use what it builds
+
+            assert isinstance(ds, torch.utils.data.IterableDataset), (
+                f"Expected an IterableDataset, got {type(ds)} for {name}"
+            )
+
+            self._dataset_names.append(name)
+            self._ratios.append(float(ratio))
+            self._datasets.append(ds)
+            log.info(f"Added dataset {name} with ratio {ratio}")
+
+        log.info(f"added data: {list(datasets_cfg.keys())}")  # NOTE(review): also lists ratio-0 names skipped above
+        assert len(self._datasets) > 0, "No datasets added"
+        self._data_len: int = sum(int(getattr(ds, "total_images", 0)) for ds in self._datasets)  # 0 if attr absent
+        if self._data_len == 0:
+            self._data_len = 10**12  # placeholder length when no dataset reports ``total_images``
+        self.iterators = [iter(ds) for ds in self._datasets]  # one live iterator per kept dataset
+
+    # ------------------------------------------------------------------
+    # Abstract / overridable hooks
+    # ------------------------------------------------------------------
+
+    @abstractmethod
+    def compute_sample_tokens(self, sample: dict) -> int:
+        """Return the token cost of one sample for packing budget accounting."""
+
+    def collate_batch(self, samples: list[dict]) -> Any:
+        """Assemble a packed list of samples into one batch.
+
+        Default implementation returns the list unchanged (identity).
+        Override to pad, stack, or transform samples into tensors.
+        """
+        return samples
+
+    # ------------------------------------------------------------------
+    # PyTorch Dataset API
+    # ------------------------------------------------------------------
+
+    def __len__(self) -> int:
+        return self._data_len  # summed ``total_images`` or the 10**12 placeholder set in __init__
+
+    def __iter__(self):
+        while True:  # never terminates by itself: exhausted datasets are restarted in _get_next_sample
+            batch = self._best_fit_batch()
+            yield self.collate_batch(batch)
+
+    # ------------------------------------------------------------------
+    # Internal packing helpers (moved verbatim from _JointIterableDataset)
+    # ------------------------------------------------------------------
+
+    def _max_tokens(self, cur_max: int) -> int:  # effective budget given the current longest length
+        if cur_max < 1000:  # short samples get the full budget
+            return self.max_tokens
+        return self.max_tokens // 2  # half budget once the longest sample reaches 1000 tokens
+
+    def _get_next_sample(self) -> dict:
+        index_id = random.choices(range(len(self.iterators)), weights=self._ratios, k=1)[0]  # ratio-weighted draw
+        curr_dataset = self.iterators[index_id]
+        try:
+            output = next(curr_dataset)
+        except StopIteration:
+            log.critical(f"dataset {self._dataset_names[index_id]} exhausted")
+            self.iterators[index_id] = iter(self._datasets[index_id])  # restart the dataset from the beginning
+            output = next(self.iterators[index_id])  # NOTE(review): StopIteration escapes if the dataset is empty
+        return output
+
+    def _fill_pool(self):  # top the pool up to ``pool_size`` samples
+        while len(self._pool) < self.pool_size:
+            self._pool.append(self._get_next_sample())
+
+    def _padded_cost(self, cur_max: int, k: int) -> int:
+        return cur_max * k  # each of the k samples is costed at the longest length
+
+    def _get_modality(self, sample: dict) -> Modality:
+        if "pixel_values" in sample:  # checked first, so it wins if both vision keys are present
+            return Modality.IMAGE
+        elif "pixel_values_videos" in sample:
+            return Modality.VIDEO
+        return Modality.TEXT
+
+    def _best_fit_batch(self) -> list[dict]:
+        """Build one batch using the configured token-budget strategy."""
+        self._fill_pool()
+        seed = self._pool.popleft()  # oldest sample anchors the batch; NOTE(review): IndexError if pool_size < 1
+        seed_modality = self._get_modality(seed)
+        L0 = self.compute_sample_tokens(seed)
+
+        if L0 >= self.long_threshold or L0 >= self._max_tokens(L0):  # too long to share a batch
+            return [seed]
+
+        chosen = [seed]
+        cur_max = L0
+
+        while self._pool:
+            if self.max_batch_size and len(chosen) >= self.max_batch_size:  # falsy cap (0/None) disables the limit
+                break
+            best_idx = self._find_best_candidate(cur_max, len(chosen), seed_modality)
+            if best_idx is None:  # nothing of this modality fits the remaining budget
+                break
+            cand = self._remove_from_pool(best_idx)
+            chosen.append(cand)
+            cur_max = max(cur_max, self.compute_sample_tokens(cand))
+
+        return chosen
+
+    def _find_best_candidate(self, cur_max: int, num_chosen: int, seed_modality: Modality) -> int | None:
+        if self.batching_strategy == "prefer_first":  # __init__ guarantees one of the two known strategies
+            return self._find_best_candidate_prefer_first(cur_max, num_chosen, seed_modality)
+        return self._find_best_candidate_prefer_closest(cur_max, num_chosen, seed_modality)
+
+    def _find_best_candidate_prefer_first(self, cur_max: int, num_chosen: int, seed_modality: Modality) -> int | None:
+        best_idx = None
+        best_new_tokens = None
+        for idx, cand in enumerate(self._pool):
+            if self._get_modality(cand) != seed_modality:  # never mix modalities within a batch
+                continue
+            L = self.compute_sample_tokens(cand)
+            new_max = max(cur_max, L)
+            new_tokens = self._padded_cost(new_max, num_chosen + 1)
+            if new_tokens <= self._max_tokens(cur_max):  # budget is keyed on the pre-candidate max length
+                if best_new_tokens is None or new_tokens < best_new_tokens:  # strict "<": earliest entry wins ties
+                    best_new_tokens = new_tokens
+                    best_idx = idx
+        return best_idx
+
+    def _find_best_candidate_prefer_closest(self, cur_max: int, num_chosen: int, seed_modality: Modality) -> int | None:
+        best_idx = None
+        best_new_tokens = None
+        smallest_length_diff = None
+        for idx, cand in enumerate(self._pool):
+            if self._get_modality(cand) != seed_modality:  # never mix modalities within a batch
+                continue
+            L = self.compute_sample_tokens(cand)
+            new_max = max(cur_max, L)
+            new_tokens = self._padded_cost(new_max, num_chosen + 1)
+            if new_tokens <= self._max_tokens(cur_max):  # budget is keyed on the pre-candidate max length
+                length_diff = abs(L - cur_max)
+                if (
+                    best_new_tokens is None
+                    or new_tokens < best_new_tokens
+                    or (new_tokens == best_new_tokens and length_diff < smallest_length_diff)  # tie: nearest length
+                ):
+                    best_new_tokens = new_tokens
+                    best_idx = idx
+                    smallest_length_diff = length_diff
+        return best_idx
+
+    def _remove_from_pool(self, idx: int) -> dict:  # pop the element at ``idx``, keeping the others in order
+        if idx == 0:
+            return self._pool.popleft()
+        elif idx == len(self._pool) - 1:
+            return self._pool.pop()
+        else:
+            self._pool.rotate(-idx)  # bring element ``idx`` to the left end
+            item = self._pool.popleft()
+            self._pool.rotate(idx)  # rotate back so the remaining items keep their original order
+            return item
diff --git a/cosmos_framework/data/vfm/processors/__init__.py b/cosmos_framework/data/vfm/processors/__init__.py
index a1cc60e..ce98401 100644
--- a/cosmos_framework/data/vfm/processors/__init__.py
+++ b/cosmos_framework/data/vfm/processors/__init__.py
@@ -125,7 +125,12 @@ def build_processor(
         return Qwen3VLProcessor(tokenizer_type, credentials=credentials, bucket=bucket, cache_dir=cache_dir)
     elif "nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16" in tokenizer_type:
         return NemotronVLProcessor(tokenizer_type, credentials=credentials, bucket=bucket, cache_dir=cache_dir)
-    elif "NVIDIA-Nemotron-3-Dense-VL" in tokenizer_type or "Qwen3-2B-ViT" in tokenizer_type:
+    elif (
+        "NVIDIA-Nemotron-3-Dense-VL" in tokenizer_type
+        or "Qwen3-2B-ViT" in tokenizer_type
+        or "nvidia/Cosmos3-Reasoner-2B-Private" in tokenizer_type
+        or "nvidia/Cosmos3-Edge-Reasoner" in tokenizer_type
+    ):
         return Nemotron3DenseVLProcessor(tokenizer_type, credentials=credentials, bucket=bucket, cache_dir=cache_dir)
     elif "Qwen/Qwen3-0.6B" in tokenizer_type:
         local_path = _download_llm_tokenizer(tokenizer_type, credentials, bucket, cache_dir)
diff --git a/cosmos_framework/data/vfm/processors/nemotronvl_processor.py b/cosmos_framework/data/vfm/processors/nemotronvl_processor.py
index 767c8ef..077c80e 100644
--- a/cosmos_framework/data/vfm/processors/nemotronvl_processor.py
+++ b/cosmos_framework/data/vfm/processors/nemotronvl_processor.py
@@ -248,7 +248,6 @@ def __init__(
         # NemotronVL hardcodes these helper attributes because they are not
         # discoverable from the HF model config; the values match the upstream
         # vision-encoder configuration.
-        # HACK: hardcoded based on the model config.
         self.min_height_width = 512
         self.patch_size = 16
         self.temporal_patch_size = 1
@@ -258,7 +257,6 @@ def __init__(
     def _resolve_pad_id(self):
         # NemotronVL's tokenizer does not specify a pad_token; reserve
         # <SPECIAL_999> for padding (project convention).
-
         return self.processor.tokenizer.convert_tokens_to_ids("<SPECIAL_999>")
 
     def apply_chat_template(
@@ -396,7 +394,7 @@ def add_assistant_tokens_mask(self, tokens):
 
     import requests
 
-    response = requests.get("https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg")
+    response = requests.get("https://invalid_url")
     img = Image.open(BytesIO(response.content))
 
     # test video
diff --git a/cosmos_framework/data/vfm/processors/qwen3vl_processor.py b/cosmos_framework/data/vfm/processors/qwen3vl_processor.py
index 030d040..dffa47c 100644
--- a/cosmos_framework/data/vfm/processors/qwen3vl_processor.py
+++ b/cosmos_framework/data/vfm/processors/qwen3vl_processor.py
@@ -71,7 +71,7 @@ def apply_chat_template(
         num_video, video_fps, video_total_num_frames, video_frames_indices = maybe_parse_video_content(messages)
         if num_video > 0:
             # Here we add the args to avoid the error:
-            # File "/usr/local/lib/python3.12/dist-packages/transformers/video_processing_utils.py", line 321, in _decode_and_sample_videos
+            # File "/invalid_dir", line 321, in _decode_and_sample_videos
             #     raise ValueError(
             # ValueError: Sampling frames from a list of images is not supported! Set `do_sample_frames=False`.
             kwargs["videos_kwargs"] = dict(do_sample_frames=False)
@@ -178,7 +178,7 @@ def add_assistant_tokens_mask(self, tokens):
             "content": [
                 {
                     "type": "video",
-                    "video": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"] * 4,
+                    "video": ["https://invalid_url"] * 4,
                     "fps": 12,
                 },
                 {"type": "text", "text": "What is the capital of France?"},
diff --git a/cosmos_framework/data/vfm/sequence_packing.py b/cosmos_framework/data/vfm/sequence_packing.py
index d2821cc..1209a2d 100644
--- a/cosmos_framework/data/vfm/sequence_packing.py
+++ b/cosmos_framework/data/vfm/sequence_packing.py
@@ -685,6 +685,7 @@ def _pack_vision_tokens(
     enable_fps_modulation: bool = False,
     base_fps: float = 24.0,
     temporal_compression_factor: int = 4,
+    vision_temporal_positions: torch.Tensor | None = None,
 ) -> int:
     """Pack vision tokens into the sequence.
 
@@ -701,6 +702,8 @@ def _pack_vision_tokens(
         enable_fps_modulation: If True, scale temporal position IDs based on video FPS.
         base_fps: Base FPS for normalization (default 24.0).
         temporal_compression_factor: VAE temporal compression factor (default 4).
+        vision_temporal_positions: Optional explicit temporal coordinate per latent
+            frame, shape ``(T,)``. Used by UniAE to account for kept boundary latents.
     Returns:
         Vision split length.
     """
@@ -773,6 +776,8 @@ def _pack_vision_tokens(
     if packed_seq._use_mrope:
         # Determine FPS for this vision segment (None disables FPS modulation)
         effective_fps = vision_fps if enable_fps_modulation else None
+        if vision_temporal_positions is not None:
+            vision_temporal_positions = vision_temporal_positions.to(device="cpu", dtype=torch.float32)  # [T]
 
         vision_mrope_ids, packed_seq._mrope_temporal_offset = get_3d_mrope_ids_vae_tokens(
             grid_t=latent_t,
@@ -783,6 +788,8 @@ def _pack_vision_tokens(
             fps=effective_fps,
             base_fps=base_fps,
             temporal_compression_factor=temporal_compression_factor,
+            temporal_positions=vision_temporal_positions,
+            actual_temporal_compression_factor=temporal_compression_factor,
         )  # vision_mrope_ids: [3,N_vision_tokens]
         packed_seq.position_ids.append(vision_mrope_ids)
     else:
@@ -850,7 +857,6 @@ def _pack_action_tokens(
     packed_seq.action.token_shapes.append((action_split_len,))
     packed_seq.action.tokens.append(input_action_tokens)
 
-
     condition_set = {idx for idx in condition_frame_indexes_action if 0 <= idx < action_split_len}
     assert isinstance(packed_seq.action.condition_mask, list)
 
@@ -917,6 +923,7 @@ def _pack_sound_tokens(
     enable_fps_modulation: bool = False,
     base_fps: float = 24.0,
     sound_fps: float | None = None,
+    sound_base_temporal_compression_factor: int | None = None,
 ) -> int:
     """Pack sound/audio tokens into the sequence.
 
@@ -936,6 +943,8 @@ def _pack_sound_tokens(
         enable_fps_modulation: If True, scale temporal positions by FPS ratio.
         base_fps: Base FPS for normalization (default 24.0).
         sound_fps: Sound latent FPS (e.g., 25.0). Used for FPS-aware m-RoPE positions.
+        sound_base_temporal_compression_factor: Base temporal compression factor for sound FPS scaling.
+            ``None`` preserves the current behavior where sound advances at ``base_fps`` positions/sec.
 
     Returns:
         Number of sound tokens added.
@@ -1008,6 +1017,7 @@ def _pack_sound_tokens(
             fps=effective_fps,
             base_fps=base_fps,
             temporal_compression_factor=1,  # Sound latent is already at sound_latent_fps (no further compression)
+            base_temporal_compression_factor=sound_base_temporal_compression_factor,
             start_frame_offset=0,  # Sound[0] aligns with vision frame 0
         )  # sound_mrope_ids: [3,N_sound_tokens]
         packed_seq.position_ids.append(sound_mrope_ids)
@@ -1047,11 +1057,18 @@ def _pack_supertokens_temporal_causal(
     ``num_action_tokens_per_supertoken=0`` is stamped on the pack and read by the
     attention builder so NATTEN metadata stays in sync automatically.
 
-    mRoPE layout (with actions, unified_3d_mrope only):
-        - Null actions (frame 0):          all tcf tokens at ``temporal_offset``.
-        - Real training actions (frames 1..T-1): ``start_frame_offset=1`` so the
-          last action in group i co-locates with vision frame i.
-        - AR real actions (single supertoken): ``start_frame_offset=0``.
+    mRoPE layout (with actions, unified_3d_mrope only). The layout is inferred from the
+    action tensor shape:
+        - Whole-clip training (frame 0 is the clean conditioning frame, so
+          ``real_actions`` has ``(T-1)*tcf`` rows): null action for supertoken 0, real
+          actions for frames 1..T-1 with ``start_frame_offset=1`` so the last action in
+          group i co-locates with vision frame i; vision uses ``start_frame_offset=0``.
+        - AR generation, single frame OR chunk (every frame carries a real action, so
+          ``real_actions`` has ``latent_t*tcf`` rows): vision AND action both use
+          ``start_frame_offset=1``, generalizing the single-frame AR supertoken to
+          ``latent_t`` frames. The caller (``pack_input_sequence_autoregressive``)
+          seeds ``temporal_offset`` one frame-stride back to compensate, so the unit
+          lands at the same absolute positions as the whole-clip training pack.
         - Interleaved per frame as cat([action_ids, vision_ids]).
 
     ``input_timestep`` is float (TF/none) or Tensor(T_max,) (DF, per-frame sigma).
@@ -1094,32 +1111,36 @@ def _pack_supertokens_temporal_causal(
     if pack_action_tokens:
         # Build all_action_tokens: shape (latent_t * tcf, action_dim)
         #
-        # Cases:
-        #   1. Training with conditioning frame (latent_t > 1, real_actions < latent_t*tcf):
-        #      Prepend tcf null tokens for frame 0, then real actions for frames 1..T-1.
-        #   2. KV-cache continuation (latent_t > 1, real_actions == latent_t*tcf): all supertokens
-        #      carry real actions (no conditioning frame in-segment).
-        #   3. AR frame N>0 (latent_t == 1, action provided): real actions, no null prefix.
-        #   4. AR frame 0 / image2video (action is None): all null tokens.
+        # Cases (token assembly; mRoPE start_frame_offset is chosen separately below,
+        # inferred from the same action shape):
+        #   1. Whole-clip training with conditioning frame (latent_t > 1, real_actions
+        #      has (T-1)*tcf rows): prepend tcf null tokens for frame 0, then real
+        #      actions for frames 1..T-1.
+        #   2. AR generation (every frame has a real action, real_actions has
+        #      latent_t*tcf rows — single frame OR chunk): no null prefix.
+        #   3. AR frame 0 / image2video (action is None): all null tokens.
         if input_action_tokens is not None:
-            # input_action_tokens shape: (1, T*tcf, D) or (T*tcf, D) for training; (tcf, D) for AR frame N>0
+            # input_action_tokens shape: (1, N, D) or (N, D); N is (T-1)*tcf for whole-clip training, T*tcf for AR units
             if input_action_tokens.dim() == 3:
                 real_actions = input_action_tokens.squeeze(0)  # [T*tcf,action_dim] or [N,action_dim]
             else:
                 real_actions = input_action_tokens  # [N,action_dim]
             null_tokens = torch.zeros(tcf, action_dim, device=device, dtype=real_actions.dtype)  # [tcf,action_dim]
-            if latent_t == 1:
-                # AR frame N>0: single supertoken with real actions, no null prefix
-                all_action_tokens = real_actions  # [tcf,action_dim]
-                null_action_flag = False
-            elif real_actions.shape[0] == latent_t * tcf:
-                # All frames have real actions (e.g. KV-cache continuation segments)
+            if real_actions.shape[0] == latent_t * tcf:
+                # AR generation (single frame: tcf == 1*tcf, or chunk: latent_t*tcf):
+                # every supertoken carries a real action, no null prefix.
                 all_action_tokens = real_actions
                 null_action_flag = False
-            else:
+            elif real_actions.shape[0] == (latent_t - 1) * tcf:
                 # Conditioning frame present: null for supertoken 0, real for 1..T-1
                 all_action_tokens = torch.cat([null_tokens, real_actions], dim=0)  # [T*tcf,action_dim]
                 null_action_flag = True
+            else:
+                raise ValueError(
+                    "Temporal-causal action tokens must have either latent_t*tcf rows for AR chunks "
+                    f"or (latent_t-1)*tcf rows for whole-clip training; got {real_actions.shape[0]} rows "
+                    f"for latent_t={latent_t}, tcf={tcf}."
+                )
         else:
             # AR frame 0 or image2video: all action tokens are null
             all_action_tokens = torch.zeros(
@@ -1171,14 +1192,17 @@ def _pack_supertokens_temporal_causal(
         temporal_offset = packed_seq._mrope_temporal_offset
         effective_vision_fps = vision_fps if enable_fps_modulation else None
 
-        # AR frame N>=1 with action_gen=True (latent_t==1 and real actions supplied):
-        # shift both vision and action by start_frame_offset=1 so the last action in
-        # the group co-locates with vision frame N, mirroring training's layout.
-        # All other cases (training latent_t>1, AR action_gen=False, AR frame 0 null)
-        # keep start_frame_offset=0. The caller in pack_input_sequence_autoregressive
-        # seeds temporal_offset accordingly (N-1 frames back when this shift applies).
-        ar_with_real_actions = latent_t == 1 and pack_action_tokens and input_action_tokens is not None
-        vision_sfo = 1 if ar_with_real_actions else 0
+        # AR generation (single frame OR chunk) is detected by every frame carrying a
+        # real action (``real_actions`` has ``latent_t*tcf`` rows). There, vision AND
+        # action both use start_frame_offset=1 so the last action in each group
+        # co-locates with its vision frame, mirroring whole-clip training; the caller
+        # (pack_input_sequence_autoregressive) seeds temporal_offset one frame-stride
+        # back to compensate. Whole-clip training (frame 0 is the null conditioning
+        # frame, ``real_actions`` has ``(T-1)*tcf`` rows) keeps vision start_frame_offset=0.
+        all_frames_have_real_action = (
+            pack_action_tokens and input_action_tokens is not None and real_actions.shape[0] == latent_t * tcf
+        )
+        vision_sfo = 1 if all_frames_have_real_action else 0
 
         vision_ids_flat, new_offset = get_3d_mrope_ids_vae_tokens(
             grid_t=latent_t,
@@ -1195,10 +1219,10 @@ def _pack_supertokens_temporal_causal(
         if pack_action_tokens:
             effective_action_fps = action_fps if enable_fps_modulation else None
 
-            # Action IDs: null for frame 0 (all tcf tokens share temporal_offset,
-            # co-located with vision frame 0), real for frames 1..T-1.
-            # Real tokens (training and AR) use start_frame_offset=1 so the last
-            # action in a group co-locates with vision frame i.
+            # Action IDs. Real action tokens use start_frame_offset=1 so the last
+            # sub-token of a group co-locates with its vision frame. Whole-clip training
+            # has a null action at frame 0 (the conditioning frame); AR units have a real
+            # action for every frame.
             fps_active = effective_action_fps is not None
             t_dtype = torch.float32 if fps_active else torch.long
             t_offset = float(temporal_offset) if fps_active else int(temporal_offset)
@@ -1221,28 +1245,24 @@ def _real_action_ids(n_frames: int, start_frame_offset: int) -> torch.Tensor:
                 )
                 return flat.reshape(3, n_frames, tcf)  # [3,n_frames,tcf]
 
-            if latent_t > 1 and input_action_tokens is not None:
-                if real_actions.shape[0] == latent_t * tcf:
-                    # KV continuation: real action in every supertoken (including frame 0)
-                    action_ids_3d = _real_action_ids(latent_t, start_frame_offset=0)
-                else:
-                    # Training with conditioning frame: supertoken 0 = null, 1..T-1 = real
-                    null_ids_3d = null_ids.reshape(3, 1, tcf)  # [3,1,tcf]
-                    real_ids_3d = _real_action_ids(latent_t - 1, start_frame_offset=1)  # [3,T-1,tcf]
-                    action_ids_3d = torch.cat([null_ids_3d, real_ids_3d], dim=1)  # [3,T,tcf]
+            if all_frames_have_real_action:
+                # AR generation (single frame: tcf == 1*tcf, or chunk: latent_t*tcf):
+                # every supertoken carries a real action. start_frame_offset=1 puts
+                # a_{j-1}'s last sub-token on vision frame j -- the whole-clip TF
+                # training layout. The caller seeds temporal_offset one frame-stride
+                # back to compensate.
+                action_ids_3d = _real_action_ids(latent_t, start_frame_offset=1)  # [3,T,tcf]
             elif latent_t > 1:
-                # No action tensor (all-null layout): same ID structure as training w/ conditioning frame.
+                # Whole-clip training: supertoken 0 = null (conditioning frame), frames
+                # 1..T-1 = real with start_frame_offset=1. Covers real-action training
+                # (real_actions has (T-1)*tcf rows) and the architectural all-null layout
+                # (input_action_tokens is None); the tokens differ but the IDs match.
                 null_ids_3d = null_ids.reshape(3, 1, tcf)  # [3,1,tcf]
                 real_ids_3d = _real_action_ids(latent_t - 1, start_frame_offset=1)  # [3,T-1,tcf]
                 action_ids_3d = torch.cat([null_ids_3d, real_ids_3d], dim=1)  # [3,T,tcf]
-            elif input_action_tokens is None:
-                # AR frame 0 / image2video: only null
-                action_ids_3d = null_ids.reshape(3, 1, tcf)  # [3,1,tcf]
             else:
-                # AR frame N>=1: single supertoken with real actions. start_frame_offset=1
-                # matches training (last action co-locates with vision frame N); caller
-                # seeds temporal_offset to (N-1) frame-strides back to compensate.
-                action_ids_3d = _real_action_ids(1, start_frame_offset=1)  # [3,1,tcf]
+                # AR frame 0 / image2video (latent_t == 1, no action): only null.
+                action_ids_3d = null_ids.reshape(3, 1, tcf)  # [3,1,tcf]
 
             # (3, T*H*W) → (3, T, H*W)
             vision_ids_3d = vision_ids_flat.reshape(3, latent_t, patches_per_frame)  # [3,T,patch_h*patch_w]
@@ -1309,7 +1329,9 @@ def pack_input_sequence(
     unified_3d_mrope_temporal_modality_margin: int = 0,
     enable_fps_modulation: bool = False,
     base_fps: float = 24.0,
+    sound_base_temporal_compression_factor: int | None = None,
     temporal_compression_factor: int = 4,
+    vision_temporal_position_mode: str = "latent_index",
     video_temporal_causal: bool = False,
     action_dim: int = 32,
     initial_mrope_temporal_offset: int | float = 0,
@@ -1347,8 +1369,13 @@ def pack_input_sequence(
             Uses the same flag as diffusion_expert_config.enable_fps_modulation.
         base_fps: Base FPS for normalization (default 24.0).
             Uses the same value as diffusion_expert_config.base_fps.
+        sound_base_temporal_compression_factor: Base temporal compression factor for sound FPS scaling.
+            ``None`` preserves the current behavior where sound advances at ``base_fps`` positions/sec.
         temporal_compression_factor: VAE temporal compression factor (default 4).
             Obtained from the VAE tokenizer at runtime.
+        vision_temporal_position_mode: Temporal coordinates used for unified_3d_mrope vision tokens.
+            "latent_index" keeps legacy positions; "uniae_source_right_edge" uses
+            per-latent positions from gen_data_clean.temporal_positions_vision.
     Returns:
         PackedSequence containing all packed tensors and metadata. See PackedSequence for field details.
     """
@@ -1361,6 +1388,44 @@ def pack_input_sequence(
     if isinstance(input_text_indexes, torch.Tensor):
         raise ValueError("input_text_tokens must be a list, not a tensor")
 
+    supported_vision_temporal_position_modes = {"latent_index", "uniae_source_right_edge"}
+    if vision_temporal_position_mode not in supported_vision_temporal_position_modes:
+        raise ValueError(
+            "Unsupported vision_temporal_position_mode: "
+            f"{vision_temporal_position_mode}. Supported modes: {supported_vision_temporal_position_modes}."
+        )
+    has_any_vision = any(plan.has_vision for plan in sequence_plans)
+    explicit_vision_temporal_positions_active = vision_temporal_position_mode != "latent_index" and has_any_vision
+    if explicit_vision_temporal_positions_active:
+        if position_embedding_type != "unified_3d_mrope":
+            raise NotImplementedError(
+                "Explicit vision temporal positions are only supported with position_embedding_type='unified_3d_mrope'."
+            )
+        if gen_data_clean.temporal_positions_vision is None:
+            raise ValueError(
+                f"vision_temporal_position_mode={vision_temporal_position_mode} requires "
+                "gen_data_clean.temporal_positions_vision."
+            )
+        if gen_data_clean.x0_tokens_vision is not None and len(gen_data_clean.temporal_positions_vision) != len(
+            gen_data_clean.x0_tokens_vision
+        ):
+            raise ValueError(
+                "temporal_positions_vision must have one entry per x0_tokens_vision item, "
+                f"got {len(gen_data_clean.temporal_positions_vision)} positions for "
+                f"{len(gen_data_clean.x0_tokens_vision)} vision items."
+            )
+        if video_temporal_causal:
+            raise NotImplementedError(
+                "video_temporal_causal=True is not wired for explicit UniAE vision temporal positions yet."
+            )
+        if any(plan.has_action for plan in sequence_plans):
+            raise NotImplementedError("Action packing is not wired for explicit UniAE vision temporal positions yet.")
+        if initial_mrope_temporal_offset != 0:
+            raise NotImplementedError(
+                "Autoregressive mRoPE temporal offsets are not wired for explicit UniAE vision temporal positions yet."
+            )
+    use_float_mrope_positions = enable_fps_modulation or explicit_vision_temporal_positions_active
+
     # Initialize packed sequence (acts as builder during packing)
     packed_seq = PackedSequence()
 
@@ -1405,7 +1470,7 @@ def pack_input_sequence(
                 special_tokens,
                 curr_rope_id,
                 has_generation=has_generation_for_sample,
-                use_float_positions=enable_fps_modulation,
+                use_float_positions=use_float_mrope_positions,
             )
             sample_len += text_sample_len
 
@@ -1496,6 +1561,7 @@ def pack_input_sequence(
                 shared_latent_t: int | None = None
                 shared_patch_h: int | None = None
                 shared_patch_w: int | None = None
+                shared_temporal_positions: torch.Tensor | None = None
                 # FPS is recorded per-sample (shape [B]); for multi-item samples
                 # (transfer / image-edit) every vision item in this sample shares
                 # the same conditioning FPS, so we read by sample_idx, not by the
@@ -1510,7 +1576,18 @@ def pack_input_sequence(
                     sample_vision_fps = float(gen_data_clean.fps_vision[sample_idx].item())
 
                 for item_idx in range(num_vis):
-                    input_vision_tokens = gen_data_clean.x0_tokens_vision[idx_vision]
+                    flat_vision_idx = idx_vision
+                    input_vision_tokens = gen_data_clean.x0_tokens_vision[flat_vision_idx]
+                    vision_temporal_positions: torch.Tensor | None = None
+                    if explicit_vision_temporal_positions_active:
+                        assert gen_data_clean.temporal_positions_vision is not None
+                        vision_temporal_positions = gen_data_clean.temporal_positions_vision[flat_vision_idx]
+                        if vision_temporal_positions.shape[0] != input_vision_tokens.shape[2]:
+                            raise ValueError(
+                                "vision_temporal_positions must match latent_t for each vision item, "
+                                f"got {vision_temporal_positions.shape[0]} positions and "
+                                f"latent_t={input_vision_tokens.shape[2]} for item {flat_vision_idx}."
+                            )
                     vision_fps = sample_vision_fps
                     idx_vision += 1
 
@@ -1544,6 +1621,19 @@ def pack_input_sequence(
                                 f"got item {item_idx} (H,W)=({item_latent_h},{item_latent_w}) "
                                 f"vs first=({shared_patch_h},{shared_patch_w})"
                             )
+                        if vision_temporal_positions is not None:
+                            if shared_temporal_positions is None:
+                                shared_temporal_positions = vision_temporal_positions
+                            else:
+                                comparison_temporal_positions = vision_temporal_positions.to(
+                                    device=shared_temporal_positions.device
+                                )  # [T]
+                                assert torch.allclose(comparison_temporal_positions, shared_temporal_positions), (
+                                    "share_vision_temporal_positions requires equal explicit temporal positions "
+                                    f"across vision items, got item {item_idx} positions "
+                                    f"{vision_temporal_positions.tolist()} vs first "
+                                    f"{shared_temporal_positions.tolist()}."
+                                )
                         # Rewind so this item starts at the same temporal offset as item 0.
                         packed_seq._mrope_temporal_offset = items_temporal_offset_snapshot
 
@@ -1558,6 +1648,7 @@ def pack_input_sequence(
                         enable_fps_modulation=enable_fps_modulation,
                         base_fps=base_fps,
                         temporal_compression_factor=temporal_compression_factor,
+                        vision_temporal_positions=vision_temporal_positions,
                     )
                     vision_split_len += item_split_len
                 sample_len += vision_split_len
@@ -1622,6 +1713,7 @@ def pack_input_sequence(
                 enable_fps_modulation=enable_fps_modulation,
                 base_fps=base_fps,
                 sound_fps=sound_fps,
+                sound_base_temporal_compression_factor=sound_base_temporal_compression_factor,
             )
             sample_len += sound_split_len
         else:
@@ -1641,8 +1733,8 @@ def pack_input_sequence(
 
             # EOV position IDs: 3D mRoPE or 1D RoPE
             if packed_seq._use_mrope:
-                # Use float dtype when FPS modulation is enabled for consistency
-                eov_dtype = torch.float32 if enable_fps_modulation else torch.long
+                # Use float dtype when any vision mRoPE positions are fractional.
+                eov_dtype = torch.float32 if use_float_mrope_positions else torch.long
                 eov_mrope_ids = torch.full((3, 1), packed_seq._mrope_temporal_offset, dtype=eov_dtype)  # [3,1]
                 packed_seq.position_ids.append(eov_mrope_ids)  # type: ignore[arg-type]
                 packed_seq._mrope_temporal_offset += 1
@@ -2095,7 +2187,7 @@ def verify_natten_parameter_list(
             {'window_size_float': (0.5, 0.5), 'dilation_float': (1.0, 0.0)}  # valid
 
             # Fixed window size of 8x8, dilation of 2x1.
-
+            # NOTE: requires ALL inputs to be at least 16x8
             {'window_size': (8, 8), 'dilation': (2, 1)}  # valid
 
             # Multi-profile: different parameters for 2D (images) and 3D (videos)
@@ -2231,7 +2323,7 @@ def generate_natten_metadata(
             {'window_size_float': (0.5, 0.5), 'dilation_float': (1.0, 0.0)}  # valid
 
             # Fixed window size of 8x8, dilation of 2x1.
-
+            # NOTE: requires ALL inputs to be at least 16x8
             {'window_size': (8, 8), 'dilation': (2, 1)}  # valid
 
             # Invalid:
@@ -2363,9 +2455,9 @@ def filter_shape(shape: tuple) -> tuple:
             is_causal = dim_params["is_causal"]
 
             # Create varlen metadata for natten varlen/varsized ops
-
+            # NOTE: generate_multi_dim_varlen_parameters will automatically map window size -1 to
             # full size, that's why constant window sizes aren't allowed.
-
+            # NOTE: if any of the parameters are constant, natten will simplify them
             natten_metadata.append(
                 generate_multi_dim_varlen_parameters(
                     token_layout_list=token_layout_list,
@@ -2780,7 +2872,6 @@ def build_sequence_plans_from_data_batch(
     Returns:
         List of SequencePlan objects, one per sample in the batch.
     """
-
     # For new modalities, please generate the sequence_plan in the dataset class!!!!
 
     # If sequence_plan already exists in data_batch, return it
@@ -2790,7 +2881,6 @@ def build_sequence_plans_from_data_batch(
     assert "action" not in data_batch or data_batch["action"] is None, "Action data SHOULD have sequence_plans!"
     assert "sound" not in data_batch or data_batch["sound"] is None, "Sound data SHOULD have sequence_plans!"
 
-
     # Determine batch size from available tensors
     batch_size = 0
     for key in [input_video_key, input_image_key]:
diff --git a/cosmos_framework/data/vfm/sound_data_utils.py b/cosmos_framework/data/vfm/sound_data_utils.py
index 0a8ec63..2d739b0 100644
--- a/cosmos_framework/data/vfm/sound_data_utils.py
+++ b/cosmos_framework/data/vfm/sound_data_utils.py
@@ -3,7 +3,8 @@
 
 """Sound data utilities for building sequence plans and handling audio-video generation modes.
 
-This module provides utilities for building SequencePlan objects based on sound generation modes.
+This module provides utilities for building SequencePlan objects based on sound generation modes,
+similar to how action modes are handled in cosmos_framework/data/vfm/action/data_utils.py.
 
 Supported modes:
     - t2vs: Text → Video + Sound (joint generation)
@@ -27,7 +28,8 @@ def build_sequence_plan_for_sound(
     """Build a SequencePlan based on the sound generation mode.
 
     This function determines the appropriate condition frame indexes for vision and sound
-    based on the specified mode.
+    based on the specified mode. It mirrors how `build_sequence_plan_from_mode` works
+    for action in cosmos_framework/data/vfm/action/data_utils.py.
 
     Args:
         mode: Generation mode. One of:
diff --git a/cosmos_framework/data/vfm/utils.py b/cosmos_framework/data/vfm/utils.py
index ab6f238..8b04f0a 100644
--- a/cosmos_framework/data/vfm/utils.py
+++ b/cosmos_framework/data/vfm/utils.py
@@ -6,7 +6,6 @@
 
 IMAGE_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {
     # Our desired 256 resolution is the one below (commented).
-
     # Desired: "256": {"1,1": (336, 336), "4,3": (384, 288), "3,4": (288, 384), "16,9": (448, 256), "9,16": (256, 448)},
     "256": {
         "1,1": (256, 256),
@@ -41,7 +40,6 @@
 
 VIDEO_RES_SIZE_INFO: dict[str, dict[str, tuple[int, int]]] = {
     # Our desired 256 resolution is the one below (commented).
-
     # Desired: "256": {"1,1": (336, 336), "4,3": (384, 288), "3,4": (288, 384), "16,9": (448, 256), "9,16": (256, 448)},
     "256": {
         "1,1": (256, 256),
@@ -111,10 +109,42 @@ def parse_frame_range_from_wdinfo(wdinfo: str) -> tuple[int, int] | None:
     return None
 
 
+def _normalize_skip_frame_ranges(
+    skip_frame_range: str | list[str] | None,
+) -> set[tuple[int, int]]:
+    """Normalize ``skip_frame_range`` into a set of (min_frames, max_frames) buckets.
+
+    Args:
+        skip_frame_range: A single bucket string like ``"300_400"``, a list of such
+            strings, or None. Each string identifies the frame-range bucket
+            (e.g. ``frames_300_400``) that should be skipped.
+
+    Returns:
+        Set of (min_frames, max_frames) tuples to skip. Empty if ``skip_frame_range`` is None.
+    """
+    if skip_frame_range is None:
+        return set()
+
+    if isinstance(skip_frame_range, str):
+        skip_frame_range = [skip_frame_range]
+
+    skip_buckets: set[tuple[int, int]] = set()
+    for bucket in skip_frame_range:
+        match = re.fullmatch(r"(\d+)_(\d+)", bucket.strip())
+        if match is None:
+            raise ValueError(
+                f"Invalid skip_frame_range entry {bucket!r}. Expected the form '<min>_<max>', e.g. '300_400'."
+            )
+        skip_buckets.add((int(match.group(1)), int(match.group(2))))
+
+    return skip_buckets
+
+
 def filter_wdinfos_by_frame_range(
     wdinfos: list[str],
     min_frames: int | None = None,
     max_frames: int | None = None,
+    skip_frame_range: str | list[str] | None = None,
 ) -> list[str]:
     """
     Filter wdinfo files based on frame range.
@@ -125,10 +155,16 @@ def filter_wdinfos_by_frame_range(
     - min_frames is EXCLUSIVE: wdinfo_max must be > min_frames
     - max_frames is INCLUSIVE: wdinfo_max must be <= max_frames
 
+    Additionally, any wdinfo whose frame-range bucket matches an entry in
+    ``skip_frame_range`` is excluded.
+
     Args:
         wdinfos: List of wdinfo paths
         min_frames: Minimum number of frames (exclusive). If None, no lower bound.
         max_frames: Maximum number of frames (inclusive). If None, no upper bound.
+        skip_frame_range: Frame-range bucket(s) to exclude, e.g. ``"300_400"`` to
+            drop the ``frames_300_400`` bucket. Accepts a single string or a list
+            of strings. If None, no bucket is skipped.
 
     Returns:
         Filtered list of wdinfo paths
@@ -144,8 +180,14 @@ def filter_wdinfos_by_frame_range(
         # frames_400_500 excluded because wdinfo_max (500) <= min_frames (500)
         # frames_500_600 included because wdinfo_max (600) > min_frames (500) AND <= max_frames (600)
         # frames_600_700 excluded because wdinfo_max (700) > max_frames (600)
+
+        >>> filter_wdinfos_by_frame_range(wdinfos, skip_frame_range="500_600")
+        ['wdinfo/frames_400_500/wdinfo.json', 'wdinfo/frames_600_700/wdinfo.json']
+        # frames_500_600 excluded because its bucket matches skip_frame_range
     """
-    if min_frames is None and max_frames is None:
+    skip_buckets = _normalize_skip_frame_ranges(skip_frame_range)
+
+    if min_frames is None and max_frames is None and not skip_buckets:
         return wdinfos
 
     filtered = []
@@ -158,6 +200,10 @@ def filter_wdinfos_by_frame_range(
 
         wdinfo_min, wdinfo_max = frame_range
 
+        # Skip explicitly excluded buckets (matched on the full (min, max) bucket).
+        if (wdinfo_min, wdinfo_max) in skip_buckets:
+            continue
+
         # Filter based on wdinfo's upper bound (wdinfo_max):
         # - min_frames is exclusive: wdinfo_max must be > min_frames
         # - max_frames is inclusive: wdinfo_max must be <= max_frames
diff --git a/cosmos_framework/data/vfm/vlm/video_decoder_qwen.py b/cosmos_framework/data/vfm/vlm/video_decoder_qwen.py
new file mode 100644
index 0000000..12c9bcc
--- /dev/null
+++ b/cosmos_framework/data/vfm/vlm/video_decoder_qwen.py
@@ -0,0 +1,249 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""
+Copied from projects/cosmos/reason1/datasets/video_decoder_qwen.py
+Changes:
+1: remove hardcoded hyper-parameters for Qwen; they are now read from the processor
+2: support skipping smart resize, since it may resize the video frames to be smaller than the model input, after which the processor resizes them up again
+"""
+
+import random
+import re
+from concurrent.futures import ThreadPoolExecutor
+from functools import partial
+from typing import Callable, Optional
+
+import torch
+from PIL import Image
+from qwen_vl_utils.vision_process import smart_nframes, smart_resize
+from torchcodec.decoders import VideoDecoder
+from torchvision import transforms
+from torchvision.transforms import InterpolationMode
+
+from cosmos_framework.utils import log
+from cosmos_framework.data.vfm.processors.qwen3vl_processor import Qwen3VLProcessor
+
+Image.MAX_IMAGE_PIXELS = 933120000
+_VIDEO_EXTENSIONS = "mp4 avi webm mov".split()
+
+VIDEO_DECODER_OPTIONS = {}
+
+
+def token_to_pixels(token_length: int, patch_size: int = 14, temporal_patch_size: int = 2, merge_size: int = 2) -> int:
+    """Convert token length to pixels based on patch size and temporal patch size.
+
+    Args:
+        token_length: Token length
+        patch_size: Patch size
+        temporal_patch_size: Temporal patch size,
+            Qwen uses a 3D conv, so its temporal patch size is 2; for other models like internVL or eagle er, the temporal patch size is 1 since their ViT is an image encoder;
+        merge_size: Merge size, also called the pixel shuffling factor;
+            for Qwen and internVL it is 2; for eagle er it is 1;
+    """
+    merged_patch_size = patch_size * merge_size
+    return token_length * merged_patch_size**2 * temporal_patch_size
+
+
+def pixels_to_token(pixels: int, patch_size: int = 14, temporal_patch_size: int = 2, merge_size: int = 2) -> int:
+    """Convert pixels to token length based on patch size and temporal patch size."""
+    merged_patch_size = patch_size * merge_size
+    return pixels // merged_patch_size**2 // temporal_patch_size
+
+
+def video_decoder_qwen(
+    num_threads: int = 0,
+    min_fps_thres: int = 4,
+    max_fps_thres: int = 60,
+    target_fps: float = 2.0,
+    min_video_token_length: int = 16,
+    max_video_token_length: int = 8192,
+    random_augmentation: bool = False,
+    frame_count_random_range: Optional[list[int]] = None,
+    **kwargs,
+) -> Callable:
+    """
+    Sampling video frames similar to Qwen. It prioritizes matching the target FPS first and then resizing the video frames.
+    See https://github.com/kq-chen/qwen-vl-utils/blob/main/src/qwen_vl_utils/vision_process.py#L118 for more details.
+
+    Args:
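+        key: Video file name/key (passed to the returned callable, not to this factory)
+        data: Video binary data (passed to the returned callable, not to this factory)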
+        min_fps_thres: Minimum FPS threshold
+        max_fps_thres: Maximum FPS threshold
+        target_fps: Target FPS
+        min_video_token_length: Minimum token length
+        max_video_token_length: Maximum token length
+        num_threads: Number of threads for the torchcodec video decoder
+        random_augmentation: Whether to randomize the FPS and max_video_token_length
+        frame_count_random_range: Random frame count range
+
+    Returns:
+        Callable that decodes one video and returns a dict with the video frames tensor and sampled FPS (or None)
+    """
+
+    video_decoder_configured = partial(
+        _video_decoder_qwen_func,
+        min_fps_thres=min_fps_thres,
+        max_fps_thres=max_fps_thres,
+        num_threads=num_threads,
+        target_fps=target_fps,
+        min_video_token_length=min_video_token_length,
+        max_video_token_length=max_video_token_length,
+        random_augmentation=random_augmentation,
+        frame_count_random_range=frame_count_random_range,
+    )
+
+    return video_decoder_configured
+
+
+def _video_decoder_qwen_func(
+    key: str,
+    data: bytes,
+    processor: Qwen3VLProcessor,
+    min_fps_thres: int = 4,
+    max_fps_thres: int = 60,
+    target_fps: float = 2.0,
+    min_video_token_length: int = 16,
+    max_video_token_length: int = 8192,
+    num_threads: int = 0,
+    random_augmentation: bool = False,
+    fps_random_range: list[float] = [0.5, 1.5],
+    max_video_token_length_random_range: list[float] = [0.75, 1.25],
+    frame_count_random_range: Optional[list[int]] = None,
+    start_frame: Optional[int] = None,
+    end_frame: Optional[int] = None,
+    decoding_timeout: int = 60,
+    **kwargs,
+) -> dict | None:
+    """Actual video decoder function.
+
+    Args:
+        key (str): Video file name/key
+        data (bytes): Video binary data
+        min_fps_thres (int, optional): Minimum FPS threshold. Defaults to 4.
+        max_fps_thres (int, optional): Maximum FPS threshold. Defaults to 60.
+        target_fps (float, optional): Target FPS. Defaults to 2.0.
+        min_video_token_length (int, optional): Minimum token length. Defaults to 16.
+        max_video_token_length (int, optional): Maximum token length. Defaults to 8192.
+        num_threads (int, optional): Number of threads for the torchcodec video decoder. Defaults to 0.
+        random_augmentation (bool, optional): Whether to randomize the FPS and max_video_token_length. Defaults to False.
+        fps_random_range (list[float], optional): Random multiplier range applied to target_fps. Defaults to [0.5, 1.5].
+        max_video_token_length_random_range (list[float], optional): Random multiplier range applied to max_video_token_length. Defaults to [0.75, 1.25].
+        frame_count_random_range (list[int], optional): Random frame count range. If provided, take priority over fps_random_range.
+        start_frame (Optional[int], optional): Start frame. Defaults to None. If both start_frame and end_frame are provided, the video will be decoded from start_frame to end_frame.
+        end_frame (Optional[int], optional): End frame. Defaults to None. If both start_frame and end_frame are provided, the video will be decoded from start_frame to end_frame.
+        decoding_timeout (int, optional): Timeout in seconds. Defaults to 60.
+    Raises:
+        ValueError: Video fps lower than 1, skipping
+        ValueError: Video fps lower than min_fps_thres, skipping
+        ValueError: Video fps higher than max_fps_thres, skipping
+
+    Returns:
+        dict | None: Dict with ``videos`` (frames as [C,T,H,W]) and ``fps`` (effective sampled FPS); None for a non-video extension or a decoding timeout.
+    """
+    # Check video extension
+    extension = re.sub(r".*[.]", "", key)
+    if extension.lower() not in _VIDEO_EXTENSIONS:
+        return None
+
+    # Read video with torchcodec
+    video_reader = VideoDecoder(data, num_ffmpeg_threads=num_threads)
+    total_frames = video_reader.metadata.num_frames
+    video_fps = video_reader.metadata.average_fps
+
+    # torchcodec returns ``None`` for containers that don't store frame count
+    # or average fps (e.g. some MKV/WebM streams).  Downstream arithmetic
+    # (``total_frames - 1``, ``video_fps < 1``, ...) would TypeError on None;
+    # surface a ValueError so the dataloader's skip path handles it uniformly.
+    if total_frames is None or video_fps is None:
+        raise ValueError(f"torchcodec missing metadata (num_frames={total_frames}, average_fps={video_fps}), skipping")
+
+    if start_frame is not None and end_frame is not None:
+        total_frames = end_frame - start_frame
+
+    if video_fps < 1:
+        raise ValueError("Video fps lower than 1, skipping")
+    if video_fps < min_fps_thres:
+        raise ValueError(f"Video fps {video_fps} lower than {min_fps_thres}, skipping")
+    if video_fps > max_fps_thres:
+        raise ValueError(f"Video fps {video_fps} higher than {max_fps_thres}, skipping")
+
+    if random_augmentation:
+        if frame_count_random_range is not None:
+            # Random number of frames
+            min_frames_range, max_frames_range = frame_count_random_range
+            min_frames_range = min(min_frames_range, total_frames)
+            max_frames_range = min(max_frames_range, total_frames)
+            target_frames = random.uniform(min_frames_range, max_frames_range)
+            target_fps = target_frames / total_frames * video_fps
+        else:
+            # randomize fps
+            target_fps = (
+                random.uniform(fps_random_range[0], fps_random_range[1]) * target_fps
+                if random.random() < 0.5
+                else target_fps
+            )
+        # randomize max_video_token_length
+        max_video_token_length = int(
+            random.uniform(max_video_token_length_random_range[0], max_video_token_length_random_range[1])
+            * max_video_token_length
+        )
+        log.debug(f"random_augmentation: max_video_token_length: {max_video_token_length}, target_fps: {target_fps}")
+
+    patch_size = processor.patch_size
+    min_height_width = processor.min_height_width
+    temporal_patch_size = processor.temporal_patch_size
+    merge_size = processor.merge_size
+    min_pixels: int = token_to_pixels(min_video_token_length, patch_size, temporal_patch_size, merge_size)
+    max_pixels: int = token_to_pixels(max_video_token_length, patch_size, temporal_patch_size, merge_size)
+    max_frames: int = max_pixels // (min_height_width) ** 2 // temporal_patch_size
+
+    # sample based on target fps
+    nframes = smart_nframes(dict(fps=target_fps), total_frames=total_frames, video_fps=video_fps)
+    nframes = min(nframes, max_frames)
+    if start_frame is not None and end_frame is not None:
+        idx = torch.linspace(start_frame, end_frame - 1, nframes).round().long().tolist()  # [nframes]
+    else:
+        idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()  # [nframes]
+
+    def _decode_video() -> torch.Tensor:
+        return video_reader.get_frames_at(indices=idx).data  # [T, C, H, W] uint8
+
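+    # Use ThreadPoolExecutor to run video decoding with a timeout; if the thread is stuck, abandon it immediately.
+    # NOTE(review): on Python < 3.11 future.result() raises concurrent.futures.TimeoutError, which is not the builtin TimeoutError caught below.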
+    executor = ThreadPoolExecutor(max_workers=1)
+    future = executor.submit(_decode_video)
+    try:
+        video_frames = future.result(timeout=decoding_timeout)
+        executor.shutdown(wait=False)
+    except TimeoutError as e:
+        log.warning(f"[{key}] Video decoding timed out after {decoding_timeout} seconds")
+        executor.shutdown(wait=False)
+        return None
+
+    sample_fps = nframes / max(total_frames, 1e-6) * video_fps
+
+    # recompute max_pixels based on number of sampled frames
+    nframes, _, height, width = video_frames.shape
+    max_pixels = max_pixels // nframes
+    if processor.use_smart_resize:
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=patch_size * merge_size,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+        log.debug(
+            f"resized_height: {resized_height}, resized_width: {resized_width} | original height: {height}, original width: {width}"
+        )
+        video_frames = transforms.functional.resize(
+            video_frames,
+            [resized_height, resized_width],
+            interpolation=InterpolationMode.BICUBIC,
+            antialias=True,
+        ).float()  # [T,C,H,W]
+    video_frames = video_frames.permute(1, 0, 2, 3)  # [C,T,H,W]
+
+    return dict(videos=video_frames, fps=sample_fps)
diff --git a/cosmos_framework/data/vlm/processors/nemotron3densevl_processor.py b/cosmos_framework/data/vlm/processors/nemotron3densevl_processor.py
new file mode 100644
index 0000000..fd8406f
--- /dev/null
+++ b/cosmos_framework/data/vlm/processors/nemotron3densevl_processor.py
@@ -0,0 +1,248 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+import os
+from typing import Dict, List, Optional
+
+import numpy as np
+import torch
+from PIL import Image
+from qwen_vl_utils.vision_process import smart_resize
+from transformers.models.auto.processing_auto import AutoProcessor
+
+from cosmos_framework.utils import log
+from cosmos_framework.utils.vlm.pretrained_models_downloader import maybe_download_hf_model_from_s3
+
+
+def convert_string_content_to_list_content(messages: List[Dict]) -> List[Dict]:
+    """Wrap plain-string message content in the list-of-parts form.
+
+    Every message whose ``"content"`` is a ``str`` has it replaced, in place,
+    by ``[{"type": "text", "text": <original string>}]``. Messages whose
+    content is not a string are left untouched.
+
+    Args:
+        messages: Chat messages; every entry must have a ``"content"`` key.
+
+    Returns:
+        The same ``messages`` list object that was passed in.
+    """
+    for entry in messages:
+        raw_content = entry["content"]
+        if not isinstance(raw_content, str):
+            continue
+        entry["content"] = [{"type": "text", "text": raw_content}]
+    return messages
+
+
+def maybe_parse_video_content(
+    messages: List[Dict],
+) -> tuple[int, Optional[list[float]], Optional[list[int]], Optional[list[list[int]]]]:
+    """Collect metadata for every pre-decoded video found in ``messages``.
+
+    Only content parts with ``type == "video"`` whose ``"video"`` value is a
+    ``list`` are counted; the length of that list is used as the frame count.
+    Messages whose content is not a list are ignored. A critical log entry is
+    emitted for each counted video that carries no ``"fps"`` value.
+
+    Args:
+        messages: Chat messages, each with a ``"content"`` entry.
+
+    Returns:
+        A 4-tuple ``(num_video, video_fps, video_total_num_frames,
+        video_frames_indices)``. The three lists hold one entry per counted
+        video, in encounter order: its ``"fps"`` value (``None`` when absent),
+        its number of frames, and ``[0, 1, ..., n_frames - 1]``.
+    """
+    fps_per_video = []
+    frame_counts = []
+    frame_indices = []
+    for message in messages:
+        parts = message["content"]
+        if not isinstance(parts, list):
+            continue
+        for part in parts:
+            if part.get("type", "") != "video" or not isinstance(part["video"], list):
+                continue
+            part_fps = part.get("fps", None)
+            if part_fps is None:
+                log.critical(
+                    f"fps is None for video {part}. Better to set the fps explicitly", rank0_only=False
+                )
+            n_frames = len(part["video"])
+            fps_per_video.append(part_fps)
+            frame_counts.append(n_frames)
+            frame_indices.append(list(range(n_frames)))
+    return len(frame_counts), fps_per_video, frame_counts, frame_indices
+
+
+class Nemotron3DenseVLProcessor:
+    """Wrapper around a Hugging Face ``AutoProcessor`` that adds dataloader helpers.
+
+    It caches commonly used token ids and the vision-processor geometry, applies
+    the chat template to a single conversation, and builds the loss mask that
+    selects assistant tokens.
+    """
+
+    def __init__(
+        self,
+        name="Qwen/Qwen3-VL-2B-Init",
+        credentials: str = "./credentials/s3_training.secret",
+        bucket: str = "bucket4",
+        cache_dir: str = None,
+    ):
+        """Load the processor and cache token ids and vision geometry.
+
+        Args:
+            name: Local directory holding the processor files, or a model name
+                resolved through ``maybe_download_hf_model_from_s3``.
+            credentials: Credentials path forwarded to the downloader.
+            bucket: Bucket name forwarded to the downloader.
+            cache_dir: Not used by this constructor; kept for interface compatibility.
+        """
+        self.name = name
+        if os.path.isdir(name):
+            model_name_or_path_local = name
+        else:
+            model_name_or_path_local = maybe_download_hf_model_from_s3(
+                name, credentials, bucket, include_model_weights=False
+            )
+
+        self.processor = AutoProcessor.from_pretrained(model_name_or_path_local, trust_remote_code=True)
+        log.info("Successfully loaded processor from local cache")
+
+        # The image / video placeholder tokens are optional on the processor.
+        if hasattr(self.processor, "image_token"):
+            self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.processor.image_token)
+        else:
+            self.image_token_id = None
+        if hasattr(self.processor, "video_token"):
+            self.video_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.processor.video_token)
+        else:
+            self.video_token_id = None
+        self.eos_id = self.processor.tokenizer.eos_token_id
+        self.pad_id = self.processor.tokenizer.pad_token_id
+        self.vision_end_id = self.processor.tokenizer.convert_tokens_to_ids("</img>")
+
+        # Helper attributes for the dataloader video decoding function
+        self.shortest_edge = self.processor.image_processor.size["shortest_edge"]
+        self.min_height_width = int(np.sqrt(self.shortest_edge))
+        self.patch_size = self.processor.video_processor.patch_size
+        self.temporal_patch_size = self.processor.video_processor.temporal_patch_size
+        self.merge_size = self.processor.video_processor.merge_size
+        self.use_smart_resize = True
+        # Fall back to the EOS id when the tokenizer defines no padding token.
+        if self.pad_id is None:
+            self.pad_id = self.eos_id
+
+    def apply_chat_template(
+        self,
+        messages,
+        add_generation_prompt=False,
+        return_tensors="pt",
+        tokenize=True,
+        **kwargs,
+    ):
+        """Tokenize a single conversation with the wrapped processor.
+
+        String message contents are converted to the list-of-parts form and
+        every image part is resized with ``smart_resize`` before the processor
+        runs. Both steps modify ``messages`` in place.
+
+        Args:
+            messages: Chat messages, each with ``"role"`` and ``"content"``.
+            add_generation_prompt: Forwarded to the processor's chat template.
+            return_tensors: Must be ``"pt"``.
+            tokenize: Must be ``True``.
+            **kwargs: Accepted for interface compatibility; the values are
+                discarded and not forwarded to the processor.
+
+        Returns:
+            The processor output, with ``input_ids`` and ``attention_mask``
+            reduced from a batch of one to a single sequence ([N_token]).
+            All other entries are returned as produced by the processor.
+
+        Raises:
+            AssertionError: If ``tokenize`` is false, ``return_tensors`` is not
+                ``"pt"``, or an image part does not hold a ``PIL.Image.Image``.
+        """
+        assert tokenize, "tokenize must be True"
+        assert return_tensors == "pt", "return_tensors must be pt"
+        # Note: this tokenizer does not support "content": str, it always expects "content" to be a list of dicts
+        messages = convert_string_content_to_list_content(messages)
+        # Caller-supplied kwargs are discarded; only the video arguments built below are forwarded.
+        kwargs = {}
+        for message in messages:
+            if not isinstance(message["content"], list):
+                continue
+            for sub_content in message["content"]:
+                if sub_content.get("type", "") != "image":
+                    continue
+                image = sub_content["image"]
+                max_pixels = sub_content.get("max_pixels", self.processor.image_processor.size["longest_edge"])
+                min_pixels = sub_content.get("min_pixels", self.processor.image_processor.size["shortest_edge"])
+                assert isinstance(image, Image.Image), (
+                    "image must be a PIL.Image.Image; URLs and lists of images are not supported for one content"
+                )
+                width, height = image.size
+                resized_height, resized_width = smart_resize(
+                    height,
+                    width,
+                    factor=32,
+                    min_pixels=min_pixels,
+                    max_pixels=max_pixels,
+                )
+                # Store the resized image back so the processor sees the new size.
+                sub_content["image"] = image.resize((resized_width, resized_height))
+
+        num_video, video_fps, video_total_num_frames, video_frames_indices = maybe_parse_video_content(messages)
+        if num_video > 0:
+            # Videos arrive as lists of frames, so frame sampling must be disabled explicitly;
+            # otherwise the processor raises:
+            # ValueError: Sampling frames from a list of images is not supported! Set `do_sample_frames=False`.
+            video_metadata = [
+                dict(fps=fps, total_num_frames=total_num_frames, frames_indices=frames_indices)
+                for fps, total_num_frames, frames_indices in zip(
+                    video_fps, video_total_num_frames, video_frames_indices
+                )
+            ]
+            kwargs["videos_kwargs"] = {
+                "do_sample_frames": False,
+                "video_metadata": video_metadata[0] if num_video == 1 else video_metadata,
+            }
+
+        inputs = self.processor.apply_chat_template(
+            messages,
+            tokenize=tokenize,
+            add_generation_prompt=add_generation_prompt,
+            return_dict=True,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        # The processor returns a batch of features; this wrapper is used per sample in the
+        # dataloader, so strip the batch dimension.
+        inputs["input_ids"] = inputs["input_ids"][0]  # [N_token]
+        inputs["attention_mask"] = inputs["attention_mask"][0]  # [N_token]
+        return inputs
+
+    def add_assistant_tokens_mask(self, tokens):
+        """Build a mask that selects the tokens generated by the assistant.
+
+        The mask is used so that only assistant tokens contribute to the loss;
+        system prompts, user prompts and chat-template tokens stay ``False``.
+        Every assistant turn of a multi-turn conversation is masked. A leading
+        ``<think>`` token (and a ``</think>`` directly after it) at the start
+        of an assistant turn is excluded from the mask.
+
+        Args:
+            tokens (Union[List[int], torch.Tensor]): Token ids, either 1-D or,
+                for tensors, a 2-D batch that is processed row by row.
+
+        Returns:
+            Union[List[bool], torch.Tensor]: ``True`` for assistant tokens
+            (loss applied), ``False`` otherwise. A tensor input yields a bool
+            tensor of the same shape; any other input yields a list.
+
+        Raises:
+            AssertionError: If the numbers of ``<|im_start|>`` and
+                ``<|im_end|>`` tokens differ, or the input is not 1-D / 2-D.
+        """
+        if isinstance(tokens, torch.Tensor) and tokens.ndim == 2:
+            mask = torch.stack(
+                [self.add_assistant_tokens_mask(tokens[i]) for i in range(tokens.shape[0])]
+            )  # [B,N_token]
+            assert mask.shape == tokens.shape
+            return mask
+        np_tokens = tokens.cpu().numpy() if isinstance(tokens, torch.Tensor) else np.array(tokens)
+        assert np_tokens.ndim == 1
+        num_tokens = len(np_tokens)
+
+        # Constants defining bos, eos and fixed offsets.
+        BOS_TOKEN = "<|im_start|>"
+        EOS_TOKEN = "<|im_end|>"
+        ROLE = "assistant"
+        # Offsets: skip the bos + "assistant\n" (always 3 tokens) and include the eos (+1) for supervision
+        START_OFFSET = 3
+        END_OFFSET = 1
+
+        # Retrieve token IDs for the markers and the role.
+        tokenizer = self.processor.tokenizer
+        bos_token_id = tokenizer.convert_tokens_to_ids(BOS_TOKEN)
+        eos_token_id = tokenizer.convert_tokens_to_ids(EOS_TOKEN)
+        role_id = tokenizer.convert_tokens_to_ids(ROLE)
+        # The role may be split into several tokens; keep its encoded id sequence for that case.
+        role_ids = tokenizer.encode(ROLE, add_special_tokens=False)
+        think_start_id = tokenizer.convert_tokens_to_ids("<think>")
+        think_end_id = tokenizer.convert_tokens_to_ids("</think>")
+
+        # Locate all positions where the start and end markers appear.
+        start_indices = np.where(np_tokens == bos_token_id)[0]
+        end_indices = np.where(np_tokens == eos_token_id)[0]
+
+        # Initialize the mask with False values.
+        masks = np.zeros_like(np_tokens, dtype=bool)
+        assert len(start_indices) == len(end_indices)
+        # For each pair of bos/eos, check if the role is 'assistant'
+        # and apply the mask accordingly.
+        for start, end in zip(start_indices, end_indices):
+            header = start + 1  # position of the first role token
+            if header >= num_tokens:
+                continue
+            if np_tokens[header] == role_id:
+                # Single-token role: content starts right after bos + role + newline.
+                content_start = start + START_OFFSET
+            elif np.array_equal(np_tokens[header : header + len(role_ids)], role_ids):
+                # Multi-token role: shift the content start by the extra role tokens.
+                # array_equal is False (rather than an error) when the slice is too short.
+                content_start = start + START_OFFSET + len(role_ids) - 1
+            else:
+                continue
+            # Mask from after the assistant header up to and including the end marker.
+            masks[content_start : end + END_OFFSET] = True
+            if content_start < num_tokens and np_tokens[content_start] == think_start_id:
+                masks[content_start] = False
+                if content_start + 1 < num_tokens and np_tokens[content_start + 1] == think_end_id:
+                    masks[content_start + 1] = False
+
+        assert masks.shape == np_tokens.shape
+        if isinstance(tokens, torch.Tensor):
+            return torch.from_numpy(masks)
+        else:
+            return masks.tolist()
+
+    def encode(self, *args, **kwargs):
+        """Encode with the processor's ``encode`` if it has one, else with its tokenizer."""
+        encode_fn = getattr(self.processor, "encode", None) or self.processor.tokenizer.encode
+        return encode_fn(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """Decode with the processor's ``decode`` if it has one, else with its tokenizer."""
+        decode_fn = getattr(self.processor, "decode", None) or self.processor.tokenizer.decode
+        return decode_fn(*args, **kwargs)
diff --git a/cosmos_framework/data/vlm/processors/nemotronvl_processor.py b/cosmos_framework/data/vlm/processors/nemotronvl_processor.py
new file mode 100644
index 0000000..1fde099
--- /dev/null
+++ b/cosmos_framework/data/vlm/processors/nemotronvl_processor.py
@@ -0,0 +1,553 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+import os
+from typing import Dict, List, Optional
+
+import numpy as np
+import torch
+from PIL import Image
+from transformers.models.auto.processing_auto import AutoProcessor
+from transformers.processing_utils import VideosKwargs
+from transformers.video_utils import VideoMetadata
+
+from cosmos_framework.utils import log
+from cosmos_framework.utils.vlm.pretrained_models_downloader import maybe_download_hf_model_from_s3
+
+# Jinja chat template that NemotronVLProcessor.__init__ installs on the tokenizer.
+# Rendering proceeds in three stages:
+#   1. A first pass over all messages extracts the system text (with the
+#      "/think" and "/no_think" flags stripped), records whether any video
+#      content is present, and updates ns.enable_thinking from user/system
+#      text: "/think" enables it, "/no_think" disables it, and later messages
+#      override earlier ones.
+#   2. The header is emitted: bos_token, "<SPECIAL_10>System\n", the system
+#      text and, when `tools` is set, the tool-usage instructions.
+#   3. Each remaining message is rendered as "<SPECIAL_11>User\n...",
+#      "<SPECIAL_11>Assistant\n...\n<SPECIAL_12>\n", or, for consecutive tool
+#      messages, one grouped "<TOOL_RESPONSE>[...]" user block. Image / video
+#      parts become "<image>" / "<video>" placeholders prepended to the text
+#      unless the text already contains such a placeholder.
+# With add_generation_prompt the template appends "<SPECIAL_11>Assistant\n"
+# followed by "<think></think>" (thinking disabled) or "<think>\n".
+nemotron_chat_template = """
+{%- set ns = namespace(enable_thinking=false, has_sys_prompt=false, non_tool_system_content='', has_video=false, explicit_think_requested=false) -%}
+{%- set msg = namespace(content='') -%}
+{%- for message in messages -%}
+    {%- if message['role'] == 'system' -%}
+        {%- set ns.has_sys_prompt = true -%}
+        {# Extract system content without tool flags #}
+        {%- if message['content'] is string -%}
+            {%- set ns.non_tool_system_content = message['content'].replace('</think>', '<_end_think>').replace('/think', '').replace('/no_think', '').replace('<_end_think>', '</think>').strip() -%}
+        {%- else -%}
+            {%- set ns.non_tool_system_content = '' -%}
+            {%- for content in message['content'] -%}
+                {%- if content['type'] == 'text' -%}
+                    {%- set ns.non_tool_system_content = ns.non_tool_system_content + content['text'].replace('</think>', '<_end_think>').replace('/think', '').replace('/no_think', '').replace('<_end_think>', '</think>') -%}
+                {%- endif -%}
+            {%- endfor -%}
+            {%- set ns.non_tool_system_content = ns.non_tool_system_content.strip() -%}
+        {%- endif -%}
+    {%- endif -%}
+    {# Check for video content in all messages #}
+    {%- if message['content'] is not string -%}
+        {%- for content in message['content'] -%}
+            {%- if content['type'] == 'video' or content['type'] == 'video_url' -%}
+                {%- set ns.has_video = true -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- if message['content'] is string -%}
+        {%- if message['role'] == 'user' or message['role'] == 'system' -%}
+            {%- if '/think' in message['content'].replace('</think>', '') -%}
+                {%- set ns.enable_thinking = true -%}
+                {%- set ns.explicit_think_requested = true -%}
+            {%- elif '/no_think' in message['content'] -%}
+                {%- set ns.enable_thinking = false -%}
+            {%- endif -%}
+        {%- endif -%}
+    {%- else -%}
+        {%- for content in message['content'] -%}
+            {%- if content['type'] == 'text' -%}
+                {%- if message['role'] == 'user' or message['role'] == 'system' -%}
+                    {%- if '/think' in content['text'].replace('</think>', '') -%}
+                        {%- set ns.enable_thinking = true -%}
+                        {%- set ns.explicit_think_requested = true -%}
+                    {%- elif '/no_think' in content['text'] -%}
+                        {%- set ns.enable_thinking = false -%}
+                    {%- endif -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+{%- endfor -%}
+
+{{- bos_token -}}
+{%- if messages[0]['role'] != 'system' -%}
+    {{- '<SPECIAL_10>System\n' -}}
+{%- else -%}
+    {{- '<SPECIAL_10>System\n' + ns.non_tool_system_content }}
+{%- endif -%}
+
+{%- if tools -%}
+    {%- if ns.non_tool_system_content != '' -%}
+        {{- '\n\n' -}}
+    {%- endif -%}
+    {{- 'You can use the following tools to assist the user if required:\n' -}}
+    {{- '<AVAILABLE_TOOLS>[' -}}
+    {%- for tool in tools -%}
+        {{- (tool.function if tool.function is defined else tool) | tojson -}}
+        {{- ', ' if not loop.last else '' -}}
+    {%- endfor -%}
+    {{- ']</AVAILABLE_TOOLS>\n\n' -}}
+    
+    {{- 'If you decide to call any tool(s), use the following format:\n' -}}
+    {{- '<TOOLCALL>[{"name": "tool_name1", "arguments": "tool_args1"}, ' -}}
+    {{- '{"name": "tool_name2", "arguments": "tool_args2"}]</TOOLCALL>\n\n' -}}
+    
+    {{- 'The user will execute tool-calls and return responses from tool(s) in this format:\n' -}}
+    {{- '<TOOL_RESPONSE>[{"response": "tool_response1"}, ' -}}
+    {{- '{"response": "tool_response2"}]</TOOL_RESPONSE>\n\n' -}}
+    
+    {{- 'Based on the tool responses, you can call additional tools if needed, ' -}}
+    {{- 'correct tool calls if any errors are found, or just respond to the user.' -}}
+{%- endif -%}
+{{- '\n' -}}
+
+{%- set messages = messages[1:] if messages[0]['role'] == 'system' else messages -%}
+
+{# Prevent no user or assistant message #}
+{%- if messages|length == 0 -%}
+    {%- set messages = [{'role': 'user', 'content': ''}] -%}
+{%- endif -%}
+
+{%- for message in messages %}
+    {%- if message['content'] is string -%}
+        {%- set msg.content = message['content'].replace('</think>', '<_end_think>').replace('/think', '').replace('/no_think', '').replace('<_end_think>', '</think>').strip() -%}
+    {%- else -%}
+        {%- set msg.content = '' -%}
+        {%- set mm_content = '' -%}
+        {%- set counters = namespace(images=0, videos=0) -%}
+
+        {%- for content in message['content'] -%}
+            {%- if content['type'] == 'image' -%}
+                {%- set counters.images = counters.images + 1 -%}
+            {%- elif content['type'] == 'video' -%}
+                {%- set counters.videos = counters.videos + 1 -%}
+            {%- elif content['type'] == 'text' -%}
+                {%- set msg.content = msg.content + content['text'] -%}
+            {%- endif -%}
+        {%- endfor -%}
+        {%- if '<image>' in msg.content -%}
+            {%- set counters.images = 0 -%}
+        {%- endif -%}
+        {%- if '<video>' in msg.content -%}
+            {%- set counters.videos = 0 -%}
+        {%- endif -%}
+        {%- if counters.images > 1 -%}
+            {%- set image_tags = namespace(tags=[]) -%}
+            {%- for i in range(counters.images) -%}
+                {%- set image_tags.tags = image_tags.tags + ['<image ' + (i + 1)|string + '><image>'] -%}
+            {%- endfor -%}
+            {%- set mm_content = ' '.join(image_tags.tags) + '\n' -%}
+        {%- elif counters.images == 1 -%}
+            {%- set mm_content = '<image>\n' -%}
+        {%- endif -%}
+        {%- set mm_content = mm_content + '<video>\n' * counters.videos -%}
+        {%- set msg.content = mm_content + msg.content.lstrip('\n') -%}
+    {%- endif -%}
+
+    {%- if message['role'] == 'user' %}
+        {{- '<SPECIAL_11>User\n' + msg.content.replace('</think>', '<_end_think>').replace('/think', '').replace('/no_think', '').replace('<_end_think>', '</think>').strip() + '\n' }}
+    {%- elif message['role'] == 'tool' %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != 'tool') -%}
+            {{- '<SPECIAL_11>User\n' + '<TOOL_RESPONSE>[' }}
+        {%- endif -%}
+        {{- msg.content -}}
+        {{- ', ' if not loop.last and (messages[loop.index0 + 1].role == 'tool') else '' -}}
+        {%- if loop.last or (messages[loop.index0 + 1].role != 'tool') -%}
+            {{- ']</TOOL_RESPONSE>\n' -}}
+        {%- endif -%}
+    {%- elif message['role'] == 'assistant' %}
+        {{- '<SPECIAL_11>Assistant\n' + msg.content.strip() }}
+        {%- if message.tool_calls -%}
+            {%- if msg.content.strip() != '' -%}
+                {{- '\n\n' -}}
+            {%- endif -%}
+            {{- '<TOOLCALL>[' -}}
+            {%- for call in message.tool_calls -%}
+                {%- set fn = call.function if call.function is defined else call -%}
+                {{- '{"name": "' + fn.name + '", "arguments": ' -}}
+                {%- if fn.arguments is string -%}
+                    {{- fn.arguments -}}
+                {%- else -%}
+                    {{- fn.arguments | tojson -}}
+                {%- endif -%}
+                {{- '}' + (', ' if not loop.last else '') -}}
+            {%- endfor -%}
+            {{- ']</TOOLCALL>' -}}
+        {%- endif -%}
+        {{- '\n<SPECIAL_12>\n' -}}
+    {%- endif %}
+{%- endfor -%}
+{%- if add_generation_prompt %}
+    {{- '<SPECIAL_11>Assistant\n' }}
+    {%- if ns.enable_thinking is defined and ns.enable_thinking is false %}
+        {{- '<think></think>' }}
+    {%- else %}
+        {{- '<think>\n' }}
+    {%- endif %}
+{%- endif %}
+"""
+
+
+def convert_string_content_to_list_content(messages: List[Dict]) -> List[Dict]:
+    """Normalize string message content to the list-of-parts representation.
+
+    A message whose ``"content"`` is a ``str`` is rewritten in place so that
+    its content becomes ``[{"type": "text", "text": <original string>}]``.
+    Messages with any other content type are not modified.
+
+    Args:
+        messages: Chat messages; every entry must have a ``"content"`` key.
+
+    Returns:
+        The ``messages`` list that was passed in (same object).
+    """
+    for msg in messages:
+        body = msg["content"]
+        if isinstance(body, str):
+            msg["content"] = [{"type": "text", "text": body}]
+    return messages
+
+
+def maybe_parse_vision_content(
+    messages: List[Dict],
+) -> tuple[
+    int,
+    Optional[list[float]],
+    Optional[list[int]],
+    Optional[list[list[int]]],
+    Optional[list[list[np.ndarray]]],
+    int,
+    Optional[list[Image.Image]],
+]:
+    """Collect the videos and images referenced by ``messages``.
+
+    A content part counts as a video when its ``type`` is ``"video"`` and its
+    ``"video"`` value is a ``list``; the length of that list is used as the
+    frame count. A part counts as an image when its ``type`` is ``"image"``.
+    Messages whose content is not a list are ignored. A critical log entry is
+    emitted for each counted video that carries no ``"fps"`` value.
+
+    Args:
+        messages: Chat messages, each with a ``"content"`` entry.
+
+    Returns:
+        A 7-tuple ``(num_video, video_fps, video_total_num_frames,
+        video_frames_indices, video_frames, num_image, images)``. The video
+        lists hold one entry per counted video, in encounter order: its
+        ``"fps"`` value (``None`` when absent), its number of frames,
+        ``[0, 1, ..., n_frames - 1]``, and the ``"video"`` list itself.
+        ``images`` holds the ``"image"`` value of every image part.
+    """
+    fps_per_video = []
+    frame_counts = []
+    frame_indices = []
+    frame_lists = []
+    found_images = []
+    for message in messages:
+        parts = message["content"]
+        if not isinstance(parts, list):
+            continue
+        for part in parts:
+            part_type = part.get("type", "")
+            if part_type == "video" and isinstance(part["video"], list):
+                part_fps = part.get("fps", None)
+                if part_fps is None:
+                    log.critical(
+                        f"fps is None for video {part}. Better to set the fps explicitly", rank0_only=False
+                    )
+                n_frames = len(part["video"])
+                fps_per_video.append(part_fps)
+                frame_counts.append(n_frames)
+                frame_indices.append(list(range(n_frames)))
+                frame_lists.append(part["video"])
+            elif part_type == "image":
+                found_images.append(part["image"])
+    return (
+        len(frame_lists),
+        fps_per_video,
+        frame_counts,
+        frame_indices,
+        frame_lists,
+        len(found_images),
+        found_images,
+    )
+
+
+def maybe_get_max_pixels_from_images_kwargs(messages: List[Dict]) -> Optional[tuple[int, int]]:
+    """Return the pixel budget of the first image part that specifies one.
+
+    Messages are scanned in order; only list-valued contents are inspected.
+    The first part with ``type == "image"`` and a non-``None`` ``"max_pixels"``
+    determines the result.
+
+    Args:
+        messages: Chat messages, each with a ``"content"`` entry.
+
+    Returns:
+        ``(max_pixels, min_pixels)`` of that part, where ``min_pixels`` is
+        ``None`` if the part does not define it, or ``(None, None)`` when no
+        image part carries ``"max_pixels"``.
+    """
+    budgeted_images = (
+        part
+        for message in messages
+        if isinstance(message["content"], list)
+        for part in message["content"]
+        if part.get("type", "") == "image" and part.get("max_pixels", None) is not None
+    )
+    first_match = next(budgeted_images, None)
+    if first_match is None:
+        return None, None
+    return first_match["max_pixels"], first_match.get("min_pixels", None)
+
+
+class NemotronVLProcessor:
+    """Wrapper around a Hugging Face ``AutoProcessor`` that adds dataloader helpers.
+
+    It installs ``nemotron_chat_template`` on the tokenizer, caches commonly
+    used token ids, tokenizes single conversations, and builds the loss mask
+    that selects assistant tokens.
+    """
+
+    def __init__(
+        self,
+        name="nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16",
+        credentials: str = "./credentials/s3_training.secret",
+        bucket: str = "bucket4",
+        cache_dir: Optional[str] = None,
+    ):
+        """Load the processor, install the chat template and cache token ids.
+
+        Args:
+            name: Local directory holding the processor files, or a model name
+                resolved through ``maybe_download_hf_model_from_s3``.
+            credentials: Credentials path forwarded to the downloader.
+            bucket: Bucket name forwarded to the downloader.
+            cache_dir: Not used by this constructor; kept for interface compatibility.
+        """
+        self.name = name
+        if os.path.isdir(name):
+            model_name_or_path_local = name
+        else:
+            model_name_or_path_local = maybe_download_hf_model_from_s3(
+                name, credentials, bucket, include_model_weights=False
+            )
+
+        self.processor = AutoProcessor.from_pretrained(model_name_or_path_local, trust_remote_code=True)
+        log.info("Successfully loaded processor from local cache")
+
+        # Replace whatever template ships with the tokenizer by the module-level one.
+        self.processor.tokenizer.chat_template = nemotron_chat_template
+        # The image / video placeholder tokens are optional on the processor.
+        if hasattr(self.processor, "image_token"):
+            self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.processor.image_token)
+        else:
+            self.image_token_id = None
+        if hasattr(self.processor, "video_token"):
+            self.video_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.processor.video_token)
+        else:
+            self.video_token_id = None
+        self.eos_id = self.processor.tokenizer.eos_token_id
+        # "<SPECIAL_999>" is used as the padding token.
+        self.pad_id = self.processor.tokenizer.convert_tokens_to_ids(
+            "<SPECIAL_999>"
+        )
+        self.vision_end_id = self.processor.tokenizer.convert_tokens_to_ids("</img>")
+
+        # Helper attributes for the dataloader video decoding function
+        self.shortest_edge = 512
+        self.min_height_width = 512
+        self.patch_size = 16
+        self.temporal_patch_size = 1
+        self.merge_size = 1
+        self.use_smart_resize = False
+
+    def apply_chat_template(
+        self,
+        messages,
+        add_generation_prompt=False,
+        return_tensors="pt",
+        tokenize=True,
+        **kwargs,
+    ):
+        """Render and tokenize a single conversation.
+
+        ``messages`` is modified in place: string contents become lists of
+        parts, the first part of every system message gets a ``"/think "`` or
+        ``"/no_think "`` prefix, and, when no assistant text contains both
+        ``<think>`` and ``</think>``, every assistant text part is prefixed
+        with ``"<think></think>"``.
+
+        When the conversation has a video, only the video is passed to the
+        processor (exactly one video is supported); otherwise images are
+        passed if present; otherwise the prompt is processed as text only.
+
+        Args:
+            messages: Chat messages, each with ``"role"`` and ``"content"``.
+            add_generation_prompt: Accepted for interface compatibility; the
+                prompt is currently always rendered without a generation prompt.
+            return_tensors: Must be ``"pt"``.
+            tokenize: Must be ``True``.
+            **kwargs: Accepted for interface compatibility and ignored.
+
+        Returns:
+            The processor output, with ``input_ids`` and ``attention_mask``
+            reduced from a batch of one to a single sequence ([N_token]).
+            All other entries are returned as produced by the processor.
+
+        Raises:
+            AssertionError: If ``tokenize`` is false, ``return_tensors`` is not
+                ``"pt"``, or more than one video is present.
+        """
+        assert tokenize, "tokenize must be True"
+        assert return_tensors == "pt", "return_tensors must be pt"
+        # Note: this tokenizer does not support "content": str, it always expects "content" to be a list of dicts
+        messages = convert_string_content_to_list_content(messages)
+
+        # The conversation counts as "thinking" when any assistant text holds both think tags.
+        has_thinking = False
+        for message in messages:
+            if message["role"] == "assistant":
+                for content in message["content"]:
+                    if content.get("type", "") == "text":
+                        if "<think>" in content["text"] and "</think>" in content["text"]:
+                            has_thinking = True
+        for message in messages:
+            if message["role"] == "system":
+                # The template reads this flag to decide whether thinking is enabled.
+                prefix = "/think " if has_thinking else "/no_think "
+                message["content"][0]["text"] = prefix + message["content"][0]["text"]
+            if message["role"] == "assistant" and not has_thinking:
+                for content in message["content"]:
+                    if content.get("type", "text") == "text":
+                        content["text"] = "<think></think>" + content["text"]
+
+        num_video, video_fps, video_total_num_frames, _, video_frames, num_image, images = (
+            maybe_parse_vision_content(messages)
+        )
+        # NOTE(review): add_generation_prompt is not forwarded here; confirm whether that is intended.
+        prompt = self.processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
+        if num_video > 0:
+            assert num_video == 1, "only support one video for now"
+            fps = video_fps[0]
+            total_num_frames = video_total_num_frames[0]
+            # fps may be None (the parser only logs it); leave the duration unset in that case.
+            duration = total_num_frames / fps if fps else None
+            inputs = self.processor(
+                text=[prompt],
+                videos=video_frames,
+                videos_kwargs=VideosKwargs(
+                    do_sample_frames=False,
+                    video_metadata=VideoMetadata(
+                        fps=fps, total_num_frames=total_num_frames, duration=duration, video_backend=None
+                    ),
+                ),
+                return_tensors=return_tensors,
+            )
+        elif num_image > 0:
+            inputs = self.processor(
+                text=[prompt],
+                images=images,
+                return_tensors=return_tensors,
+            )
+        else:
+            # Text-only conversation: without this branch `inputs` would be unbound below.
+            inputs = self.processor(
+                text=[prompt],
+                return_tensors=return_tensors,
+            )
+
+        # The processor returns a batch of features; this wrapper is used per sample in the
+        # dataloader, so strip the batch dimension.
+        inputs["input_ids"] = inputs["input_ids"][0]  # [N_token]
+        inputs["attention_mask"] = inputs["attention_mask"][0]  # [N_token]
+        return inputs
+
+    def add_assistant_tokens_mask(self, tokens):
+        """Build a mask that selects the tokens generated by the assistant.
+
+        The mask is used so that only assistant tokens contribute to the loss;
+        system prompts, user prompts and chat-template tokens stay ``False``.
+        Every assistant turn of a multi-turn conversation is masked.
+
+        Turns are delimited by ``<SPECIAL_11>``: each turn except the last runs
+        up to the token before the next ``<SPECIAL_11>``, and the last turn
+        runs up to the final ``<SPECIAL_12>``.
+
+        Args:
+            tokens (Union[List[int], torch.Tensor]): Token ids, either 1-D or,
+                for tensors, a 2-D batch that is processed row by row.
+
+        Returns:
+            Union[List[bool], torch.Tensor]: ``True`` for assistant tokens
+            (loss applied), ``False`` otherwise. A tensor input yields a bool
+            tensor of the same shape; any other input yields a list.
+
+        Raises:
+            AssertionError: If the input is not 1-D / 2-D, or if turn markers
+                exist but no ``<SPECIAL_12>`` is found (or vice versa).
+        """
+        if isinstance(tokens, torch.Tensor) and tokens.ndim == 2:
+            mask = torch.stack(
+                [self.add_assistant_tokens_mask(tokens[i]) for i in range(tokens.shape[0])]
+            )  # [B,N_token]
+            assert mask.shape == tokens.shape
+            return mask
+        np_tokens = tokens.cpu().numpy() if isinstance(tokens, torch.Tensor) else np.array(tokens)
+        assert np_tokens.ndim == 1
+        num_tokens = len(np_tokens)
+
+        # Constants defining bos, eos and fixed offsets.
+        BOS_TOKEN = "<SPECIAL_11>"
+        EOS_TOKEN = "<SPECIAL_12>"
+        ROLE = "Assistant"
+        # Offsets: skip the bos + "Assistant\n" (always 3 tokens) and include the eos (+1) for supervision
+        START_OFFSET = 3
+        END_OFFSET = 1
+
+        # Retrieve token IDs for the markers and the role.
+        bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(BOS_TOKEN)
+        eos_token_id = self.processor.tokenizer.convert_tokens_to_ids(EOS_TOKEN)
+        role_id = self.processor.tokenizer.convert_tokens_to_ids(ROLE)
+
+        # Locate all positions where the start and end markers appear.
+        start_indices = np.where(np_tokens == bos_token_id)[0].tolist()
+        eos_positions = np.where(np_tokens == eos_token_id)[0].tolist()
+        # Only assistant turns emit an end marker, so turns cannot be paired one-to-one with
+        # end markers. Close every turn but the last right before the next turn starts, and
+        # close the last turn at the LAST end marker. Using the first end marker would leave
+        # the final assistant turn of a multi-turn conversation unmasked.
+        end_indices = [next_start - 1 for next_start in start_indices[1:]] + eos_positions[-1:]
+        # Initialize the mask with False values.
+        masks = np.zeros_like(np_tokens, dtype=bool)
+        assert len(start_indices) == len(end_indices)
+        # For each turn, check if the role is 'Assistant' and apply the mask accordingly.
+        for start, end in zip(start_indices, end_indices):
+            if start + 1 < num_tokens and np_tokens[start + 1] == role_id:
+                # Mask tokens from after the assistant header (start+3) to include the end marker (end+1)
+                masks[start + START_OFFSET : end + END_OFFSET] = True
+
+        assert masks.shape == np_tokens.shape
+        if isinstance(tokens, torch.Tensor):
+            return torch.from_numpy(masks)
+        else:
+            return masks.tolist()
+
+    def encode(self, *args, **kwargs):
+        """Forward to the wrapped tokenizer's ``encode``."""
+        return self.processor.tokenizer.encode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """Forward to the wrapped tokenizer's ``decode``."""
+        return self.processor.tokenizer.decode(*args, **kwargs)
+
+
+if __name__ == "__main__":
+    """
+    PYTHONPATH=. python3 cosmos_framework/data/vlm/processors/nemotronvl_processor.py
+
+    inputs: dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_sizes', 'text'])
+        input_ids: type: <class 'torch.Tensor'> shape: torch.Size([6699])
+        attention_mask: type: <class 'torch.Tensor'> shape: torch.Size([6699])
+        pixel_values: type: <class 'torch.Tensor'> shape: torch.Size([26, 3, 224, 224])
+        image_sizes: type: <class 'torch.Tensor'> shape: torch.Size([2, 2])
+        text: type: <class 'str'>
+
+    For image, expected output:
+        input_ids: type: <class 'torch.Tensor'>
+        shape: torch.Size([2772])
+        attention_mask: type: <class 'torch.Tensor'>
+        shape: torch.Size([2772])
+        pixel_values: type: <class 'torch.Tensor'>
+        shape: torch.Size([11008, 1536])
+        image_grid_thw: type: <class 'torch.Tensor'>
+        shape: torch.Size([1, 3])
+        image_grid_thw: tensor([[  1,  86, 128]])
+        num_image_token_id_tokens: 2752
+        num_video_token_id_tokens: 0
+        assistant_tokens_mask: 2
+        assistant_tokens: tensor([ 59604, 151645])
+        decoded_assistant_tokens: Paris<|im_end|>
+
+    For video, expected output:
+        input_ids: type: <class 'torch.Tensor'>
+        shape: torch.Size([5538])
+        attention_mask: type: <class 'torch.Tensor'>
+        shape: torch.Size([5538])
+        pixel_values_videos: type: <class 'torch.Tensor'>
+        shape: torch.Size([22016, 1536])
+        video_grid_thw: type: <class 'torch.Tensor'>
+        shape: torch.Size([1, 3])
+        video_grid_thw: tensor([[  2,  86, 128]])
+        num_image_token_id_tokens: 0
+        num_video_token_id_tokens: 5504
+        assistant_tokens_mask: 2
+        assistant_tokens: tensor([ 59604, 151645])
+        decoded_assistant_tokens: Paris<|im_end|>
+    """
+    processor = NemotronVLProcessor("nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16")
+    from io import BytesIO
+
+    import requests
+
+    response = requests.get("https://invalid_url")  # NOTE(review): placeholder URL -- use a reachable image URL to run
+    img = Image.open(BytesIO(response.content))
+
+    # test video: the single fetched image is passed as a one-frame video
+    print("=============== test video ===============")
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "video",
+                    "video": [img],
+                    "fps": 12,
+                },
+                {"type": "text", "text": "Describe what you see."},
+            ],
+        },
+        {"role": "assistant", "content": "<think> No need to think. </think> A cat is sleeping on a couch."},
+    ]
+    inputs = processor.apply_chat_template(messages)
+    input_ids = inputs["input_ids"]
+    decoded_text = processor.decode(input_ids, skip_special_tokens=False)
+    print(decoded_text)
+    print(list(inputs.keys()))
+    for k, v in inputs.items():
+        print(f"{k}: type: {type(v)}")
+        if isinstance(v, torch.Tensor):
+            print(f"shape: {v.shape}")
+        if "grid" in k:
+            print(f"{k}: {v}")
+    num_image_token_id_tokens = inputs["input_ids"] == processor.image_token_id
+    print(f"num_image_token_id_tokens: {num_image_token_id_tokens.sum()}")
+    num_video_token_id_tokens = inputs["input_ids"] == processor.video_token_id
+    print(f"num_video_token_id_tokens: {num_video_token_id_tokens.sum()}")
+
+    assistant_tokens_mask = processor.add_assistant_tokens_mask(inputs["input_ids"])  # used below to index input_ids
+    print(f"assistant_tokens_mask: {assistant_tokens_mask.sum()}")
+    assistant_tokens = inputs["input_ids"][assistant_tokens_mask]
+    print(f"assistant_tokens: {assistant_tokens}")
+    decoded_assistant_tokens = processor.decode(assistant_tokens, skip_special_tokens=False)
+    print(f"decoded_assistant_tokens: {decoded_assistant_tokens}")
+
+    print("\n\n\n\n\n=============== test image ===============")
+    messages = [  # image variant: the user turn carries only the image, with no text part
+        {"role": "system", "content": "You are a helpful assistant."},
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "image": img},
+            ],
+        },
+        {"role": "assistant", "content": "<think> No need to think. </think> A cat is sleeping on a couch."},
+    ]
+    inputs = processor.apply_chat_template(messages)
+    input_ids = inputs["input_ids"]
+    decoded_text = processor.decode(input_ids, skip_special_tokens=False)
+    print(decoded_text)
+    print(list(inputs.keys()))
+    for k, v in inputs.items():
+        print(f"{k}: type: {type(v)}")
+        if isinstance(v, torch.Tensor):
+            print(f"shape: {v.shape}")
+        if "grid" in k:
+            print(f"{k}: {v}")
+    num_image_token_id_tokens = inputs["input_ids"] == processor.image_token_id
+    print(f"num_image_token_id_tokens: {num_image_token_id_tokens.sum()}")
+    num_video_token_id_tokens = inputs["input_ids"] == processor.video_token_id
+    print(f"num_video_token_id_tokens: {num_video_token_id_tokens.sum()}")
+
+    assistant_tokens_mask = processor.add_assistant_tokens_mask(inputs["input_ids"])  # used below to index input_ids
+    print(f"assistant_tokens_mask: {assistant_tokens_mask.sum()}")
+    assistant_tokens = inputs["input_ids"][assistant_tokens_mask]
+    print(f"assistant_tokens: {assistant_tokens}")
+    decoded_assistant_tokens = processor.decode(assistant_tokens, skip_special_tokens=False)
+    print(f"decoded_assistant_tokens: {decoded_assistant_tokens}")
+
+    print("\n\n\n\n\n=============== done ===============")
diff --git a/cosmos_framework/data/vlm/processors/qwen3vl_processor.py b/cosmos_framework/data/vlm/processors/qwen3vl_processor.py
index 1dd37fd..b624d6a 100644
--- a/cosmos_framework/data/vlm/processors/qwen3vl_processor.py
+++ b/cosmos_framework/data/vlm/processors/qwen3vl_processor.py
@@ -131,17 +131,18 @@ def apply_chat_template(
         num_video, video_fps, video_total_num_frames, video_frames_indices = maybe_parse_video_content(messages)
         if num_video > 0:
             # Here we add the args to avoid the error:
-            # File "/usr/local/lib/python3.12/dist-packages/transformers/video_processing_utils.py", line 321, in _decode_and_sample_videos
+            # File "/invalid_dir", line 321, in _decode_and_sample_videos
             #     raise ValueError(
             # ValueError: Sampling frames from a list of images is not supported! Set `do_sample_frames=False`.
-            kwargs["videos_kwargs"] = dict(do_sample_frames=False)
-            assert num_video == 1, "only support one video for now"
-            fps = video_fps[0]
-            total_num_frames = video_total_num_frames[0]
-            frames_indices = video_frames_indices[0]
+            video_metadata = [
+                dict(fps=fps, total_num_frames=total_num_frames, frames_indices=frames_indices)
+                for fps, total_num_frames, frames_indices in zip(
+                    video_fps, video_total_num_frames, video_frames_indices
+                )
+            ]
             kwargs["videos_kwargs"] = {
                 "do_sample_frames": False,
-                "video_metadata": dict(fps=fps, total_num_frames=total_num_frames, frames_indices=frames_indices),
+                "video_metadata": video_metadata[0] if num_video == 1 else video_metadata,
             }
         inputs = self.processor.apply_chat_template(
             messages,
@@ -273,12 +274,12 @@ def decode(self, *args, **kwargs):
             "content": [
                 {
                     "type": "video",
-                    "video": ["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"] * 4,
+                    "video": ["https://invalid_url"] * 4,
                     "fps": 12,
                 },
                 # {
                 #     "type": "image",
-                #     "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+                #     "image": "https://invalid_url",
                 #     "max_pixels": 256 * 32 * 32,  # this will lead to 486 vision tokens
                 #     "min_pixels": 32 * 32,
                 # },
diff --git a/cosmos_framework/inference/action.py b/cosmos_framework/inference/action.py
index b172806..7f27c35 100644
--- a/cosmos_framework/inference/action.py
+++ b/cosmos_framework/inference/action.py
@@ -9,12 +9,16 @@
 
 import torch
 
+from cosmos_framework.data.vfm.action.action_processing import (
+    ActionProcessingRecord,
+    make_batched_action_processing_fields,
+    pad_action_to_max_dim,
+)
 from cosmos_framework.data.vfm.action.domain_utils import EMBODIMENT_TO_RAW_ACTION_DIM, get_domain_id
 from cosmos_framework.data.vfm.action.json_formatter import ActionPromptJsonFormatter
 from cosmos_framework.data.vfm.action.transforms import (
     build_sequence_plan_from_mode,
     find_closest_target_size,
-    pad_action_to_max_dim,
     reflection_pad_to_target,
 )
 from cosmos_framework.inference.args import ModelMode
@@ -122,10 +126,15 @@ def build_action_batch(
         image_size=padded_image_size,
     )
 
+    action_processing_record = ActionProcessingRecord(
+        raw_action_dim=raw_action_dim,
+        action_normalizer=None,
+    )
+
     return {
         input_video_key: [[video_padded]] * batch_size,
         "action": [[action]] * batch_size,
-        "raw_action_dim": [torch.tensor(raw_action_dim, dtype=torch.long)] * batch_size,
+        **make_batched_action_processing_fields(action_processing_record, batch_size),
         "mode": [model_mode.value] * batch_size,
         "ai_caption": [ai_caption] * batch_size,
         "prompt": [prompt] * batch_size,
diff --git a/cosmos_framework/model/attention/checks.py b/cosmos_framework/model/attention/checks.py
index 87d1d53..cb74e62 100644
--- a/cosmos_framework/model/attention/checks.py
+++ b/cosmos_framework/model/attention/checks.py
@@ -254,7 +254,7 @@ def varlen_tensor_checks(
             f"Q, K, and V must match in batch size, got {query.shape[0]=}, {key.shape[0]=}, {value.shape[0]=}."
         )
 
-
+    # NOTE: these checks introduce recompiles
     if not is_torch_compiling():
         # Validate max_seqlen values: neither can be negative, and they must be
         # both zero/None (not varlen) or both positive (varlen).
@@ -299,7 +299,7 @@ def varlen_tensor_checks(
         )
 
     # Validate user-input cumulative_seqlen_{Q,KV}, max_seqlen_{Q,KV}, total_seqlen_{Q,KV}
-
+    # NOTE: max_seqlen_Q == max_seqlen_KV == 0 is valid here (skip kernel / empty-batch case).
     # Mismatch (one 0, the other positive) is already caught by the early check above.
     # This feature may require support in the backends themselves; see NATTEN PR:
     # https://github.com/SHI-Labs/NATTEN/pull/327
@@ -334,7 +334,7 @@ def varlen_tensor_checks(
     total_seqlen_Q = query.shape[1]
     total_seqlen_KV = key.shape[1]
 
-
+    # NOTE: these checks introduce recompiles
     if not is_torch_compiling():
         # When both max_seqlens are 0, skip bounds checks (skip kernel / empty-batch case).
         # Mismatch is already caught by the early check, so at this point either both are 0 or both are positive.
diff --git a/cosmos_framework/model/attention/cudnn/checks.py b/cosmos_framework/model/attention/cudnn/checks.py
index 32b4540..ffc5f96 100644
--- a/cosmos_framework/model/attention/cudnn/checks.py
+++ b/cosmos_framework/model/attention/cudnn/checks.py
@@ -124,7 +124,6 @@ def cudnn_attention_check(
         causal_type=causal_type,
     )
 
-
     if is_causal and causal_type not in [CausalType.TopLeft, CausalType.DontCare]:
         target_fn("cuDNN Attention only supports top-left causal masking for now.", exception=RuntimeError)
         return False
diff --git a/cosmos_framework/model/attention/cudnn/cudnn_forward.py b/cosmos_framework/model/attention/cudnn/cudnn_forward.py
index 3637f7e..a68ed00 100644
--- a/cosmos_framework/model/attention/cudnn/cudnn_forward.py
+++ b/cosmos_framework/model/attention/cudnn/cudnn_forward.py
@@ -19,7 +19,6 @@
 from cosmos_framework.model.attention.utils.safe_ops import log
 from cosmos_framework.model.attention.utils.safe_ops.functools import lru_cache
 
-
 # Force using padded mask as a potential workaround for failing use cases
 FORCE_PADDED_MASK = False
 
@@ -316,7 +315,7 @@ def cudnn_sdpa_fwd_generate_op(
     handle = cudnn.create_handle()
 
     def cudnn_operation(q: Tensor, k: Tensor, v: Tensor, output: Tensor, lse: Tensor | None = None):
-
+        # NOTE: This is INCREDIBLY important to do -- this is what wasted days of my time
         # with random NaNs and illegal memory accesses and things of that nature.
         stream = torch.cuda.current_stream(q.device)
         cudnn.set_stream(handle=handle, stream=stream.cuda_stream)
diff --git a/cosmos_framework/model/attention/cudnn/functions.py b/cosmos_framework/model/attention/cudnn/functions.py
index 4087390..6d06c59 100644
--- a/cosmos_framework/model/attention/cudnn/functions.py
+++ b/cosmos_framework/model/attention/cudnn/functions.py
@@ -236,7 +236,6 @@ def cudnn_attention(
         raise_error=True,
     )
 
-
     assert not is_varlen  # cudnn_attention_check should prevent this assertion failing
 
     num_heads = query.shape[-2]
diff --git a/cosmos_framework/model/attention/cudnn/meta.py b/cosmos_framework/model/attention/cudnn/meta.py
index 9a21992..5cd1534 100644
--- a/cosmos_framework/model/attention/cudnn/meta.py
+++ b/cosmos_framework/model/attention/cudnn/meta.py
@@ -46,5 +46,4 @@ def get_bwd_dtypes(arch_tag: int) -> list[torch.dtype]:
 
     """
 
-
     return []
diff --git a/cosmos_framework/model/attention/flash2/__init__.py b/cosmos_framework/model/attention/flash2/__init__.py
index 77ada63..85c7cbc 100644
--- a/cosmos_framework/model/attention/flash2/__init__.py
+++ b/cosmos_framework/model/attention/flash2/__init__.py
@@ -11,6 +11,7 @@
 import torch
 
 from cosmos_framework.model.attention.utils.safe_ops import log
+from cosmos_framework.model.attention.utils.version import version_in_range
 
 # We lock to safe releases of Flash 2
 # We will have a separate backend identifier for 2025 releases with CuTeDSL
@@ -50,15 +51,13 @@ def flash2_supported() -> bool:
     else:
         flash2_version_str = flash_attn.__version__
 
-    # Version range check disabled to accept whatever flash_attn the OSS
-    # container ships.
-    # if not version_in_range(flash2_version_str, FLASH_ATTENTION_V2_MIN_VERSION, FLASH_ATTENTION_V2_MAX_VERSION):
-    #     log.debug(
-    #         "Flash Attention v2 build is not supported; this backend only supports versions "
-    #         f"{FLASH_ATTENTION_V2_MIN_VERSION} through {FLASH_ATTENTION_V2_MAX_VERSION}, got "
-    #         f"{flash2_version_str}."
-    #     )
-    #     return False
+    if not version_in_range(flash2_version_str, FLASH_ATTENTION_V2_MIN_VERSION, FLASH_ATTENTION_V2_MAX_VERSION):
+        log.debug(
+            "Flash Attention v2 build is not supported; this backend only supports versions "
+            f"{FLASH_ATTENTION_V2_MIN_VERSION} through {FLASH_ATTENTION_V2_MAX_VERSION}, got "
+            f"{flash2_version_str}."
+        )
+        return False
 
     return True
 
diff --git a/cosmos_framework/model/attention/flash2/checks.py b/cosmos_framework/model/attention/flash2/checks.py
index 5332edb..73fef61 100644
--- a/cosmos_framework/model/attention/flash2/checks.py
+++ b/cosmos_framework/model/attention/flash2/checks.py
@@ -75,17 +75,12 @@ def flash2_attention_check(
         )
         return False
 
-
-    # mixed_modality_sft_8b smoke on Blackwell — flash3 isn't built for arch
-    # 100/103 and natten doesn't support varlen. Revisit before production
-    # training on this hardware.
-    # if is_varlen:
-    #     target_fn(
-    #         "Flash Attention v2 (flash2) varlen is banned due to instability. "
-    #         "Please choose another backend.",
-    #         exception=ValueError,
-    #     )
-    #     return False
+    if is_varlen:
+        target_fn(
+            "Flash Attention v2 (flash2) varlen is banned due to instability. Please choose another backend.",
+            exception=ValueError,
+        )
+        return False
 
     arch_tag = get_arch_tag(device)
     fwd_dtypes = get_fwd_dtypes(arch_tag)
diff --git a/cosmos_framework/model/attention/flash2/functions.py b/cosmos_framework/model/attention/flash2/functions.py
index 25cfd1a..2d1f491 100644
--- a/cosmos_framework/model/attention/flash2/functions.py
+++ b/cosmos_framework/model/attention/flash2/functions.py
@@ -175,7 +175,7 @@ def flash2_attention(
     assert output.dim() == 4  # [B,N,H,Dv] or [1,total_tokens,H,Dv]
     assert lse.dim() == 3  # [B,H,N] or [1,H,total_tokens]
 
-
+    # NOTE: Do NOT call .contiguous on LSE, otherwise Attention Merging backward pass will be
     # incorrect. All output and lse tensors passed into `merge_attentions` must have the same data
     # pointer as their corresponding attention autograd ops!
     lse = lse.permute(0, 2, 1)  # [B,N,H] or [1,total_tokens,H]
diff --git a/cosmos_framework/model/attention/flash3/functions.py b/cosmos_framework/model/attention/flash3/functions.py
index fd8fb7f..76c1505 100644
--- a/cosmos_framework/model/attention/flash3/functions.py
+++ b/cosmos_framework/model/attention/flash3/functions.py
@@ -15,7 +15,7 @@
 from flash_attn_3_nv.flash_attn_interface import flash_attn_func, flash_attn_varlen_func
 from torch import Tensor
 
-
+# NOTE: older commits didn't have `return_attn_probs` as an argument, and there is no
 # reflection of the commit hash in the version, so we have to manually inspect the signatures
 HAS_RETURN_ATTN_PROBS = "return_attn_probs" in inspect.signature(flash_attn_func).parameters
 
@@ -190,7 +190,7 @@ def flash3_attention(
     assert output.dim() == 4  # [B,N,H,Dv] or [1,total_tokens,H,Dv]
     assert lse.dim() == 3  # [B,H,N] or [1,H,total_tokens]
 
-
+    # NOTE: Do NOT call .contiguous on LSE, otherwise Attention Merging backward pass will be
     # incorrect. All output and lse tensors passed into `merge_attentions` must have the same data
     # pointer as their corresponding attention autograd ops!
     lse = lse.permute(0, 2, 1)  # [B,N,H] or [1,total_tokens,H]
diff --git a/cosmos_framework/model/attention/frontend.py b/cosmos_framework/model/attention/frontend.py
index d69c980..018932a 100644
--- a/cosmos_framework/model/attention/frontend.py
+++ b/cosmos_framework/model/attention/frontend.py
@@ -393,7 +393,7 @@ def multi_dimensional_attention(
 
     # Automatic transformation for 1s in token layout
     # I.e. Attention over a (1, 16, 32) token layout is identical to over a (16, 32)
-
+    # NOTE: assumes QKV token layouts match
     token_layout_ones = [i for i in range(num_dims) if token_layout_shape[i] == 1]
     if len(token_layout_ones) > 0:
         token_layout_t = tuple(s for i, s in enumerate(token_layout_shape) if i not in token_layout_ones)
@@ -552,7 +552,7 @@ def multi_dimensional_attention_varlen(
         value (Tensor): 4-D value tensor with sequence-packed layout
             (`[1, seqlen_total, heads_kv, head_dim_v]`)
 
-        metadata (dict): Pre-computed varlen metadata from `imaginaire.varlen.generate_multi_dim_varlen_parameters`.
+        metadata (dict): Pre-computed varlen metadata from `cosmos_framework.varlen.generate_multi_dim_varlen_parameters`.
 
         scale (float | None): Attention scale. Defaults to head_dim ** -0.5.
 
diff --git a/cosmos_framework/model/attention/natten/checks.py b/cosmos_framework/model/attention/natten/checks.py
index 49b1a29..dfae074 100644
--- a/cosmos_framework/model/attention/natten/checks.py
+++ b/cosmos_framework/model/attention/natten/checks.py
@@ -118,7 +118,7 @@ def choose_natten_backend(
 
     target_fn = partial(log_or_raise_error, raise_error=raise_error)
 
-
+    # NOTE: assumes attention_tensor_checks have already been run once!
     arch_tag = get_arch_tag(device)
 
     is_mla = query_shape[-1] != value_shape[-1]
diff --git a/cosmos_framework/model/attention/varlen.py b/cosmos_framework/model/attention/varlen.py
index e67ca6b..270e1d2 100644
--- a/cosmos_framework/model/attention/varlen.py
+++ b/cosmos_framework/model/attention/varlen.py
@@ -23,7 +23,7 @@ def generate_varlen_parameters(
 ) -> (
     tuple[None, None, int, int] | tuple[Tensor, Tensor, int, int]
 ):  # (cumseqlen_Q[B+1], cumseqlen_KV[B+1], max_seqlen_Q, max_seqlen_KV)
-
+    # NOTE: max_seqlen_{Q,KV} require a device-host sync, since they're expected to be ints (with
     # which we launch the varlen kernel) and not device tensors.
     # .item() introduces control flow and breaks the graph.
     # It is also inefficient to repeat this per-op, and mostly there for convenience.
@@ -97,7 +97,7 @@ def generate_varlen_parameters(
     if max_seqlen_Q < 0 or max_seqlen_KV < 0:
         raise ValueError(f"max_seqlen_Q and max_seqlen_KV cannot be negative, got {max_seqlen_Q=}, {max_seqlen_KV=}.")
 
-
+    # NOTE: max_seqlen_Q == max_seqlen_KV == 0 is a valid case (skip kernel / empty batch).
     # This feature may require support in the backends themselves; see NATTEN PR:
     # https://github.com/SHI-Labs/NATTEN/pull/327
     if (max_seqlen_Q == 0) != (max_seqlen_KV == 0):
@@ -106,7 +106,7 @@ def generate_varlen_parameters(
             f"but computed {max_seqlen_Q=}, {max_seqlen_KV=} from provided seqlens."
         )
 
-
+    # NOTE: we have to prepend with 0 manually :(
     z = torch.tensor([0], dtype=torch.int32, device=seqlens_Q.device)  # [1]
     cumulative_seqlen_Q = torch.cat([z, seqlens_Q.cumsum(0).to(torch.int32)], dim=0)  # [B+1]
     cumulative_seqlen_KV = torch.cat([z, seqlens_KV.cumsum(0).to(torch.int32)], dim=0)  # [B+1]
diff --git a/cosmos_framework/model/tokenizer/evaluation/metric.py b/cosmos_framework/model/tokenizer/evaluation/metric.py
new file mode 100644
index 0000000..955b658
--- /dev/null
+++ b/cosmos_framework/model/tokenizer/evaluation/metric.py
@@ -0,0 +1,431 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""Standalone metric computation functions.
+
+This module provides metric computation for tokenizer evaluation:
+    - compute_psnr: Peak signal-to-noise ratio
+    - compute_fid: Frechet Inception Distance (using torchmetrics)
+    - compute_imagenet_accuracy: ImageNet zero-shot classification accuracy
+"""
+
+from __future__ import annotations
+
+import os
+from contextlib import nullcontext
+from typing import Any
+
+import torch
+from loguru import logger as logging
+
+
+def compute_psnr(
+    original: torch.Tensor | list[torch.Tensor],
+    reconstructed: torch.Tensor | list[torch.Tensor],
+    max_value: float = 1.0,
+) -> float:
+    """Compute PSNR between original and reconstructed tensors.
+
+    Args:
+        original: Tensor of shape (B, C, H, W) or (C, H, W) in [0, max_value],
+            or a list of such tensors (scored pairwise, then averaged).
+        reconstructed: Tensor (or list of tensors) shaped like ``original``.
+        max_value: Maximum value of the pixels (1.0 for normalized images).
+
+    Returns:
+        PSNR value in dB, averaged over the batch / list entries.
+
+    Raises:
+        ValueError: If exactly one input is a list, if the lists are empty or
+            differ in length, or if the tensor shapes differ.
+    """
+    # Handle list inputs
+    originals_are_list = isinstance(original, list)
+    if originals_are_list != isinstance(reconstructed, list):
+        raise ValueError("original and reconstructed must both be tensors or both be lists of tensors.")
+    if originals_are_list:
+        if len(original) != len(reconstructed):
+            raise ValueError(f"Image lists must have same length. Got {len(original)} and {len(reconstructed)}")
+        if not original:
+            # An empty list has no mean; fail clearly instead of dividing by zero.
+            raise ValueError("Cannot compute PSNR for empty image lists.")
+        psnr_values = [compute_psnr(orig, recon, max_value) for orig, recon in zip(original, reconstructed)]
+        return sum(psnr_values) / len(psnr_values)
+
+    # Ensure same shape
+    if original.shape != reconstructed.shape:
+        raise ValueError(f"Images must have same shape. Got {original.shape} and {reconstructed.shape}")
+
+    # Add batch dimension if not present
+    if original.dim() == 3:
+        original = original.unsqueeze(0)
+        reconstructed = reconstructed.unsqueeze(0)
+
+    # Calculate MSE per image in batch (average over C, H, W)
+    mse = torch.mean((original.detach() - reconstructed.detach()) ** 2, dim=[1, 2, 3])
+
+    # Identical images give MSE == 0; floor it so log10 stays finite. The floor
+    # of 10^(-100/10) corresponds to 100 dB when max_value == 1.0.
+    max_psnr = 100.0
+    mse = torch.where(mse == 0, torch.full_like(mse, 10.0 ** (-max_psnr / 10.0)), mse)
+
+    psnr = 20 * torch.log10(max_value / torch.sqrt(mse))
+    return psnr.mean().item() if psnr.shape[0] > 1 else psnr[0].item()
+
+
+@torch.no_grad()
+def compute_imagenet_accuracy(
+    image_features: torch.Tensor,
+    text_features: torch.Tensor,
+    labels: torch.Tensor,
+    logit_scale: float = 100.0,
+    logit_bias: float | None = None,
+    top_k: tuple[int, ...] = (1, 5),
+) -> dict[str, float]:
+    """Score zero-shot classification by top-k accuracy over similarity logits.
+
+    Args:
+        image_features: Image embeddings of shape (N, D), L2-normalized.
+        text_features: Per-class text embeddings of shape (num_classes, D), L2-normalized.
+        labels: Ground-truth class indices of shape (N,).
+        logit_scale: Multiplier applied to the image features before the matmul.
+        logit_bias: Optional constant added to every logit.
+        top_k: The k values to report.
+
+    Returns:
+        Mapping ``"top{k}_acc"`` -> accuracy in percent, one entry per k in ``top_k``.
+    """
+    similarity = logit_scale * image_features @ text_features.T
+    if logit_bias is not None:
+        similarity = similarity + logit_bias
+
+    # Column vector of targets, expanded per k to line up with the (N, k) predictions.
+    targets = labels.view(-1, 1)
+    num_samples = labels.numel()
+
+    def _accuracy_percent(k: int) -> float:
+        top_indices = similarity.topk(k, dim=1, largest=True, sorted=True).indices
+        hits = top_indices.eq(targets.expand_as(top_indices)).float().sum()
+        return (hits / num_samples).item() * 100
+
+    return {f"top{k}_acc": _accuracy_percent(k) for k in top_k}
+
+
+@torch.no_grad()
+def compute_codebook_usage(
+    indices: torch.Tensor,
+    num_codes: int = 65536,
+) -> dict[str, float]:
+    """Compute codebook usage statistics.
+
+    When ``torch.distributed`` is initialized, indices are gathered from all
+    ranks before the statistics are computed. Every rank must therefore call
+    this function, including ranks whose local ``indices`` is empty; such a
+    rank still takes part in the collectives instead of returning early.
+
+    Args:
+        indices: Quantized code indices.
+        num_codes: Total number of codes in codebook.
+
+    Returns:
+        Dictionary with:
+            - ``perplexity``: exponential of the code-usage entropy.
+            - ``active_codes``: number of codes used at least once.
+            - ``active_ratio``: active codes as a percentage of ``num_codes``.
+        All three are zero when there are no indices to count.
+    """
+    # Flatten indices
+    flat_indices = indices.flatten().long()
+    device = flat_indices.device
+
+    # Gather indices across all GPUs for accurate codebook usage
+    if torch.distributed.is_available() and torch.distributed.is_initialized():
+        world_size = torch.distributed.get_world_size()
+        # Gather sizes first (indices may have different lengths per GPU)
+        local_size = torch.tensor([flat_indices.numel()], dtype=torch.long, device=device)
+        sizes = [torch.zeros(1, dtype=torch.long, device=device) for _ in range(world_size)]
+        torch.distributed.all_gather(sizes, local_size)
+        counts = [int(size.item()) for size in sizes]
+        max_size = max(counts)
+
+        # Every rank sees the same gathered sizes, so this exit is taken by all
+        # ranks together and no rank is left waiting in the second all_gather.
+        if max_size == 0:
+            return {"perplexity": 0.0, "active_codes": 0, "active_ratio": 0.0}
+
+        # Pad indices to max_size for gathering
+        padded = torch.zeros(max_size, dtype=flat_indices.dtype, device=device)
+        padded[: flat_indices.numel()] = flat_indices
+        gathered = [torch.zeros_like(padded) for _ in range(world_size)]
+        torch.distributed.all_gather(gathered, padded)
+
+        # Concatenate only valid indices from each GPU
+        flat_indices = torch.cat([chunk[:count] for chunk, count in zip(gathered, counts)])
+
+    # Handle empty indices (single process, or nothing gathered)
+    if flat_indices.numel() == 0:
+        return {"perplexity": 0.0, "active_codes": 0, "active_ratio": 0.0}
+
+    # Compute code histogram
+    histogram = torch.bincount(flat_indices, minlength=num_codes).float()
+    histogram_norm = histogram / histogram.sum()
+
+    # Compute perplexity (exponential of entropy)
+    log_probs = torch.log(histogram_norm + 1e-10)
+    entropy = -torch.sum(histogram_norm * log_probs)
+    perplexity = torch.exp(entropy)
+
+    # Compute active code ratio
+    active_codes = (histogram > 0).sum()
+    active_ratio = active_codes.float() / num_codes
+
+    return {
+        "perplexity": perplexity.item(),
+        "active_codes": active_codes.item(),
+        "active_ratio": active_ratio.item() * 100,
+    }
+
+
+class FIDComputer:
+    """Compute Frechet Inception Distance between two sets of images.
+
+    Uses torchmetrics.image.fid.FrechetInceptionDistance for computation.
+
+    This wrapper is lazy and distributed-aware:
+    - It delays Inception construction until first use.
+    - It coordinates feature extractor initialization so only one local rank per
+      node performs the initial weight download/cache population.
+    - It can use a pre-downloaded weight file via ``feature_extractor_weights_path``
+      or the ``TOKENIZER_FID_WEIGHTS_PATH`` environment variable.
+    - In distributed mode, it reduces FID sufficient statistics explicitly
+      instead of relying on torchmetrics internal process-group synchronization.
+    """
+
+    def __init__(
+        self,
+        device: str = "cuda",
+        feature: int | torch.nn.Module = 2048,
+        normalize: bool = True,
+        sync_on_compute: bool = True,
+        dist_sync_on_step: bool = False,
+        feature_extractor_weights_path: str | None = None,
+    ) -> None:
+        """Initialize FID computer.
+
+        Args:
+            device: Device for computation.
+            feature: InceptionV3 feature dimension (2048 for final pool), or a
+                custom feature extractor module for testing/specialized use.
+            normalize: Whether to normalize input images to [0, 1].
+            sync_on_compute: Whether to synchronize metric state on compute.
+            dist_sync_on_step: Whether to synchronize on each update step.
+            feature_extractor_weights_path: Optional local path to torch-fidelity
+                Inception weights. Falls back to TOKENIZER_FID_WEIGHTS_PATH env var.
+        """
+        self.device = device
+        self._metric = None
+        self._normalize = normalize
+        self._feature = feature
+        self._sync_on_compute = sync_on_compute
+        self._dist_sync_on_step = dist_sync_on_step
+        self._feature_extractor_weights_path = feature_extractor_weights_path or os.environ.get(
+            "TOKENIZER_FID_WEIGHTS_PATH"
+        )
+        self._initialized = False
+
+    @staticmethod
+    def _is_local_leader() -> bool:
+        """Return True for the first process on the current node."""
+        local_rank = os.environ.get("LOCAL_RANK")
+        if local_rank is not None:
+            return int(local_rank) == 0
+        if torch.cuda.is_available():
+            return torch.cuda.current_device() == 0
+        return True
+
+    def _autocast_context(self) -> Any:
+        """Return an autocast context appropriate for the current device."""
+        device_type = torch.device(self.device).type
+        if device_type == "cuda":
+            return torch.autocast(device_type="cuda", enabled=False, dtype=torch.float32)
+        return nullcontext()
+
+    def _ensure_initialized(self) -> bool:
+        """Lazily build the torchmetrics FID metric on first use; return True if it is usable."""
+        if self._initialized:
+            return self._metric is not None
+
+        self._initialized = True  # set up front, so a failed attempt is not retried on later calls
+        init_exception: Exception | None = None
+        should_coordinate = False
+        is_local_leader = True
+        try:
+            import torch.distributed as dist
+            import torchmetrics.image.fid
+
+            is_distributed = dist.is_available() and dist.is_initialized()
+            should_coordinate = self._feature_extractor_weights_path is None and is_distributed
+            is_local_leader = self._is_local_leader()
+            if should_coordinate and not is_local_leader:
+                dist.barrier()  # non-leaders wait here; leaders reach their barrier in `finally` below
+
+            self._metric = torchmetrics.image.fid.FrechetInceptionDistance(
+                feature=self._feature,
+                normalize=self._normalize,
+                sync_on_compute=self._sync_on_compute if not is_distributed else False,
+                dist_sync_on_step=self._dist_sync_on_step if not is_distributed else False,
+                feature_extractor_weights_path=self._feature_extractor_weights_path,
+            )
+            self._metric.to(self.device)
+            logging.info(
+                f"Initialized FID metric with feature={self._feature}, "
+                f"normalize={self._normalize}, "
+                f"weights_path={self._feature_extractor_weights_path or '<download>'}"
+            )
+        except ImportError:
+            logging.warning("torchmetrics not available for FID computation")
+            self._metric = None
+        except Exception as e:
+            init_exception = e  # reported after the barrier below so leaders still release the others
+            self._metric = None
+        finally:
+            if should_coordinate and is_local_leader:
+                import torch.distributed as dist
+
+                dist.barrier()  # runs even when the leader's own initialization failed
+
+        if init_exception is not None:
+            logging.warning(
+                f"FID metric initialization failed: {init_exception}. "
+                "Pre-cache the torch-fidelity Inception weights or set TOKENIZER_FID_WEIGHTS_PATH."
+            )
+
+        return self._metric is not None
+
+    def to(self, device: str | torch.device) -> "FIDComputer":
+        """Record ``device`` as the target and move the metric there if it already exists.
+
+        FID keeps float64 statistics for numerical stability, so this wrapper must not be used to change dtypes.
+        """
+        self.device = str(device)
+        metric = self._metric
+        if metric is not None:
+            metric.to(device)
+        return self
+
+    def reset(self) -> None:
+        """Clear the metric's accumulated state; a no-op while the metric is not built."""
+        if (metric := self._metric) is not None:
+            metric.reset()
+
+    @torch.no_grad()
+    def update(
+        self,
+        images: torch.Tensor,
+        real: bool = True,
+    ) -> None:
+        """Update with a batch of images.
+
+        Separate calls for real and fake images.
+
+        Args:
+            images: Images in [0, 1] range, shape (B, C, H, W).
+            real: Whether these are real images (True) or fake/reconstructed (False).
+        """
+        if not self._ensure_initialized():
+            return
+
+        # Handle video tensors by merging the two leading dims: (B, T, C, H, W) -> (B*T, C, H, W)
+        if images.ndim == 5:
+            images = images.reshape(-1, *images.shape[-3:])  # NOTE(review): wrong for (B, C, T, H, W) input
+
+        if self._normalize and images.dtype == torch.uint8:
+            images = images.float() / 255.0
+
+        if self._normalize:
+            images = images.float().to(self.device)
+        else:
+            images = images.to(self.device)
+
+        # Update with autocast disabled for numerical stability
+        with self._autocast_context():
+            self._metric.update(images, real=real)
+
+    def _get_reduced_states(self) -> dict[str, torch.Tensor]:
+        """Snapshot the six FID accumulators, summing them across ranks when distributed."""
+        import torch.distributed as dist
+
+        state_names = (
+            "real_features_sum",
+            "real_features_cov_sum",
+            "real_features_num_samples",
+            "fake_features_sum",
+            "fake_features_cov_sum",
+            "fake_features_num_samples",
+        )
+        # Clone so the in-place all_reduce below never touches the metric's own buffers.
+        snapshot = {name: getattr(self._metric, name).detach().clone() for name in state_names}
+        if dist.is_available() and dist.is_initialized():
+            for tensor in snapshot.values():
+                dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
+        return snapshot
+
+    @staticmethod
+    def _compute_fid_from_states(states: dict[str, torch.Tensor]) -> torch.Tensor:
+        """Evaluate FID from per-distribution feature sums, covariance sums and sample counts."""
+        sum_r, sum_f = states["real_features_sum"], states["fake_features_sum"]
+        cov_sum_r, cov_sum_f = states["real_features_cov_sum"], states["fake_features_cov_sum"]
+        # Counts are cast to the feature dtype before they enter the arithmetic below.
+        n_r = states["real_features_num_samples"].to(sum_r.dtype)
+        n_f = states["fake_features_num_samples"].to(sum_f.dtype)
+
+        # The covariance below divides by (n - 1), so each side needs at least two samples.
+        too_few = n_r.item() < 2 or n_f.item() < 2
+        if too_few:
+            raise RuntimeError(
+                "More than one sample is required for both the real and fake distributions to compute FID"
+            )
+
+        mu_r = (sum_r / n_r).unsqueeze(0)
+        mu_f = (sum_f / n_f).unsqueeze(0)
+
+        # cov = (cov_sum - n * mu^T mu) / (n - 1)
+        sigma_r = (cov_sum_r - n_r * mu_r.t().mm(mu_r)) / (n_r - 1)
+        sigma_f = (cov_sum_f - n_f * mu_f.t().mm(mu_f)) / (n_f - 1)
+
+        mean_term = (mu_r.squeeze(0) - mu_f.squeeze(0)).square().sum(dim=-1)
+        trace_term = sigma_r.trace() + sigma_f.trace()
+        # Sum of the square roots of the eigenvalues of sigma_r @ sigma_f (real part only).
+        sqrt_term = torch.linalg.eigvals(sigma_r @ sigma_f).sqrt().real.sum(dim=-1)
+        return mean_term + trace_term - 2 * sqrt_term
+
+    def compute(self) -> float:
+        """Return the FID of everything accumulated so far.
+
+        Returns:
+            FID as a Python float (lower is better); NaN if the metric is unavailable or computing it fails.
+        """
+        nan = float("nan")
+        if not self._ensure_initialized():
+            return nan
+        try:
+            import torch.distributed as dist
+
+            # Distributed runs reduce the raw statistics manually; otherwise the metric computes it itself.
+            use_reduced = dist.is_available() and dist.is_initialized()
+            if not use_reduced:
+                with self._autocast_context():
+                    return self._metric.compute().item()
+            reduced = self._get_reduced_states()
+            return self._compute_fid_from_states(reduced).item()
+        except Exception as err:
+            logging.warning(f"FID computation failed: {err}")
+            return nan
+
+
+__all__ = [
+    "compute_psnr",
+    "compute_imagenet_accuracy",
+    "compute_codebook_usage",
+    "FIDComputer",
+]
diff --git a/cosmos_framework/model/tokenizer/evaluation/reconstruction_metrics.py b/cosmos_framework/model/tokenizer/evaluation/reconstruction_metrics.py
new file mode 100644
index 0000000..66db4fb
--- /dev/null
+++ b/cosmos_framework/model/tokenizer/evaluation/reconstruction_metrics.py
@@ -0,0 +1,497 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""Metric computation for tokenizer evaluation.
+
+This module provides metrics for evaluating tokenizer quality:
+    - PSNRMetric: Peak signal-to-noise ratio (per-sample, computed on uint8-quantized values)
+    - SSIMMetric: Structural similarity index (using torchmetrics)
+    - LPIPSMetric: Learned perceptual image patch similarity
+    - TokenizerMetric: Composite metric that includes codebook usage via compute_codebook_usage
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+import torch
+import torch.nn as nn
+
+# Import torchmetrics for SSIM and LPIPS
+try:
+    from torchmetrics.image import StructuralSimilarityIndexMeasure
+    from torchmetrics.image.lpip import LearnedPerceptualImagePatchSimilarity
+
+    HAS_TORCHMETRICS = True
+except ImportError:
+    HAS_TORCHMETRICS = False
+
+# Standard batch keys
+INPUT_KEY = "inputs"  # [0, 1] range for PSNR/SSIM
+RECON_KEY = "reconstructions"  # [0, 1] range for PSNR/SSIM
+INPUT_RAW_KEY = "inputs_raw"  # [-1, 1] range for LPIPS
+RECON_RAW_KEY = "reconstructions_raw"  # [-1, 1] range for LPIPS
+
+
+class TokenizerMetric(nn.Module):
+    """Composite metric module for tokenizer evaluation.
+
+    Combines multiple metrics and computes them in a single forward pass.
+
+    Args:
+        compute_psnr: Whether to compute PSNR.
+        compute_ssim: Whether to compute SSIM.
+        compute_lpips: Whether to compute LPIPS.
+        compute_code_usage: Whether to compute codebook usage; ``num_codes`` is forwarded to compute_codebook_usage.
+    """
+
+    def __init__(
+        self,
+        compute_psnr: bool = True,
+        compute_ssim: bool = True,
+        compute_lpips: bool = False,
+        compute_code_usage: bool = False,
+        num_codes: int = 65536,
+    ) -> None:
+        super().__init__()
+        self.compute_psnr = compute_psnr
+        self.compute_ssim = compute_ssim
+        self.compute_lpips = compute_lpips
+        self.compute_code_usage = compute_code_usage
+        self.num_codes = num_codes
+
+        if compute_psnr:
+            self.psnr = PSNRMetric()
+        if compute_ssim:
+            self.ssim = SSIMMetric()
+        if compute_lpips:
+            self.lpips = LPIPSMetric()
+
+    def forward(
+        self,
+        inputs: dict[str, torch.Tensor],
+        output_batch: dict[str, torch.Tensor],
+        iteration: int,
+    ) -> dict[str, Any]:
+        """Compute all enabled metrics.
+
+        Args:
+            inputs: Input batch with original images/videos. Should contain:
+                - "inputs": [0, 1] range for PSNR/SSIM
+                - "inputs_raw": [-1, 1] range for LPIPS
+            output_batch: Output batch with reconstructions. Should contain:
+                - "reconstructions": [0, 1] range for PSNR/SSIM
+                - "reconstructions_raw": [-1, 1] range for LPIPS
+            iteration: Current iteration (not used by this method).
+
+        Returns:
+            Dictionary of metric values. PSNR/SSIM/LPIPS return dicts with 'sum' and 'count'
+            for proper distributed averaging. Empty when "inputs" or "reconstructions" is missing.
+        """
+        metrics = {}
+
+        # [0, 1] range data for PSNR/SSIM
+        original = inputs.get(INPUT_KEY)
+        recon = output_batch.get(RECON_KEY)
+
+        # [-1, 1] range data for LPIPS
+        original_raw = inputs.get(INPUT_RAW_KEY)
+        recon_raw = output_batch.get(RECON_RAW_KEY)
+
+        if original is None or recon is None:
+            return metrics
+
+        if self.compute_psnr:
+            metrics["psnr"] = self.psnr(original, recon)
+
+        if self.compute_ssim:
+            metrics["ssim"] = self.ssim(original, recon)
+
+        if self.compute_lpips:
+            # Use [-1, 1] range data for LPIPS
+            # Fall back to converting [0, 1] to [-1, 1] if raw data not available
+            if original_raw is not None and recon_raw is not None:
+                metrics["lpips"] = self.lpips(original_raw, recon_raw)
+            else:
+                # Convert [0, 1] to [-1, 1] if raw data not provided
+                original_lpips = original * 2.0 - 1.0
+                recon_lpips = recon * 2.0 - 1.0
+                metrics["lpips"] = self.lpips(original_lpips, recon_lpips)
+
+        if self.compute_code_usage:
+            quant_info = output_batch.get("quant_info")
+            if quant_info is not None:
+                indices = quant_info.get("indices")
+                if indices is not None:
+                    from cosmos_framework.model.tokenizer.evaluation.metric import compute_codebook_usage
+
+                    code_stats = compute_codebook_usage(indices, self.num_codes)
+                    metrics["code_perplexity"] = code_stats["perplexity"]
+                    metrics["code_active_ratio"] = code_stats["active_ratio"]
+                    metrics["code_active_count"] = code_stats["active_codes"]
+
+        return metrics
+
+
+class PSNRMetric(nn.Module):
+    """Per-sample PSNR between an input batch and its reconstruction.
+
+    Both tensors are expected in [0, 1]; they are clamped to that range here
+    before being quantized.
+
+    Procedure:
+    - Scale the clamped values by 255 and cast to uint8
+    - Average the squared uint8 difference over C, H, W for each sample
+    - Turn each MSE into a PSNR with max_val=255 (zero MSE is capped at 100 dB)
+    - Report the sum and the sample count so callers can average across ranks
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def forward(self, original: torch.Tensor, reconstructed: torch.Tensor) -> dict:
+        """Accumulate PSNR over a batch.
+
+        Args:
+            original: Reference tensor in [0, 1]. Shape: (B, C, H, W) or (B, T, C, H, W).
+            reconstructed: Reconstruction in [0, 1], same shape as ``original``.
+
+        Returns:
+            Dict with 'sum' (per-sample PSNRs added up, as a float) and 'count'
+            (number of samples, with video frames counted individually).
+        """
+        # Videos are scored frame by frame: merge the batch and time axes
+        if original.dim() == 5:  # (B, T, C, H, W)
+            flat_shape = (original.shape[0] * original.shape[1], *original.shape[2:])
+            original = original.reshape(flat_shape)
+            reconstructed = reconstructed.reshape(flat_shape)
+
+        # Quantize both tensors to uint8 [0, 255], then go back to float for the arithmetic
+        ref_levels = (original.clamp(0, 1) * 255).byte().float()
+        rec_levels = (reconstructed.clamp(0, 1) * 255).byte().float()
+
+        # Mean squared error per sample, reduced over C, H, W
+        mse = torch.mean((ref_levels - rec_levels) ** 2, dim=[1, 2, 3])  # (B,)
+
+        # Identical images give zero MSE; substitute the MSE that corresponds to 100 dB
+        max_psnr = 100.0
+        floor_mse = 10.0 ** (-max_psnr / 10.0) * 255.0 * 255.0
+        mse = torch.where(
+            mse == 0,
+            torch.tensor(floor_mse, device=mse.device, dtype=mse.dtype),
+            mse,
+        )
+
+        # PSNR per sample with max_val=255
+        per_sample = 20 * torch.log10(255.0 / torch.sqrt(mse))
+        # Sum and count let the caller do a correct distributed average
+        return {"sum": per_sample.sum().item(), "count": per_sample.shape[0]}
+
+
+class SSIMMetric(nn.Module):
+    """Structural Similarity Index metric.
+
+    Uses torchmetrics for SSIM computation.
+    Expects inputs in [0, 1] range (already normalized by caller).
+    """
+
+    def __init__(self) -> None:
+        super().__init__()
+        if HAS_TORCHMETRICS:
+            # data_range=1.0 for [0, 1] normalized images
+            self._ssim_metric = StructuralSimilarityIndexMeasure(
+                data_range=1.0,
+                sync_on_compute=False,
+                dist_sync_on_step=False,
+            )
+        else:
+            self._ssim_metric = None
+
+    def forward(self, original: torch.Tensor, reconstructed: torch.Tensor) -> dict:
+        """Compute SSIM between original and reconstructed tensors.
+
+        Args:
+            original: Original tensor in [0, 1] range. Shape: (B, C, H, W) or (B, T, C, H, W).
+            reconstructed: Reconstructed tensor in [0, 1] range.
+
+        Returns:
+            Dict with 'sum' (sum of per-sample SSIMs) and 'count' (number of samples)
+            for proper distributed averaging ({"sum": 0.0, "count": 0} when torchmetrics is unavailable).
+        """
+        if not HAS_TORCHMETRICS or self._ssim_metric is None:
+            return {"sum": 0.0, "count": 0}
+
+        # Handle video by flattening temporal dimension
+        if original.dim() == 5:  # B, T, C, H, W
+            b, t, c, h, w = original.shape
+            original = original.reshape(b * t, c, h, w)
+            reconstructed = reconstructed.reshape(b * t, c, h, w)
+
+        # Clamp to [0, 1] range and convert to float32 for SSIM computation
+        original = original.clamp(0, 1).float()
+        reconstructed = reconstructed.clamp(0, 1).float()
+
+        batch_size = original.shape[0]
+
+        # Move metric to correct device
+        self._ssim_metric = self._ssim_metric.to(original.device)
+
+        # Reset metric state before computing to avoid accumulation from previous calls
+        self._ssim_metric.reset()
+
+        # Compute SSIM for each sample individually to get per-sample values
+        # (update -> compute -> reset per sample, so every compute() sees exactly one sample)
+        ssim_sum = 0.0
+        for i in range(batch_size):
+            orig_i = original[i : i + 1]
+            recon_i = reconstructed[i : i + 1]
+            # Update with single sample
+            self._ssim_metric.update(recon_i, orig_i)
+            # Compute returns the value for accumulated samples (just 1 here)
+            ssim_val = self._ssim_metric.compute()
+            ssim_sum += ssim_val.item()
+            # Reset for next sample to avoid accumulation
+            self._ssim_metric.reset()
+
+        # Return sum and count for proper distributed averaging
+        return {"sum": ssim_sum, "count": batch_size}
+
+
+class LPIPSMetric(nn.Module):
+    """Learned Perceptual Image Patch Similarity metric.
+
+    Uses torchmetrics LPIPS; the backbone is selected by ``net_type`` (VGG by default).
+    Expects inputs in [-1, 1] range for LPIPS computation.
+    Note: The forward() method expects [-1, 1] range directly (no conversion needed).
+    """
+
+    def __init__(self, net_type: str = "vgg") -> None:
+        super().__init__()
+        if HAS_TORCHMETRICS:
+            # LPIPS expects inputs in [-1, 1] range
+            self._lpips_metric = LearnedPerceptualImagePatchSimilarity(
+                net_type=net_type,
+                sync_on_compute=False,
+                dist_sync_on_step=False,
+            )
+        else:
+            self._lpips_metric = None
+
+    def forward(self, original: torch.Tensor, reconstructed: torch.Tensor) -> dict:
+        """Compute LPIPS between original and reconstructed tensors.
+
+        Args:
+            original: Original tensor in [-1, 1] range. Shape: (B, C, H, W) or (B, T, C, H, W).
+            reconstructed: Reconstructed tensor in [-1, 1] range.
+
+        Returns:
+            Dict with 'sum' (sum of per-sample LPIPS) and 'count' (number of samples)
+            for proper distributed averaging ({"sum": 0.0, "count": 0} when torchmetrics is unavailable).
+        """
+        if not HAS_TORCHMETRICS or self._lpips_metric is None:
+            return {"sum": 0.0, "count": 0}
+
+        # Handle video by flattening temporal dimension
+        if original.dim() == 5:  # B, T, C, H, W
+            b, t, c, h, w = original.shape
+            original = original.reshape(b * t, c, h, w)
+            reconstructed = reconstructed.reshape(b * t, c, h, w)
+
+        # LPIPS expects [-1, 1] range - clamp and convert to float32
+        original_lpips = original.clamp(-1.0, 1.0).float()
+        reconstructed_lpips = reconstructed.clamp(-1.0, 1.0).float()
+
+        batch_size = original.shape[0]
+
+        # Move metric to correct device
+        self._lpips_metric = self._lpips_metric.to(original.device)
+
+        # Reset metric state before computing to avoid accumulation from previous calls
+        self._lpips_metric.reset()
+
+        # Compute LPIPS one sample at a time (update -> compute -> reset) to obtain per-sample values
+        lpips_sum = 0.0
+        for i in range(batch_size):
+            orig_i = original_lpips[i : i + 1]
+            recon_i = reconstructed_lpips[i : i + 1]
+            # Update with single sample
+            self._lpips_metric.update(recon_i, orig_i)
+            # Compute returns the value for accumulated samples (just 1 here)
+            lpips_val = self._lpips_metric.compute()
+            lpips_sum += lpips_val.item()
+            # Reset for next sample to avoid accumulation
+            self._lpips_metric.reset()
+
+        # Return sum and count for proper distributed averaging
+        return {"sum": lpips_sum, "count": batch_size}
+
+
+def calculate_psnr(
+    original: torch.Tensor | list[torch.Tensor],
+    reconstructed: torch.Tensor | list[torch.Tensor],
+) -> torch.Tensor:
+    """Calculate PSNR between two tensors or lists of tensors.
+
+    Standalone helper for evaluation and training logging. Inputs are expected
+    in [0, 1]; they are clamped and converted to uint8 [0, 255] internally.
+
+    Supports multiple input formats:
+    - Lists of tensors (variable-size images from sparse_to_img_list)
+    - 5D tensors (B, T, C, H, W) for video
+    - 4D tensors (B, C, H, W) for batched images
+    - 3D tensors (C, H, W) for single images
+
+    Args:
+        original: Original image(s) in [0, 1] range. Can be tensor or list of tensors.
+        reconstructed: Reconstructed image(s) in [0, 1] range. Must match original format.
+
+    Returns:
+        PSNR value as a tensor (scalar, for distributed gathering).
+
+    Raises:
+        ValueError: If the lists are empty or differ in length, or the tensor shapes differ.
+    """
+    # Handle lists of tensors (from sparse_to_img_list)
+    if isinstance(original, list) and isinstance(reconstructed, list):
+        if len(original) != len(reconstructed):
+            raise ValueError(f"Image lists must have the same length. Got {len(original)} and {len(reconstructed)}")
+        if not original:
+            # An empty list has no mean; fail clearly instead of with ZeroDivisionError
+            raise ValueError("Cannot compute PSNR for empty image lists.")
+
+        # Average PSNR across all images
+        psnr_values = [calculate_psnr(orig, rec) for orig, rec in zip(original, reconstructed)]
+        return sum(psnr_values) / len(psnr_values)
+
+    # At this point, both should be tensors
+    if original.shape != reconstructed.shape:
+        raise ValueError(f"Images must have the same shape. Got {original.shape} and {reconstructed.shape}")
+
+    # Handle 3D tensor (C, H, W) - add batch dimension
+    if original.dim() == 3:
+        original = original.unsqueeze(0)
+        reconstructed = reconstructed.unsqueeze(0)
+
+    # Handle 5D tensor (B, T, C, H, W) - flatten batch and time
+    if original.dim() == 5:
+        b, t = original.shape[:2]
+        original = original.reshape(b * t, *original.shape[2:])
+        reconstructed = reconstructed.reshape(b * t, *reconstructed.shape[2:])
+
+    # Now we have 4D tensors (B, C, H, W): convert to uint8 [0, 255] range
+    original_uint8 = (original.detach().clamp(0, 1) * 255).byte()
+    reconstructed_uint8 = (reconstructed.detach().clamp(0, 1) * 255).byte()
+
+    # Compute MSE per sample on uint8 values
+    mse = torch.mean((original_uint8.float() - reconstructed_uint8.float()) ** 2, dim=[1, 2, 3])
+
+    # Handle zero MSE (identical images) - cap at 100 dB
+    max_psnr = 100.0
+    mse = torch.where(
+        mse == 0,
+        torch.tensor(10.0 ** (-max_psnr / 10.0) * 255.0 * 255.0, device=mse.device, dtype=mse.dtype),
+        mse,
+    )
+
+    # Compute PSNR with max_val=255 and return the mean over samples
+    psnr = 20 * torch.log10(torch.tensor(255.0, device=mse.device, dtype=mse.dtype)) - 10 * torch.log10(mse)
+    return psnr.mean()
+
+
+class Rank0FIDMetric(nn.Module):
+    """FID metric that runs only on rank 0 to avoid distributed sync issues.
+
+    Uses torchmetrics FrechetInceptionDistance internally but only computes
+    on rank 0's data to avoid NCCL collective operation mismatches caused by
+    torchmetrics/torch-fidelity's internal distributed synchronization.
+
+    Note: FID is computed only on rank 0's portion of the data (1/world_size),
+    which may be less representative than full dataset FID, but avoids
+    distributed synchronization issues.
+
+    Usage:
+        fid = Rank0FIDMetric(rank=rank).to(device)
+
+        # During evaluation loop (only rank 0 updates)
+        for batch in dataloader:
+            fid.update(real_images, fake_images)
+
+        # Compute FID (only rank 0 has valid result)
+        if rank == 0:
+            fid_value = fid.compute()
+    """
+
+    def __init__(self, rank: int = 0, feature_dim: int = 2048) -> None:
+        super().__init__()
+        self.rank = rank
+        self.feature_dim = feature_dim
+        self._fid_metric = None
+
+        # Only initialize FID metric on rank 0
+        if self.rank == 0:
+            try:
+                from torchmetrics.image.fid import FrechetInceptionDistance
+
+                # normalize=True means input is [0, 1] float, not uint8
+                self._fid_metric = FrechetInceptionDistance(
+                    feature=feature_dim,
+                    normalize=True,
+                    sync_on_compute=False,
+                    dist_sync_on_step=False,
+                )
+            except ImportError:
+                pass  # torchmetrics missing: FID stays disabled (update() no-ops, compute() returns inf)
+
+    @torch.no_grad()
+    def update(self, real_images: torch.Tensor, fake_images: torch.Tensor) -> None:
+        """Update FID statistics with a batch of real and fake images.
+
+        Only updates on rank 0.
+
+        Args:
+            real_images: Real images in [0, 1] range, shape (B, C, H, W) or (B, T, C, H, W)
+            fake_images: Fake/reconstructed images in [0, 1] range
+        """
+        if self.rank != 0 or self._fid_metric is None:
+            return
+
+        # Handle video format by flattening batch and time dimensions
+        if real_images.dim() == 5:  # (B, T, C, H, W)
+            real_images = real_images.reshape(-1, *real_images.shape[2:])
+            fake_images = fake_images.reshape(-1, *fake_images.shape[2:])
+
+        # Move metric to same device as images
+        device = real_images.device
+        self._fid_metric = self._fid_metric.to(device)
+
+        # torchmetrics FID update
+        self._fid_metric.update(real_images, real=True)
+        self._fid_metric.update(fake_images, real=False)
+
+    def compute(self) -> torch.Tensor:
+        """Compute FID from accumulated statistics.
+
+        Only valid on rank 0.
+
+        Returns:
+            FID value as a scalar tensor (inf if not rank 0 or metric unavailable)
+        """
+        if self.rank != 0 or self._fid_metric is None:
+            return torch.tensor(float("inf"))
+
+        return self._fid_metric.compute()
+
+    def reset(self) -> None:
+        """Reset accumulated statistics."""
+        if self._fid_metric is not None:
+            self._fid_metric.reset()
+
+
+__all__ = [
+    "TokenizerMetric",
+    "PSNRMetric",
+    "SSIMMetric",
+    "LPIPSMetric",
+    "Rank0FIDMetric",
+    "calculate_psnr",
+]
diff --git a/cosmos_framework/model/tokenizer/models/__init__.py b/cosmos_framework/model/tokenizer/models/__init__.py
index e3c835b..798b8f6 100644
--- a/cosmos_framework/model/tokenizer/models/__init__.py
+++ b/cosmos_framework/model/tokenizer/models/__init__.py
@@ -11,6 +11,8 @@
 
 # Generic utilities
 # Metrics (moved from utils to metrics module for consolidation)
+from cosmos_framework.model.tokenizer.evaluation.reconstruction_metrics import calculate_psnr
+
 # Dense runtime
 from cosmos_framework.model.tokenizer.models.dense_runtime import (
     DenseAutoencoderRuntime,
@@ -46,6 +48,7 @@
     # Utils
     "average_with_scatter_add",
     "batch_tensor_to_sparse",
+    "calculate_psnr",
     "crop_tensors_to_match",
     "reconstruct_from_temporal_slices",
     "resize_and_crop",
diff --git a/cosmos_framework/model/tokenizer/models/dense_backends.py b/cosmos_framework/model/tokenizer/models/dense_backends.py
index 841fd7c..35d9647 100644
--- a/cosmos_framework/model/tokenizer/models/dense_backends.py
+++ b/cosmos_framework/model/tokenizer/models/dense_backends.py
@@ -12,10 +12,12 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from cosmos_framework.model.tokenizer.models.modules.attention.full_attn import tensor_dense_scaled_dot_product_attention
+from cosmos_framework.model.tokenizer.models.modules.attention.full_attn import (
+    tensor_dense_scaled_dot_product_attention,
+)
 
-DenseRuntimeBackend = Literal["varlen", "batched", "auto"]
-DenseResolvedBackend = Literal["varlen", "batched"]
+DenseRuntimeBackend = Literal["varlen", "batched", "batched_with_padding", "auto"]
+DenseResolvedBackend = Literal["varlen", "batched", "batched_with_padding"]
 
 
 def resolve_dense_backend(backend: DenseRuntimeBackend, use_compile: bool) -> DenseResolvedBackend:
@@ -33,7 +35,7 @@ def resolve_dense_backend(backend: DenseRuntimeBackend, use_compile: bool) -> De
     """
     if backend == "auto":
         return "batched" if use_compile else "varlen"
-    if backend in ("varlen", "batched"):
+    if backend in ("varlen", "batched", "batched_with_padding"):
         return backend
     raise ValueError(f"Unsupported dense runtime backend: {backend}")
 
@@ -69,6 +71,8 @@ def run_varlen_block_stack(
 def run_batched_block_stack(
     blocks: nn.ModuleList,
     feats: torch.Tensor,
+    cu_seqlens_q: torch.Tensor | None = None,
+    max_q_seqlen: int | None = None,
     q_freqs_cis: torch.Tensor | None = None,
 ) -> torch.Tensor:
     """Run the dense batched block path over uniform `[B, S, D]` chunks."""
@@ -79,21 +83,31 @@ def run_batched_block_stack(
     for block in blocks:
         if block.training and getattr(block, "use_checkpoint", False):
             output = torch.utils.checkpoint.checkpoint(
-                partial(run_batched_block, block, q_freqs_cis=q_freqs_cis),
+                partial(
+                    run_batched_block,
+                    block,
+                    cu_seqlens_q=cu_seqlens_q,
+                    max_q_seqlen=max_q_seqlen,
+                    q_freqs_cis=q_freqs_cis,
+                ),
                 output,
                 use_reentrant=False,
             )
         else:
-            output = run_batched_block(block, output, q_freqs_cis=q_freqs_cis)
+            output = run_batched_block(
+                block, output, cu_seqlens_q=cu_seqlens_q, max_q_seqlen=max_q_seqlen, q_freqs_cis=q_freqs_cis
+            )
     return output
 
 
 def run_batched_block(
     block: nn.Module,
     feats: torch.Tensor,
+    cu_seqlens_q: torch.Tensor | None = None,
+    max_q_seqlen: int | None = None,
     q_freqs_cis: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    """Run one transformer block with the dense batched attention path."""
+    """Run one transformer block with the dense batched attention path with optional padding."""
     if getattr(block, "multiscale", None) is not None:
         raise NotImplementedError("Dense runtime batched backend does not support multiscale blocks.")
     if getattr(block.attn, "_type", None) != "self":
@@ -101,7 +115,9 @@ def run_batched_block(
 
     residual = feats
     h = block.norm1(feats)
-    h = run_batched_attention(block.attn, h, q_freqs_cis=q_freqs_cis)
+    h = run_batched_attention(
+        block.attn, h, cu_seqlens_q=cu_seqlens_q, max_q_seqlen=max_q_seqlen, q_freqs_cis=q_freqs_cis
+    )
     feats = residual + h
     residual = feats
     h = block.norm2(feats)
@@ -112,15 +128,19 @@ def run_batched_block(
 def run_batched_attention(
     attention: nn.Module,
     feats: torch.Tensor,
+    cu_seqlens_q: torch.Tensor | None = None,
+    max_q_seqlen: int | None = None,
     q_freqs_cis: torch.Tensor | None = None,
 ) -> torch.Tensor:
-    """Run one dense self-attention layer via the imaginaire attention frontend."""
+    """Run one dense self-attention layer via the cosmos_framework attention frontend."""
     if not hasattr(attention, "to_qkv"):
         raise ValueError("Dense runtime batched backend requires fused to_qkv linear projections.")
     if not hasattr(attention, "to_out"):
         raise ValueError("Dense runtime batched backend requires an output projection linear layer.")
 
+    # feats: [B, S_padded, hidden]  (S_padded = pad_to tokens per batch item, padded for CUDA graph)
     batch_size, seq_len, hidden_size = feats.shape
+    # qkv: [B, S_padded, 3, H, D]
     qkv = F.linear(feats, attention.to_qkv.weight, attention.to_qkv.bias).reshape(
         batch_size,
         seq_len,
@@ -128,9 +148,11 @@ def run_batched_attention(
         attention.num_heads,
         -1,
     )
+    # q, k, v: [B, S_padded, H, D]
     q, k, v = qkv.unbind(dim=2)
 
     if getattr(attention, "qk_rms_norm", False):
+        # flatten to [B*S_padded, H, D] for per-token RMSNorm, then restore
         flat_q = q.reshape(batch_size * seq_len, attention.num_heads, -1)
         flat_k = k.reshape(batch_size * seq_len, attention.num_heads, -1)
         q = attention.q_rms_norm(flat_q).reshape(batch_size, seq_len, attention.num_heads, -1)
@@ -139,6 +161,7 @@ def run_batched_attention(
     if getattr(attention, "use_rope", False):
         if q_freqs_cis is None:
             raise ValueError("Dense runtime batched backend requires precomputed q_freqs_cis when RoPE is enabled.")
+        # flatten to [B*S_padded, H, D] for RoPE application, then restore to [B, S_padded, H, D]
         flat_q = q.reshape(batch_size * seq_len, attention.num_heads, -1)
         flat_k = k.reshape(batch_size * seq_len, attention.num_heads, -1)
         flat_q, flat_k = attention.rope.apply_rotary_emb(
@@ -150,6 +173,16 @@ def run_batched_attention(
         q = flat_q.reshape(batch_size, seq_len, attention.num_heads, -1)
         k = flat_k.reshape(batch_size, seq_len, attention.num_heads, -1)
 
-    h = tensor_dense_scaled_dot_product_attention(q=q, k=k, v=v)
+    # q, k, v: [B, S_padded, H, D] → attention → h: [B, S_padded, H, D]
+    h = tensor_dense_scaled_dot_product_attention(
+        q=q,
+        k=k,
+        v=v,
+        cu_seqlens_q=cu_seqlens_q,
+        cu_seqlens_kv=cu_seqlens_q,
+        max_q_seqlen=max_q_seqlen,
+        max_kv_seqlen=max_q_seqlen,
+    )
+    # h: [B, S_padded, hidden]
     h = h.reshape(batch_size, seq_len, hidden_size)
     return F.linear(h, attention.to_out.weight, attention.to_out.bias)
diff --git a/cosmos_framework/model/tokenizer/models/dense_runtime.py b/cosmos_framework/model/tokenizer/models/dense_runtime.py
index 02a938e..59c6b9d 100644
--- a/cosmos_framework/model/tokenizer/models/dense_runtime.py
+++ b/cosmos_framework/model/tokenizer/models/dense_runtime.py
@@ -115,12 +115,43 @@ def __init__(
         self,
         autoencoder: AutoencoderKL,
         backend: DenseRuntimeBackend = "auto",
+        pad_frames: int = 0,
+        pixel_trim: bool = True,
+        chunk_size: int = 16,
     ) -> None:
-        """Initialize the dense runtime wrapper."""
+        """Initialize the dense runtime wrapper.
+
+        Args:
+            autoencoder: The sparse autoencoder to wrap.
+            backend: Backend selection for block-stack execution.
+            pad_frames: Number of boundary frames to replicate at each end of
+                every temporal chunk before encoding.  Must be divisible by
+                ``patch_size[0]``.  Set ``0`` to disable boundary padding;
+                set ``>0`` (typically one temporal patch, e.g. ``4``) to give
+                the non-causal encoder additional context across chunk edges,
+                eliminating the per-chunk-boundary PSNR dip.
+            pixel_trim: When ``True`` and ``pad_frames > 0``, boundary latents
+                are kept in the encoded output and trimmed in pixel space after
+                decoding.  When ``False``, boundary latents are trimmed
+                immediately after encoding.  ``True`` should always be used
+                for the best reconstruction quality.
+            chunk_size: Number of *raw* frames consumed by the encoder per
+                temporal chunk.  Forwarded to
+                ``autoencoder.num_sample_frames_batch_size`` and used to
+                slice the input video into encode batches.  Must satisfy
+                ``2 * pad_frames < chunk_size``.  Default ``16``.
+        """
         super().__init__()
         self.autoencoder = autoencoder
         self.backend = backend
-        self._metadata_cache = {}
+        autoencoder.num_sample_frames_batch_size = chunk_size
+        if pad_frames < 0:
+            raise ValueError(f"pad_frames must be non-negative, got {pad_frames}.")
+        if 2 * pad_frames >= chunk_size:
+            raise ValueError(f"pad_frames must be less than chunk_size / 2, got {pad_frames=}, {chunk_size=}.")
+        self.pad_frames = pad_frames
+        self.pixel_trim = pixel_trim
+        self._metadata_cache: dict[DenseGridMetadataKey, DenseGridMetadata] = {}
         self.cg_compiled = False
 
     @classmethod
@@ -128,10 +159,19 @@ def from_autoencoder(
         cls,
         autoencoder: AutoencoderKL,
         backend: DenseRuntimeBackend = "auto",
+        pad_frames: int = 0,
+        pixel_trim: bool = True,
+        chunk_size: int = 16,
     ) -> "DenseAutoencoderRuntime":
         """Build a dense runtime from a supported sparse autoencoder."""
         cls._validate_autoencoder(autoencoder)
-        return cls(autoencoder=autoencoder, backend=backend)
+        return cls(
+            autoencoder=autoencoder,
+            backend=backend,
+            pad_frames=pad_frames,
+            pixel_trim=pixel_trim,
+            chunk_size=chunk_size,
+        )
 
     @staticmethod
     def _validate_autoencoder(autoencoder: AutoencoderKL) -> None:
@@ -204,9 +244,21 @@ def clear_metadata_cache(self) -> None:
         """Drop cached dense-grid metadata."""
         self._metadata_cache.clear()
 
-    def encode(self, dense_video: torch.Tensor, sample_posterior: bool = False) -> torch.Tensor:
+    def encode(
+        self,
+        dense_video: torch.Tensor,
+        sample_posterior: bool = False,
+        pad_to: int | None = None,
+        chunk_raw_frames: int | None = None,
+        encode_chunk_batch_size: int = 1,
+    ) -> torch.Tensor:
         """Encode a dense video tensor into latent moments or posterior samples."""
-        moments = self.encode_moments(dense_video)
+        moments = self.encode_moments(
+            dense_video,
+            chunk_raw_frames=chunk_raw_frames,
+            pad_to=pad_to,
+            encode_chunk_batch_size=encode_chunk_batch_size,
+        )
         if not sample_posterior:
             return moments
         return self._sample_dense_posterior(moments)
@@ -215,8 +267,48 @@ def encode_moments(
         self,
         video: torch.Tensor,
         chunk_raw_frames: int | None = None,
+        pad_to: int | None = None,
+        encode_chunk_batch_size: int = 1,
     ) -> torch.Tensor:
-        """Encode a dense video tensor into `[B, T_p, H_p, W_p, 2C]` latent moments."""
+        """Encode a dense video tensor into `[B, T_p, H_p, W_p, 2C]` latent moments.
+
+        Args:
+            video: Dense channels-last video tensor ``[B, T, H, W, 3]``.
+            chunk_raw_frames: Number of raw (unpadded) frames per encoder chunk.
+                Defaults to ``self.encoder_chunk_spec.raw_frames - 2 * self.pad_frames``.
+            pad_to: Sequence-length padding target for the ``batched_with_padding``
+                backend (reduces CUDA graph recapture).
+            encode_chunk_batch_size: Number of full temporal chunks to encode
+                together. Only supported for the ``batched`` backend; defaults
+                to ``1`` (sequential encoding).
+
+        Shapes (example):
+            Config: ``patch_size = (1, 16, 16)``, ``chunk_size = 16``,
+            ``pad_frames = 1`` (1 raw frame replicated on each chunk edge).
+            Whole-video input: ``[B=1, T=28, H=480, W=832, 3]``.
+
+            Per-chunk pipeline (frame 0 is encoded alone first; the loop then slices the
+            remaining 27 frames into chunks of at most ``chunk_raw_frames = 16 - 2*1 = 14``):
+
+              ::
+
+                step                              shape                          notes
+                ---------------------------------------------------------------------------------------
+                1. raw chunk                      [1, 14, 480, 832, 3]           1 of 2 chunks
+                2. after input padding            [1, 16, 480, 832, 3]           1 pre + 14 raw + 1 post
+                3. after encoding (latent)        [1, 16,  30,  52, 2C]          T_p=16/1, H_p=480/16, W_p=832/16
+                4. after decoding                 [1, 16, 480, 832, 3]
+                5. after pixel trim               [1, 14, 480, 832, 3]           drops pad_frames=1 pixel frame
+                                                                                 on each end
+
+            Across both chunks the concatenated pixel-space output is
+            ``[1, 28, 480, 832, 3]``; the latent fed to a downstream DiT is
+            ``[1, 32, 30, 52, 2C]``.
+
+            For images (``T = 1``) the input is repeated to one temporal patch
+            (``T = patch_time``) and ``latents_per_boundary = 0``, so the
+            DiT-facing shape is ``[B, 1, H_p, W_p, 2C]``.
+        """
         if video.ndim != 5:
             raise ValueError(f"Dense runtime expects 5D video tensor, got {video.ndim}D")
         if video.shape[4] != 3:
@@ -224,31 +316,139 @@ def encode_moments(
 
         batch_size, raw_frames, height, width, _ = video.shape
         patch_time, patch_height, patch_width = self.patch_size
-        if raw_frames % patch_time != 0:
+        assert batch_size == 1 or encode_chunk_batch_size == 1, (
+            "Dense runtime with batching currently only supports batch size 1"
+        )
+
+        if chunk_raw_frames is None:
+            chunk_raw_frames = self.encoder_chunk_spec.raw_frames
+            chunk_raw_frames = chunk_raw_frames - 2 * self.pad_frames
+            assert chunk_raw_frames > 0, (
+                f"Padding frames must be less than chunk_raw_frames, got {chunk_raw_frames=}, {self.pad_frames=}."
+            )
+        if chunk_raw_frames <= 0:
+            raise ValueError(f"chunk_raw_frames must be positive, got {chunk_raw_frames}.")
+        if encode_chunk_batch_size < 1:
+            raise ValueError(f"encode_chunk_batch_size must be positive, got {encode_chunk_batch_size}.")
+        if encode_chunk_batch_size > 1 and self.backend != "batched":
+            raise ValueError(
+                f"encode_chunk_batch_size > 1 is only supported for the batched backend, got backend={self.backend!r}."
+            )
+
+        # If the input is a single image (T == 1), repeat it patch_time times to form one temporal patch.
+        if raw_frames == 1:
+            is_image = True
+            video = video.repeat(1, patch_time, 1, 1, 1)
+            raw_frames = patch_time
+        else:
+            is_image = False
+
+        if (chunk_raw_frames + 2 * self.pad_frames) % patch_time != 0:
             raise ValueError(
-                f"Dense runtime requires frame count divisible by patch_size[0]={patch_time}, got {raw_frames}."
+                f"chunk_raw_frames + 2 * pad_frames must be divisible by patch_size[0]={patch_time}, got {chunk_raw_frames=}, {self.pad_frames=}."
             )
+
+        if not is_image:
+            # Noncausal scheme: frame 0 is its own chunk; any short tail chunk must be patch_time-divisible once padded.
+            remaining_frames = raw_frames - 1
+            remainder = remaining_frames % chunk_raw_frames
+            if remainder != 0 and (remainder + 2 * self.pad_frames) % patch_time != 0:
+                raise ValueError(
+                    f"Dense runtime requires (frame_count - 1) equal to "
+                    f"chunk_raw_frames * N + patch_time - 2 * pad_frames, "
+                    f"got {raw_frames=}, {chunk_raw_frames=}, {self.pad_frames=}, {patch_time=}."
+                )
         if height % patch_height != 0 or width % patch_width != 0:
             raise ValueError(
                 "Dense runtime requires spatial dimensions divisible by patch size "
                 f"{(patch_height, patch_width)}, got {(height, width)}."
             )
+        pad_frames = self.pad_frames
+        if not is_image:
+            latents_per_boundary = pad_frames // patch_time
+        else:
+            latents_per_boundary = 0
 
         del batch_size
-        if chunk_raw_frames is None:
-            chunk_raw_frames = self.encoder_chunk_spec.raw_frames
-        if chunk_raw_frames <= 0:
-            raise ValueError(f"chunk_raw_frames must be positive, got {chunk_raw_frames}.")
-        if chunk_raw_frames % patch_time != 0:
-            raise ValueError(
-                f"chunk_raw_frames must be divisible by patch_size[0]={patch_time}, got {chunk_raw_frames}."
-            )
+
+        # Default pad_to to the token count of a full padded chunk to reduce the number of captured CUDA graphs.
+        if self.backend == "batched_with_padding" and pad_to is None and self.cg_compiled:
+            width_patches = width // patch_width
+            height_patches = height // patch_height
+            padded_chunk_frames = chunk_raw_frames + 2 * pad_frames
+            temporal_patches = padded_chunk_frames // patch_time
+            pad_to = width_patches * height_patches * temporal_patches
+
+        use_chunk_batching = self.backend == "batched" and encode_chunk_batch_size > 1
+
+        def _pad_video_chunk(video_chunk: torch.Tensor) -> torch.Tensor:
+            if pad_frames > 0 and not is_image:
+                # UniAE chunk-wise encoding suffers a PSNR dip at chunk boundaries
+                # because the non-causal encoder lacks context beyond the chunk edges.
+                # Padding each chunk with pad_frames replicated boundary frames on both
+                # sides gives the encoder that context, eliminating the boundary dip.
+                # In practice pad_frames=4 (one temporal patch) is used.
+                # The padding is removed later: as pixel frames after decoding when pixel_trim is set,
+                # otherwise as boundary latents right after encoding (see _trim_boundary_latents below).
+                pre = video_chunk[:, 0:1].expand(-1, pad_frames, -1, -1, -1)  # [B,pad,H,W,3]
+                post = video_chunk[:, -1:].expand(-1, pad_frames, -1, -1, -1)  # [B,pad,H,W,3]
+                video_chunk = torch.cat([pre, video_chunk, post], dim=1)  # [B,t+2*pad,H,W,3]
+            return video_chunk
+
+        def _trim_boundary_latents(encoded_chunk: torch.Tensor) -> torch.Tensor:
+            if latents_per_boundary > 0 and not self.pixel_trim:
+                t_latent = encoded_chunk.shape[1]
+                encoded_chunk = encoded_chunk[:, latents_per_boundary : t_latent - latents_per_boundary]
+            return encoded_chunk
+
+        def _encode_padded_chunks(padded_chunks: list[torch.Tensor]) -> list[torch.Tensor]:
+            if len(padded_chunks) == 1:
+                encoded = self._encode_video_chunk(padded_chunks[0], pad_to=pad_to)
+                return [_trim_boundary_latents(encoded)]
+
+            batched_video = torch.cat(padded_chunks, dim=0)  # [B*G,t_pad,H,W,3]
+            encoded = self._encode_video_chunk(batched_video, pad_to=pad_to)  # [B*G,T_lat,Hp,Wp,2C]
+            per_video_batch = padded_chunks[0].shape[0]
+            return list(_trim_boundary_latents(encoded).split(per_video_batch, dim=0))
+
         encoded_chunks: list[torch.Tensor] = []
-        for start_frame in range(0, raw_frames, chunk_raw_frames):
-            end_frame = min(start_frame + chunk_raw_frames, raw_frames)
-            video_chunk = video[:, start_frame:end_frame]
-            encoded_chunk = self._encode_video_chunk(video_chunk)
-            encoded_chunks.append(encoded_chunk)
+
+        if not is_image:
+            # Noncausal first chunk: encode frame 0 alone, replicated patch_time times
+            # so the encoder sees exactly patch_time frames → 1 latent L₁.
+            # pad_to=None: this chunk has 1 temporal patch, not the regular chunk shape.
+            first_frame = video[:, 0:1]
+            first_chunk = first_frame.expand(-1, patch_time, -1, -1, -1).contiguous()
+            encoded_chunks.append(self._encode_video_chunk(first_chunk, pad_to=None))
+
+        chunk_specs = [
+            (
+                start_frame,
+                end_frame := min(start_frame + chunk_raw_frames, raw_frames),
+                end_frame - start_frame == chunk_raw_frames,
+            )
+            for start_frame in range(0 if is_image else 1, raw_frames, chunk_raw_frames)
+        ]
+
+        pending_full_chunks: list[torch.Tensor] = []
+        for start_frame, end_frame, is_full_chunk in chunk_specs:
+            padded_chunk = _pad_video_chunk(video[:, start_frame:end_frame])  # [B,t_pad,H,W,3]
+
+            if not use_chunk_batching or not is_full_chunk:
+                if pending_full_chunks:
+                    encoded_chunks.extend(_encode_padded_chunks(pending_full_chunks))
+                    pending_full_chunks = []
+                encoded_chunks.extend(_encode_padded_chunks([padded_chunk]))
+                continue
+
+            pending_full_chunks.append(padded_chunk)
+            if len(pending_full_chunks) == encode_chunk_batch_size:
+                encoded_chunks.extend(_encode_padded_chunks(pending_full_chunks))
+                pending_full_chunks = []
+
+        if pending_full_chunks:
+            encoded_chunks.extend(_encode_padded_chunks(pending_full_chunks))
+
         return torch.cat(encoded_chunks, dim=1)
 
     def decode(
@@ -256,7 +456,12 @@ def decode(
         dense_latent: torch.Tensor,
         chunk_raw_frames: int | None = None,
     ) -> torch.Tensor:
-        """Decode a dense latent grid into a dense channels-last video tensor."""
+        """Decode a dense latent grid into a dense channels-last video tensor.
+
+        When ``pixel_trim`` is enabled and ``pad_frames > 0``, the latent
+        contains boundary tokens from encoding.  After decoding, the
+        corresponding boundary pixel frames are trimmed from each chunk.
+        """
         if self.decoder_cache_spec.patch_frames != 0:
             raise NotImplementedError("Dense runtime decoder V1 does not support KV cache.")
 
@@ -272,11 +477,32 @@ def decode(
                     f"chunk_raw_frames must be divisible by patch_size[0]={self.patch_size[0]}, got {chunk_raw_frames}."
                 )
             chunk_patch_frames = chunk_raw_frames // self.patch_size[0]
+
+        pad_frames = self.pad_frames
+        trim_pixel = self.pixel_trim and pad_frames > 0
+
+        patch_time = self.patch_size[0]
+        # Images were encoded as a single latent (no noncausal first chunk).
+        # Videos have temporal_patches > 1: latent[0] is the noncausal first frame.
+        is_image = temporal_patches == 1
+
         decoded_chunks: list[torch.Tensor] = []
-        for start_patch in range(0, temporal_patches, chunk_patch_frames):
+
+        if not is_image:
+            # Noncausal first latent: decode → patch_time pixel frames, keep last
+            # (the reconstructed original first frame).
+            first_latent = latent[:, 0:1]
+            decoded_first = self._decode_latent_chunk(first_latent)  # [B, patch_time, H, W, C]
+            decoded_chunks.append(decoded_first[:, -1:])
+
+        for start_patch in range(0 if is_image else 1, temporal_patches, chunk_patch_frames):
             end_patch = min(start_patch + chunk_patch_frames, temporal_patches)
             latent_chunk = latent[:, start_patch:end_patch]
-            decoded_chunks.append(self._decode_latent_chunk(latent_chunk))
+            decoded_chunk = self._decode_latent_chunk(latent_chunk)
+            # Images have no boundary padding, so pixel trim only applies to video chunks.
+            if trim_pixel and not is_image:
+                decoded_chunk = decoded_chunk[:, pad_frames:-pad_frames]
+            decoded_chunks.append(decoded_chunk)
         return torch.cat(decoded_chunks, dim=1)
 
     def _metadata_cache_key(
@@ -361,13 +587,22 @@ def _canonicalize_dense_latent(self, dense_latent: torch.Tensor) -> torch.Tensor
             )
         return latent.contiguous()
 
-    def _encode_video_chunk(self, dense_video_chunk: torch.Tensor) -> torch.Tensor:
+    def _encode_video_chunk(
+        self,
+        dense_video_chunk: torch.Tensor,
+        pad_to: int | None = None,
+    ) -> torch.Tensor:
         """Encode one dense video chunk into projected latent moments."""
+        assert pad_to is None or self.backend == "batched_with_padding", (
+            "pad_to is only supported for batched_with_padding backend"
+        )
+
         batch_size, raw_frames, height, width, _ = dense_video_chunk.shape
         patch_time, patch_height, patch_width = self.patch_size
         temporal_patches = raw_frames // patch_time
         height_patches = height // patch_height
         width_patches = width // patch_width
+        seq_len = temporal_patches * height_patches * width_patches
 
         patch_feats = self._patchify_dense_video(dense_video_chunk)
         metadata = self._get_or_build_grid_metadata(
@@ -380,14 +615,43 @@ def _encode_video_chunk(self, dense_video_chunk: torch.Tensor) -> torch.Tensor:
             device=patch_feats.device,
             dtype=self.autoencoder.encoder.input_layer.weight.dtype,
         )
+
+        learned_pe = metadata.learned_pe
+        rope_freqs_cis = metadata.rope_freqs_cis
+
+        needs_padding = pad_to is not None and pad_to > seq_len
+        if pad_to is not None and pad_to < seq_len:
+            raise ValueError(f"pad_to ({pad_to}) must be >= sequence length ({seq_len}).")
+        if needs_padding:
+            if batch_size != 1:
+                raise ValueError(
+                    f"pad_to requires batch_size=1 for correct varlen masking, got batch_size={batch_size}."
+                )
+            pad_amount = pad_to - seq_len
+            patch_feats = F.pad(patch_feats, (0, 0, 0, pad_amount))
+            if learned_pe is not None:
+                learned_pe = F.pad(learned_pe, (0, 0, 0, pad_amount))
+            if rope_freqs_cis is not None:
+                rope_pad = torch.zeros(
+                    pad_amount,
+                    rope_freqs_cis.shape[-1],
+                    dtype=rope_freqs_cis.dtype,
+                    device=rope_freqs_cis.device,
+                )
+                rope_freqs_cis = torch.cat([rope_freqs_cis, rope_pad], dim=0)
+
         moments = self._encode_chunk_core(
             patch_feats,
-            learned_pe=metadata.learned_pe,
-            rope_freqs_cis=metadata.rope_freqs_cis,
+            learned_pe=learned_pe,
+            rope_freqs_cis=rope_freqs_cis,
             q_seqlen=metadata.q_seqlen,
             cu_seqlens_q=metadata.cu_seqlens,
-            max_q_seqlen=metadata.max_seq_len,
+            max_q_seqlen=metadata.max_seq_len if not needs_padding else pad_to,
         )
+
+        if needs_padding:
+            moments = moments[:, :seq_len]
+
         if self.cg_compiled:
             moments = moments.clone()
         return moments.reshape(batch_size, temporal_patches, height_patches, width_patches, -1)
@@ -568,6 +832,17 @@ def _run_block_stack(
                 feats,
                 q_freqs_cis=rope_freqs_cis,
             )
+        if backend == "batched_with_padding":
+            assert feats.shape[0] == 1, (
+                "batched_with_padding backend only supports batch_size=1, due to varlen kernel requirements."
+            )
+            return run_batched_block_stack(
+                blocks,
+                feats,
+                cu_seqlens_q=cu_seqlens_q,
+                max_q_seqlen=max_q_seqlen,
+                q_freqs_cis=rope_freqs_cis,
+            )
         raise ValueError(f"Unsupported dense runtime backend: {backend}")
 
     def _get_or_build_grid_metadata(
diff --git a/cosmos_framework/model/tokenizer/models/modules/attention/full_attn.py b/cosmos_framework/model/tokenizer/models/modules/attention/full_attn.py
index d5a80e1..78b077f 100644
--- a/cosmos_framework/model/tokenizer/models/modules/attention/full_attn.py
+++ b/cosmos_framework/model/tokenizer/models/modules/attention/full_attn.py
@@ -85,8 +85,12 @@ def tensor_dense_scaled_dot_product_attention(
     q: torch.Tensor,
     k: torch.Tensor,
     v: torch.Tensor,
+    cu_seqlens_q: torch.Tensor | None = None,
+    cu_seqlens_kv: torch.Tensor | None = None,
+    max_q_seqlen: int | None = None,
+    max_kv_seqlen: int | None = None,
 ) -> torch.Tensor:
-    """Apply dense batched attention via the imaginaire attention frontend."""
+    """Apply dense batched attention via the cosmos_framework attention frontend."""
     if q.ndim != 4 or k.ndim != 4 or v.ndim != 4:
         raise ValueError(
             "Dense tensor attention expects [B, S, H, D]-style tensors, "
@@ -99,6 +103,10 @@ def tensor_dense_scaled_dot_product_attention(
         query=q.contiguous(),
         key=k.contiguous(),
         value=v.contiguous(),
+        cumulative_seqlen_Q=cu_seqlens_q,
+        cumulative_seqlen_KV=cu_seqlens_kv,
+        max_seqlen_Q=max_q_seqlen,
+        max_seqlen_KV=max_kv_seqlen,
     )
 
 
diff --git a/cosmos_framework/model/tokenizer/models/modules/quantizers/fsq.py b/cosmos_framework/model/tokenizer/models/modules/quantizers/fsq.py
index 911d49a..6a59f0a 100644
--- a/cosmos_framework/model/tokenizer/models/modules/quantizers/fsq.py
+++ b/cosmos_framework/model/tokenizer/models/modules/quantizers/fsq.py
@@ -13,7 +13,7 @@
 
 import torch
 import torch.nn as nn
-from einops import pack, rearrange, unpack
+from einops import rearrange
 from torch import Tensor, int32
 from torch.amp import autocast
 from torch.nn import Module
@@ -104,16 +104,6 @@ def default(*args):
     return None
 
 
-def pack_one(t, pattern):
-    """Pack single tensor."""
-    return pack([t], pattern)
-
-
-def unpack_one(t, ps, pattern):
-    """Unpack single tensor."""
-    return unpack(t, ps, pattern)[0]
-
-
 def round_ste(z: Tensor) -> Tensor:
     """Round with straight through gradients.
 
diff --git a/cosmos_framework/model/tokenizer/models/modules/quantizers/lfq.py b/cosmos_framework/model/tokenizer/models/modules/quantizers/lfq.py
index 369e4b5..b309827 100644
--- a/cosmos_framework/model/tokenizer/models/modules/quantizers/lfq.py
+++ b/cosmos_framework/model/tokenizer/models/modules/quantizers/lfq.py
@@ -20,8 +20,9 @@
 from typing import TYPE_CHECKING
 
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
-from einops import pack, rearrange, reduce, unpack
+from einops import rearrange, reduce
 from torch.nn import Module
 
 if TYPE_CHECKING:
@@ -38,6 +39,8 @@
     ["per_sample_entropy", "codebook_entropy", "commitment", "avg_probs"],
 )
 
+_MAX_DIM_ONLY_CODEBOOK_BITS = 20
+
 
 # Helper functions
 
@@ -55,16 +58,6 @@ def default(*args):
     return None
 
 
-def pack_one(t, pattern):
-    """Pack single tensor."""
-    return pack([t], pattern)
-
-
-def unpack_one(t, ps, pattern):
-    """Unpack single tensor."""
-    return unpack(t, ps, pattern)[0]
-
-
 def entropy(prob: torch.Tensor) -> torch.Tensor:
     """Compute entropy of probability distribution."""
     return (-prob * torch.log(prob + 1e-5)).sum(dim=-1)
@@ -163,7 +156,7 @@ def __init__(
         batch_maximization_weight: float = 1.0,
         token_factorization: bool = False,
         factorized_bits: list[int] = [9, 9],
-    ):
+    ) -> None:
         """Initialize LFQ.
 
         Args:
@@ -177,14 +170,25 @@ def __init__(
         """
         super().__init__()
 
-        # Validation
-        assert exists(dim) or exists(codebook_size), "either dim or codebook_size must be specified for LFQ"
-        assert not exists(codebook_size) or log2(codebook_size).is_integer(), (
-            f"codebook size must be power of 2 (suggested {2 ** ceil(log2(codebook_size))})"
-        )
+        if not exists(dim) and not exists(codebook_size):
+            raise ValueError("Either dim or codebook_size must be specified for LFQ.")
+
+        if codebook_size is None:
+            if dim is None:
+                raise ValueError("dim must be specified when codebook_size is omitted.")
+            if dim > _MAX_DIM_ONLY_CODEBOOK_BITS:
+                raise ValueError(
+                    "LFQ dim-only construction materializes a 2**dim codebook; "
+                    f"got dim={dim}. Pass codebook_size explicitly or use dim <= {_MAX_DIM_ONLY_CODEBOOK_BITS}."
+                )
+            resolved_codebook_size = 2**dim
+        else:
+            resolved_codebook_size = int(codebook_size)
+        if not log2(resolved_codebook_size).is_integer():
+            raise ValueError(f"codebook size must be power of 2 (suggested {2 ** ceil(log2(resolved_codebook_size))})")
 
-        self.codebook_size = default(codebook_size, lambda: 2**dim)
-        self.codebook_dim = int(log2(codebook_size))
+        self.codebook_size = resolved_codebook_size
+        self.codebook_dim = int(log2(self.codebook_size))
 
         codebook_dims = self.codebook_dim * num_codebooks
         dim = default(dim, codebook_dims)
@@ -195,6 +199,8 @@ def __init__(
         self.dim = dim
         self.codebook_dim = self.codebook_dim
         self.num_codebooks = num_codebooks
+        self.project_in = nn.Linear(self.dim, codebook_dims) if has_projections else nn.Identity()
+        self.project_out = nn.Linear(codebook_dims, self.dim) if has_projections else nn.Identity()
 
         # For entropy loss
         self.sample_minimization_weight = sample_minimization_weight
@@ -212,9 +218,9 @@ def __init__(
         self.register_buffer("zero", torch.tensor(0.0), persistent=False)
 
         # Build codebook
-        all_codes = torch.arange(codebook_size)
-        bits = self.indices_to_bits(all_codes)
-        codebook = bits * 2.0 - 1.0
+        all_codes = torch.arange(self.codebook_size)  # [K]
+        bits = self.indices_to_bits(all_codes)  # [K,Z]
+        codebook = bits * 2.0 - 1.0  # [K,Z]
 
         self.register_buffer("codebook", codebook, persistent=False)
 
@@ -295,11 +301,11 @@ def decode(self, x: torch.Tensor) -> torch.Tensor:
         Returns:
             Decoded tensor with values in {-1, 1}.
         """
-        x = self.indices_to_bits(x)
-        x = x.to(self.dtype)
-        x = x * 2 - 1
-        x = rearrange(x, "... NC Z-> ... (NC Z)")
-        return x
+        x = self.indices_to_bits(x)  # [...,NC,Z]
+        x = x.to(self.dtype)  # [...,NC,Z]
+        x = x * 2 - 1  # [...,NC,Z]
+        x = rearrange(x, "... NC Z-> ... (NC Z)")  # [...,Dq]
+        return self.project_out.to(x.dtype)(x)  # [...,D]
 
     def forward(
         self,
@@ -310,8 +316,8 @@ def forward(
         return_loss: bool = True,
         fp32_loss_computation: bool = False,
     ) -> (
-        tuple[torch.Tensor, torch.Tensor, torch.Tensor]
-        | tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], LossBreakdown]
+        tuple[torch.Tensor, torch.Tensor | tuple[torch.Tensor, torch.Tensor], torch.Tensor]
+        | tuple[tuple[torch.Tensor, torch.Tensor | tuple[torch.Tensor, torch.Tensor], torch.Tensor], LossBreakdown]
     ):
         """Forward pass for LFQ on SparseTensor.
 
@@ -324,57 +330,53 @@ def forward(
             fp32_loss_computation: Whether to compute losses in fp32.
 
         Returns:
-            Tuple of (quantized_feats, entropy_loss, indices).
+            Tuple of (quantized_feats, indices, quantizer_loss).
             If return_loss_breakdown, also returns LossBreakdown.
         """
         # Extract features from sparse tensor
         N, feature_dim = x.shape
 
-        # Validate feature dimension
         expected_dim = self.num_codebooks * self.codebook_dim
-        if feature_dim != expected_dim:
-            raise ValueError(
-                f"Feature dimension {feature_dim} doesn't match expected {expected_dim} "
-                f"(num_codebooks={self.num_codebooks} * codebook_dim={self.codebook_dim})"
-            )
+        if feature_dim != self.dim:
+            raise ValueError(f"Feature dimension {feature_dim} doesn't match LFQ input dimension {self.dim}.")
 
-        # Reshape for codebook processing: [N, num_codebooks, codebook_dim]
-        features_reshaped = x.view(N, self.num_codebooks, self.codebook_dim)
+        features = self.project_in.to(x.dtype)(x.view(N, feature_dim))  # [N,Dq]
+        features_reshaped = features.view(N, self.num_codebooks, self.codebook_dim)  # [N,NC,Z]
 
         # Quantization step
-        codebook_value = torch.tensor(1.0, device=x.device, dtype=x.dtype)
-        quantized_values = torch.where(features_reshaped > 0, codebook_value, -codebook_value)
+        codebook_value = torch.tensor(1.0, device=x.device, dtype=x.dtype)  # []
+        quantized_values = torch.where(features_reshaped > 0, codebook_value, -codebook_value)  # [N,NC,Z]
 
         # Index calculation
         if self.token_factorization:
-            pre_bits = quantized_values[..., : self.factorized_bits[0]]
-            post_bits = quantized_values[..., self.factorized_bits[0] :]
+            pre_bits = quantized_values[..., : self.factorized_bits[0]]  # [N,NC,Zpre]
+            post_bits = quantized_values[..., self.factorized_bits[0] :]  # [N,NC,Zpost]
 
-            indices_pre = ((pre_bits > 0).int() * self.pre_mask.int()).sum(-1)
-            indices_post = ((post_bits > 0).int() * self.post_mask.int()).sum(-1)
+            indices_pre = ((pre_bits > 0).int() * self.pre_mask.int()).sum(-1)  # [N,NC]
+            indices_post = ((post_bits > 0).int() * self.post_mask.int()).sum(-1)  # [N,NC]
 
-            indices_pre_flat = indices_pre.flatten()
-            indices_post_flat = indices_post.flatten()
+            indices_pre_flat = indices_pre.flatten()  # [N*NC]
+            indices_post_flat = indices_post.flatten()  # [N*NC]
             sparse_indices_quantized = (indices_pre_flat, indices_post_flat)
         else:
-            indices = ((quantized_values > 0).int() * self.mask.int()).sum(-1)
-            sparse_indices_quantized = indices.flatten()
+            indices = ((quantized_values > 0).int() * self.mask.int()).sum(-1)  # [N,NC]
+            sparse_indices_quantized = indices.flatten()  # [N*NC]
 
         # Entropy loss (training only)
         if self.training and return_loss:
             if fp32_loss_computation:
-                features_flat_fp32 = features_reshaped.view(-1, self.codebook_dim).float()
-                codebook_fp32 = self.codebook.float()
+                features_flat_fp32 = features_reshaped.view(-1, self.codebook_dim).float()  # [N*NC,Z]
+                codebook_fp32 = self.codebook.float()  # [K,Z]
             else:
-                features_flat_fp32 = features_reshaped.view(-1, self.codebook_dim)
-                codebook_fp32 = self.codebook
+                features_flat_fp32 = features_reshaped.view(-1, self.codebook_dim)  # [N*NC,Z]
+                codebook_fp32 = self.codebook.to(features_flat_fp32.dtype)  # [K,Z]
 
-            logits = 2 * torch.mm(features_flat_fp32, codebook_fp32.T)
+            logits = 2 * torch.mm(features_flat_fp32, codebook_fp32.T)  # [N*NC,K]
 
             if mask is not None:
                 if mask.shape[0] != N:
                     raise ValueError(f"Mask shape {mask.shape} doesn't match number of features {N}")
-                mask_expanded = mask.unsqueeze(1).repeat(1, self.num_codebooks).view(-1)
+                mask_expanded = mask.unsqueeze(1).repeat(1, self.num_codebooks).view(-1)  # [N*NC]
             else:
                 mask_expanded = None
 
@@ -388,44 +390,46 @@ def forward(
             )
         else:
             dtype = torch.float32 if fp32_loss_computation else x.dtype
-            per_sample_entropy = torch.tensor(0.0, dtype=dtype, device=x.device)
-            codebook_entropy = torch.tensor(0.0, dtype=dtype, device=x.device)
-            entropy_aux_loss = torch.tensor(0.0, dtype=dtype, device=x.device)
+            per_sample_entropy = torch.tensor(0.0, dtype=dtype, device=x.device)  # []
+            codebook_entropy = torch.tensor(0.0, dtype=dtype, device=x.device)  # []
+            entropy_aux_loss = torch.tensor(0.0, dtype=dtype, device=x.device)  # []
 
         # Commitment loss
         if self.training:
             if fp32_loss_computation:
-                features_fp32 = features_reshaped.float()
-                quantized_fp32 = quantized_values.float()
+                features_fp32 = features_reshaped.float()  # [N,NC,Z]
+                quantized_fp32 = quantized_values.float()  # [N,NC,Z]
             else:
-                features_fp32 = features_reshaped
-                quantized_fp32 = quantized_values
+                features_fp32 = features_reshaped  # [N,NC,Z]
+                quantized_fp32 = quantized_values  # [N,NC,Z]
 
-            commit_loss = F.mse_loss(features_fp32, quantized_fp32.detach(), reduction="none")
+            commit_loss = F.mse_loss(features_fp32, quantized_fp32.detach(), reduction="none")  # [N,NC,Z]
 
             if mask is not None:
-                mask_expanded = mask.view(N, 1, 1).expand_as(commit_loss)
-                commit_loss = commit_loss[mask_expanded].mean()
+                mask_expanded = mask.view(N, 1, 1).expand_as(commit_loss)  # [N,NC,Z]
+                commit_loss = commit_loss[mask_expanded].mean()  # []
             else:
-                commit_loss = commit_loss.mean()
+                commit_loss = commit_loss.mean()  # []
         else:
             dtype = torch.float32 if fp32_loss_computation else x.dtype
-            commit_loss = torch.tensor(0.0, dtype=dtype, device=x.device)
+            commit_loss = torch.tensor(0.0, dtype=dtype, device=x.device)  # []
 
         # Straight-through estimator
-        quantized_values_ste = features_reshaped + (quantized_values - features_reshaped).detach()
+        quantized_values_ste = features_reshaped + (quantized_values - features_reshaped).detach()  # [N,NC,Z]
 
         # Output construction
-        quantized_feats = quantized_values_ste.view(N, feature_dim)
+        quantized_feats = quantized_values_ste.view(N, expected_dim)  # [N,Dq]
+        quantized_feats = self.project_out.to(x.dtype)(quantized_feats)  # [N,D]
 
         # Ensure fp32 losses if requested
         if self.training and return_loss and fp32_loss_computation:
-            entropy_aux_loss = entropy_aux_loss.float()
-            per_sample_entropy = per_sample_entropy.float()
-            codebook_entropy = codebook_entropy.float()
-            commit_loss = commit_loss.float()
+            entropy_aux_loss = entropy_aux_loss.float()  # []
+            per_sample_entropy = per_sample_entropy.float()  # []
+            codebook_entropy = codebook_entropy.float()  # []
+            commit_loss = commit_loss.float()  # []
 
-        ret = (quantized_feats, entropy_aux_loss, sparse_indices_quantized)
+        quantizer_loss = commit_loss + entropy_aux_loss  # []
+        ret = (quantized_feats, sparse_indices_quantized, quantizer_loss)
 
         if not return_loss_breakdown:
             return ret
@@ -435,5 +439,5 @@ def forward(
             per_sample_entropy,
             codebook_entropy,
             commit_loss,
-            torch.tensor(0.0, dtype=placeholder_dtype, device=x.device),
+            torch.tensor(0.0, dtype=placeholder_dtype, device=x.device),  # []
         )
diff --git a/cosmos_framework/model/tokenizer/models/modules/quantizers/residual_vq.py b/cosmos_framework/model/tokenizer/models/modules/quantizers/residual_vq.py
index d2f160d..806ac12 100644
--- a/cosmos_framework/model/tokenizer/models/modules/quantizers/residual_vq.py
+++ b/cosmos_framework/model/tokenizer/models/modules/quantizers/residual_vq.py
@@ -291,6 +291,78 @@ def __init__(
 
         self.commitment_loss = commitment_loss
 
+    def to_code_shape(self, x: torch.Tensor) -> torch.Tensor:
+        """Fold dense latents [B,H,W,D] into code-grid vectors [B,h,w,E]; flat [N,E] input passes through."""
+        embed_dim = self.codebooks[0].weight.shape[-1]  # E, read from the first codebook's weight
+        if x.ndim == 2 and x.shape[-1] == embed_dim:
+            return x  # [N,E] already flat code vectors; returned unchanged
+
+        if x.ndim != 4 or tuple(x.shape[1:]) != tuple(self.latent_shape):
+            raise ValueError(
+                f"Expected latent shape [B,{tuple(self.latent_shape)}] or [N,{embed_dim}], got {tuple(x.shape)}."
+            )
+
+        batch_size = x.shape[0]
+        latent_h, latent_w, latent_dim = [int(dim) for dim in self.latent_shape]
+        code_h, code_w, _ = [int(dim) for dim in self.code_shape]
+        height_factor = latent_h // code_h  # Hs: latent rows per code cell (floor division)
+        width_factor = latent_w // code_w  # Ws: latent columns per code cell (floor division)
+
+        x = x.reshape(batch_size, code_h, height_factor, code_w, width_factor, latent_dim)  # [B,h,Hs,w,Ws,D]
+        x = x.permute(0, 1, 3, 2, 4, 5).contiguous()  # [B,h,w,Hs,Ws,D]
+        return x.reshape(batch_size, code_h, code_w, embed_dim)  # [B,h,w,E]
+
+    def to_latent_shape(self, embeds: torch.Tensor) -> torch.Tensor:
+        """Unfold code-grid embeddings [B,h,w,E] into dense latents [B,H,W,D]; flat [N,E] input passes through."""
+        embed_dim = self.codebooks[0].weight.shape[-1]  # E, read from the first codebook's weight
+        if embeds.ndim == 2 and embeds.shape[-1] == embed_dim:
+            return embeds  # [N,E] flat vectors have no grid to unfold; returned unchanged
+
+        code_h, code_w, _ = [int(dim) for dim in self.code_shape]
+        if embeds.ndim != 4 or tuple(embeds.shape[1:3]) != (code_h, code_w) or embeds.shape[-1] != embed_dim:
+            raise ValueError(
+                f"Expected code embedding shape [B,{code_h},{code_w},{embed_dim}] or [N,{embed_dim}], "
+                f"got {tuple(embeds.shape)}."
+            )
+
+        batch_size = embeds.shape[0]
+        latent_h, latent_w, latent_dim = [int(dim) for dim in self.latent_shape]
+        height_factor = latent_h // code_h  # Hs: latent rows per code cell (floor division)
+        width_factor = latent_w // code_w  # Ws: latent columns per code cell (floor division)
+
+        embeds = embeds.reshape(batch_size, code_h, code_w, height_factor, width_factor, latent_dim)  # [B,h,w,Hs,Ws,D]
+        embeds = embeds.permute(0, 1, 3, 2, 4, 5).contiguous()  # [B,h,Hs,w,Ws,D]
+        return embeds.reshape(batch_size, latent_h, latent_w, latent_dim)  # [B,H,W,D]
+
+    def _embed_code_slices(self, code: torch.Tensor) -> list[torch.Tensor]:
+        """Embed each depth slice of `code` separately and return the list; depths are not summed here."""
+        if code.shape[-1] != self.code_shape[-1]:  # last axis must equal the configured code depth
+            raise ValueError(f"Expected code depth {self.code_shape[-1]}, got code shape {tuple(code.shape)}.")
+
+        code = code.long()  # [...,Dq]; cast to int64 before the lookup
+        code_slices = torch.chunk(code, chunks=code.shape[-1], dim=-1)  # list[[...,1]], one slice per depth
+
+        if self.shared_codebook:  # all depths share codebooks[0]
+            embeds = [self.codebooks[0].embed(code_slice) for code_slice in code_slices]  # list[[...,1,E]]
+        else:  # depth i uses codebooks[i]
+            embeds = [
+                self.codebooks[i].embed(code_slice) for i, code_slice in enumerate(code_slices)
+            ]  # list[[...,1,E]]
+
+        return embeds
+
+    def get_codes_from_indices(self, indices: torch.Tensor) -> torch.Tensor:
+        """Decode indices [...,Dq] (or [N] when depth is 1) into embeddings summed over codebook depth."""
+        if indices.ndim == 1:  # 1-D input is only unambiguous for a depth-1 quantizer
+            if self.code_shape[-1] != 1:
+                raise ValueError(
+                    f"Flat indices require one codebook, but this RQ bottleneck has depth {self.code_shape[-1]}."
+                )
+            indices = indices.unsqueeze(-1)  # [N,1], add the depth axis
+
+        embeds = self._embed_code_slices(indices)  # list[[...,1,E]]
+        return torch.cat(embeds, dim=-2).sum(-2)  # [...,E], concatenate depths then sum them away
+
     def quantize(self, x: torch.Tensor) -> tuple[list[torch.Tensor], torch.Tensor]:
         """Quantize input using residual quantization.
 
@@ -365,22 +437,22 @@ def embed_code(self, code: torch.Tensor) -> torch.Tensor:
         """Decode codes to embeddings.
 
         Args:
-            code: Code tensor of shape (B, h, w, d).
+            code: Code tensor of shape (B, h, w, d) or flat shape (N, d).
 
         Returns:
-            Embedded features of shape (B, H, W, embed_dim).
+            Embedded features of shape (B, H, W, D) or flat shape (N, embed_dim).
         """
-        assert code.shape[1:] == self.code_shape
-
-        code_slices = torch.chunk(code, chunks=code.shape[-1], dim=-1)
+        if code.ndim == 2:
+            return self.get_codes_from_indices(code)  # [N,E]
 
-        if self.shared_codebook:
-            embeds = [self.codebooks[0].embed(code_slice) for i, code_slice in enumerate(code_slices)]
-        else:
-            embeds = [self.codebooks[i].embed(code_slice) for i, code_slice in enumerate(code_slices)]
+        if tuple(code.shape[1:]) != tuple(self.code_shape):
+            raise ValueError(
+                f"Expected code shape [B,{tuple(self.code_shape)}] or [N,{self.code_shape[-1]}], "
+                f"got {tuple(code.shape)}."
+            )
 
-        embeds = torch.cat(embeds, dim=-2).sum(-2)
-        embeds = self.to_latent_shape(embeds)
+        embeds = self.get_codes_from_indices(code)  # [B,h,w,E]
+        embeds = self.to_latent_shape(embeds)  # [B,H,W,D]
 
         return embeds
 
@@ -397,18 +469,11 @@ def embed_code_with_depth(self, code: torch.Tensor, to_latent_shape: bool = Fals
         Returns:
             Tuple of (embedded features, None).
         """
-        assert code.shape[-1] == self.code_shape[-1]
-
-        code_slices = torch.chunk(code, chunks=code.shape[-1], dim=-1)
-
-        if self.shared_codebook:
-            embeds = [self.codebooks[0].embed(code_slice) for i, code_slice in enumerate(code_slices)]
-        else:
-            embeds = [self.codebooks[i].embed(code_slice) for i, code_slice in enumerate(code_slices)]
+        embeds = self._embed_code_slices(code)  # list[[...,1,E]]
 
         if to_latent_shape:
-            embeds = [self.to_latent_shape(embed.squeeze(-2)).unsqueeze(-2) for embed in embeds]
-        embeds = torch.cat(embeds, dim=-2)
+            embeds = [self.to_latent_shape(embed.squeeze(-2)).unsqueeze(-2) for embed in embeds]  # list[[B,H,W,1,D]]
+        embeds = torch.cat(embeds, dim=-2)  # [...,Dq,E] or [B,H,W,Dq,D]
 
         return embeds, None
 
@@ -429,25 +494,23 @@ def embed_partial_code(
         Returns:
             Quantized feature map.
         """
-        assert code.shape[1:] == self.code_shape
-        assert code_idx < code.shape[-1]
+        if tuple(code.shape[1:]) != tuple(self.code_shape):
+            raise ValueError(f"Expected code shape [B,{tuple(self.code_shape)}], got {tuple(code.shape)}.")
+        if code_idx >= code.shape[-1]:
+            raise ValueError(f"code_idx must be smaller than code depth {code.shape[-1]}, got {code_idx}.")
 
         B, h, w, _ = code.shape
 
-        code_slices = torch.chunk(code, chunks=code.shape[-1], dim=-1)
-        if self.shared_codebook:
-            embeds = [self.codebooks[0].embed(code_slice) for i, code_slice in enumerate(code_slices)]
-        else:
-            embeds = [self.codebooks[i].embed(code_slice) for i, code_slice in enumerate(code_slices)]
+        embeds = self._embed_code_slices(code)  # list[[B,h,w,1,E]]
 
         if decode_type == "select":
-            embeds = embeds[code_idx].view(B, h, w, -1)
+            embeds = embeds[code_idx].view(B, h, w, -1)  # [B,h,w,E]
         elif decode_type == "add":
-            embeds = torch.cat(embeds[: code_idx + 1], dim=-2).sum(-2)
+            embeds = torch.cat(embeds[: code_idx + 1], dim=-2).sum(-2)  # [B,h,w,E]
         else:
             raise NotImplementedError(f"{decode_type} is not implemented in partial decoding")
 
-        embeds = self.to_latent_shape(embeds)
+        embeds = self.to_latent_shape(embeds)  # [B,H,W,D]
 
         return embeds
 
@@ -468,30 +531,30 @@ def get_soft_codes(
         Returns:
             Tuple of (soft codes, hard codes).
         """
-        x = self.to_code_shape(x)
+        x = self.to_code_shape(x)  # [N,E] or [B,h,w,E]
 
-        residual_feature = x.detach().clone()
+        residual_feature = x.detach().clone()  # [N,E] or [B,h,w,E]
         soft_code_list = []
         code_list = []
 
         n_codebooks = self.code_shape[-1]
         for i in range(n_codebooks):
             codebook = self.codebooks[i]
-            distances = codebook.compute_distances(residual_feature)
-            soft_code = F.softmax(-distances / temp, dim=-1)
+            distances = codebook.compute_distances(residual_feature)  # [N,K] or [B,h,w,K]
+            soft_code = F.softmax(-distances / temp, dim=-1)  # [N,K] or [B,h,w,K]
 
             if stochastic:
-                soft_code_flat = soft_code.reshape(-1, soft_code.shape[-1])
-                code = torch.multinomial(soft_code_flat, 1)
-                code = code.reshape(*soft_code.shape[:-1])
+                soft_code_flat = soft_code.reshape(-1, soft_code.shape[-1])  # [M,K]
+                code = torch.multinomial(soft_code_flat, 1)  # [M,1]
+                code = code.reshape(*soft_code.shape[:-1])  # [N] or [B,h,w]
             else:
-                code = distances.argmin(dim=-1)
-            quants = codebook.embed(code)
-            residual_feature -= quants
+                code = distances.argmin(dim=-1)  # [N] or [B,h,w]
+            quants = codebook.embed(code)  # [N,E] or [B,h,w,E]
+            residual_feature -= quants  # [N,E] or [B,h,w,E]
 
             code_list.append(code.unsqueeze(-1))
             soft_code_list.append(soft_code.unsqueeze(-2))
 
-        code = torch.cat(code_list, dim=-1)
-        soft_code = torch.cat(soft_code_list, dim=-2)
+        code = torch.cat(code_list, dim=-1)  # [N,Dq] or [B,h,w,Dq]
+        soft_code = torch.cat(soft_code_list, dim=-2)  # [N,Dq,K] or [B,h,w,Dq,K]
         return soft_code, code
diff --git a/cosmos_framework/model/tokenizer/models/sparse_autoencoder.py b/cosmos_framework/model/tokenizer/models/sparse_autoencoder.py
index 2da33f6..2400287 100644
--- a/cosmos_framework/model/tokenizer/models/sparse_autoencoder.py
+++ b/cosmos_framework/model/tokenizer/models/sparse_autoencoder.py
@@ -1354,8 +1354,9 @@ def __init__(
         self._logged_decoder_temporal_plan = False
 
         # Load SigLIP2 pretrained model (text encoder always needed for text alignment)
-        # Use HF_HOME env var for cache, with local_files_only to avoid network downloads
-        hf_cache_dir = os.environ.get("HF_HOME")
+        # Prefer HF_HUB_CACHE (set by configure_hf_cache_env to HF_HOME/hub) so cache_dir matches the hub
+        # layout where models are stored; fall back to HF_HOME when HF_HUB_CACHE is unset or empty.
+        hf_cache_dir = os.environ.get("HF_HUB_CACHE") or os.environ.get("HF_HOME")
         local_files_only = hf_cache_dir is not None
         pretrained_model = None
         pretrained_vision_model = None
@@ -1419,7 +1420,7 @@ def __init__(
         if use_quantizer:
             if self.quantizer_type == "lfq":
                 self.quantizer = LFQ(
-                    dim=latent_channels,
+                    dim=self.quantizer_feature_dim // self.quantizer_chunk_size,
                     codebook_size=self.quantizer_codebook_size // self.quantizer_chunk_size,
                     num_codebooks=self.quantizer_num_codebooks,
                     sample_minimization_weight=1.0,
@@ -1517,7 +1518,7 @@ def __init__(
             self.post_logit_scale = None
             self.post_logit_bias = None
 
-        # Text decoder (Qwen3-based causal LM for image-to-text generation)
+        # Text decoder (configured causal LM for image-to-text generation)
         if self.use_text_decoder and text_decoder_model_name is not None:
             from cosmos_framework.model.tokenizer.models.text_decoder import (
                 TextDecoderWrapper,
@@ -1595,6 +1596,22 @@ def disable_slicing(self):
         """Disable sliced VAE decoding."""
         self.use_slicing = False
 
+    def _frame_count_to_latent_steps(self, frame_count: int, name: str, *, allow_zero: bool = False) -> int:
+        """Return frame_count // patch_size[0]; raise ValueError if the count is out of range or not divisible."""
+        frame_count = int(frame_count)  # coerce to a plain int before validating
+        temporal_patch_size = int(self.patch_size[0])  # patch_size[0] is the divisor for frame counts
+        if temporal_patch_size <= 0:
+            raise ValueError(f"patch_size[0] must be positive, got {temporal_patch_size}.")
+        if frame_count == 0 and allow_zero:  # zero is valid only when the caller opts in
+            return 0
+        if frame_count < 0 and allow_zero:  # same rejection as below, but worded for the zero-allowed case
+            raise ValueError(f"{name} must be non-negative, got {frame_count}.")
+        if frame_count <= 0:  # default mode: zero and negatives are both rejected
+            raise ValueError(f"{name} must be positive, got {frame_count}.")
+        if frame_count % temporal_patch_size != 0:  # no silent truncation of partial patches
+            raise ValueError(f"{name} must be divisible by patch_size[0]={temporal_patch_size}, got {frame_count}.")
+        return frame_count // temporal_patch_size
+
     def _encode(
         self,
         x: SparseTensor,
@@ -1617,7 +1634,10 @@ def _encode(
         else:
             num_sample_frames_batch_size = self.num_sample_frames_batch_size
 
-        frame_batch_size = num_sample_frames_batch_size // self.patch_size[0]
+        frame_batch_size = self._frame_count_to_latent_steps(
+            int(num_sample_frames_batch_size),
+            "num_sample_frames_batch_size",
+        )
 
         temporal_slices = x.split_by_temporal_batches(frame_batch_size, adjust_temporal=True)
         processed_slices = []
@@ -1827,13 +1847,26 @@ def _get_decode_temporal_plan(
             num_sample_frames_batch_size = self.num_sample_frames_batch_size
 
         if training:
-            frame_batch_size = num_sample_frames_batch_size // self.patch_size[0]
+            frame_batch_size = self._frame_count_to_latent_steps(
+                int(num_sample_frames_batch_size),
+                "num_sample_frames_batch_size",
+            )
             frame_batch_strides = frame_batch_size
             kv_cache_size = 0
         else:
-            frame_batch_size = self.inference_num_sample_frames_batch_size // self.patch_size[0]
-            frame_batch_strides = self.inference_num_sample_frames_stride // self.patch_size[0]
-            kv_cache_size = self.inference_kv_cache_size // self.patch_size[0]
+            frame_batch_size = self._frame_count_to_latent_steps(
+                self.inference_num_sample_frames_batch_size,
+                "inference_num_sample_frames_batch_size",
+            )
+            frame_batch_strides = self._frame_count_to_latent_steps(
+                self.inference_num_sample_frames_stride,
+                "inference_num_sample_frames_stride",
+            )
+            kv_cache_size = self._frame_count_to_latent_steps(
+                self.inference_kv_cache_size,
+                "inference_kv_cache_size",
+                allow_zero=True,
+            )
             if frame_batch_size < frame_batch_strides:
                 raise ValueError(
                     "Non-causal inference requires inference_num_sample_frames_batch_size >= "
diff --git a/cosmos_framework/model/tokenizer/models/text_decoder.py b/cosmos_framework/model/tokenizer/models/text_decoder.py
index 322dd67..234a5f1 100644
--- a/cosmos_framework/model/tokenizer/models/text_decoder.py
+++ b/cosmos_framework/model/tokenizer/models/text_decoder.py
@@ -1,7 +1,5 @@
-# -----------------------------------------------------------------------------
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# -----------------------------------------------------------------------------
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
 
 """Text decoder modules for image-to-text generation in the tokenizer.
 
@@ -1257,8 +1255,8 @@ def generate_answer(
         """Generate an answer to a question about an image.
 
         Qwen3 uses its native chat template. Nemotron uses its own native
-        chat template with ``<|im_start|>``, ``<|im_end|>``, and
-        ``</think>`` in no-thinking mode.
+        chat template with ``<|im_start|>``, ``<|im_end|>``, and the
+        ``</think>\\n`` no-thinking prefix.
 
         Args:
             image_feats_tensor: [N, encoder_dim] features for ONE image.
diff --git a/cosmos_framework/model/tokenizer/models/utils.py b/cosmos_framework/model/tokenizer/models/utils.py
index 33eaad8..d105228 100644
--- a/cosmos_framework/model/tokenizer/models/utils.py
+++ b/cosmos_framework/model/tokenizer/models/utils.py
@@ -10,12 +10,11 @@
     - Temporal utilities: split_temporal_dimension, restore_original_shape,
       reconstruct_from_temporal_slices
 
-Note: Metrics like calculate_psnr have been moved to projects.cosmos3.tokenizer.evaluation.reconstruction_metrics
+Note: Metrics like calculate_psnr have been moved to cosmos_framework.model.tokenizer.evaluation.reconstruction_metrics
 """
 
 from __future__ import annotations
 
-import re
 from functools import lru_cache
 from typing import Any, Literal
 
@@ -331,9 +330,6 @@ def sparse_to_batched_tensor(
     channels: int = 3,
 ) -> torch.Tensor | None:
     """Convert a uniform SparseTensor batch to a dense `[B, T, C, H, W]` tensor."""
-    if re.search(r"3D", task_type):
-        return None
-
     Pt, Ph, Pw = patch_size
     full_patch_size = [Pt, Ph, Pw, channels]
     variable_factor = np.prod(full_patch_size) // sparse_tensor.feats.shape[1]
@@ -386,16 +382,12 @@ def sparse_to_img_list(
         sparse_tensor: Input SparseTensor to convert.
         patch_size: Tuple of (Pt, Ph, Pw) patch dimensions.
         var_patch_axis: Axis with variable patch size.
-        task_type: Type of task (e.g., "image", "video", "3D").
+        task_type: Type of task (e.g., "image", "video").
         channels: Number of channels.
 
     Returns:
         List of image tensors.
     """
-    # Check if this is a 3D batch (coords with different z index)
-    if re.search(r"3D", task_type):
-        return [sparse_tensor]
-
     Pt, Ph, Pw = patch_size
 
     # Select patch_size based on input
@@ -561,7 +553,7 @@ def resize_and_crop(
 # =============================================================================
 # Logging
 # =============================================================================
-# Note: calculate_psnr has been moved to projects.cosmos3.tokenizer.evaluation.reconstruction_metrics
+# Note: calculate_psnr has been moved to cosmos_framework.model.tokenizer.evaluation.reconstruction_metrics
 # =============================================================================
 
 
diff --git a/cosmos_framework/model/tokenizer/utils/hf.py b/cosmos_framework/model/tokenizer/utils/hf.py
index 51cd1f1..37293c5 100644
--- a/cosmos_framework/model/tokenizer/utils/hf.py
+++ b/cosmos_framework/model/tokenizer/utils/hf.py
@@ -1,7 +1,5 @@
-# -----------------------------------------------------------------------------
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# -----------------------------------------------------------------------------
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
 
 """Helpers for consistent HuggingFace cache-only loading in tokenizer jobs."""
 
diff --git a/cosmos_framework/model/vfm/algorithm/loss/__init__.py b/cosmos_framework/model/vfm/algorithm/loss/__init__.py
index a688c8a..cdad794 100644
--- a/cosmos_framework/model/vfm/algorithm/loss/__init__.py
+++ b/cosmos_framework/model/vfm/algorithm/loss/__init__.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """Loss functions used by VFM (rectified flow) and VLM (next-token CE) training paths."""
 
 __all__: list[str] = []
diff --git a/cosmos_framework/model/vfm/algorithm/loss/cross_entropy.py b/cosmos_framework/model/vfm/algorithm/loss/cross_entropy.py
index dde176f..dea0bf7 100644
--- a/cosmos_framework/model/vfm/algorithm/loss/cross_entropy.py
+++ b/cosmos_framework/model/vfm/algorithm/loss/cross_entropy.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """CE loss for VLM training.
 
 Ported from cosmos_rl.policy.trainer.llm_trainer.sft_trainer.async_safe_ce
diff --git a/cosmos_framework/model/vfm/algorithm/loss/flow_matching.py b/cosmos_framework/model/vfm/algorithm/loss/flow_matching.py
index 7e4b431..b10bfd9 100644
--- a/cosmos_framework/model/vfm/algorithm/loss/flow_matching.py
+++ b/cosmos_framework/model/vfm/algorithm/loss/flow_matching.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """Rectified-flow matching loss (vision / action / sound modalities).
 
 Extracted from OmniMoTModel._compute_flow_matching_loss. The loss math is
diff --git a/cosmos_framework/model/vfm/diffusion/samplers/__init__.py b/cosmos_framework/model/vfm/diffusion/samplers/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/diffusion/samplers/__init__.py
+++ b/cosmos_framework/model/vfm/diffusion/samplers/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/diffusion/samplers/edm.py b/cosmos_framework/model/vfm/diffusion/samplers/edm.py
index ed5e156..17cd296 100644
--- a/cosmos_framework/model/vfm/diffusion/samplers/edm.py
+++ b/cosmos_framework/model/vfm/diffusion/samplers/edm.py
@@ -199,7 +199,7 @@ def fori_loop(lower: int, upper: int, body_fun: Callable[[int, Any], Any], init_
     """
     val = init_val
     for i in range(lower, upper):
-        # Periodic log during sampling so long-running jobs keep producing output.
+        # Add a log during sampling to meet the APS job health requirement of one log every 2 minutes.
         if i % 10 == 0:
             log.info(f"fori_loop: {i}")
         val = body_fun(i, val)
diff --git a/cosmos_framework/model/vfm/diffusion/samplers/fm_solvers_unipc.py b/cosmos_framework/model/vfm/diffusion/samplers/fm_solvers_unipc.py
index 2c87fea..5fefbc4 100644
--- a/cosmos_framework/model/vfm/diffusion/samplers/fm_solvers_unipc.py
+++ b/cosmos_framework/model/vfm/diffusion/samplers/fm_solvers_unipc.py
@@ -1,10 +1,5 @@
-# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
-# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-#
-# Copied from https://github.com/huggingface/diffusers/blob/v0.31.0/src/diffusers/schedulers/scheduling_unipc_multistep.py
-# Converted UniPC for flow matching.
 
 import math
 from typing import List, Optional, Tuple, Union
diff --git a/cosmos_framework/model/vfm/hf_model.py b/cosmos_framework/model/vfm/hf_model.py
index 8b8efc0..a39bbe4 100644
--- a/cosmos_framework/model/vfm/hf_model.py
+++ b/cosmos_framework/model/vfm/hf_model.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """Minimal HFModel for the vfm/ unified VLM training path.
 
 Responsibilities:
@@ -259,8 +260,8 @@ def load_weights(
                 ``org/model`` repo IDs fall back to Hugging Face.
             credential_path: S3 credential file, or None for local/HF.
             parallel_dims: ``ParallelDims`` instance (from
-                ``projects.cosmos3.vfm.utils.parallelism``).  The loader uses
-                it via :func:`~projects.cosmos3.vfm.models.utils.safetensors_loader._get_dp_shard_mesh`
+                ``cosmos_framework.utils.vfm.parallelism``).  The loader uses
+                it via :func:`~cosmos_framework.model.vfm.utils.safetensors_loader._get_dp_shard_mesh`
                 to obtain the 1-D ``dp_shard`` sub-mesh (or None when
                 ``dp_shard <= 1``) for striping checkpoint reads across
                 FSDP shard ranks.  When non-None, the caller MUST have
@@ -317,7 +318,7 @@ def load_weights(
             "raw_video",
             # image_sizes is collected by collate_fn but is NOT a Qwen3-VL forward arg
             # (Qwen3-VL uses image_grid_thw instead). Strip it so strict HF signatures
-            # don't reject it.
+            # don't reject it. NOTE: image_sizes IS valid for LLaVA-style models — if
             # a future Phase extends to those, remove this entry.
             "image_sizes",
         }
diff --git a/cosmos_framework/model/vfm/mot/__init__.py b/cosmos_framework/model/vfm/mot/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/mot/__init__.py
+++ b/cosmos_framework/model/vfm/mot/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/mot/attention.py b/cosmos_framework/model/vfm/mot/attention.py
index 002ca62..82d1b00 100644
--- a/cosmos_framework/model/vfm/mot/attention.py
+++ b/cosmos_framework/model/vfm/mot/attention.py
@@ -100,7 +100,7 @@ def two_way_attention(
 
     use_dont_care_mask = causal_q_offsets is causal_k_offsets
 
-
+    # NOTE: cosmos_framework attention is BSHD in, BSHD out
     causal_res = attention(
         causal_q.unsqueeze(0),  # [1,N_und,heads,head_dim]
         causal_k.unsqueeze(0),  # [1,N_und,heads,head_dim]
@@ -178,7 +178,7 @@ def three_way_attention(
 
     use_dont_care_mask = causal_q_offsets is causal_k_offsets
 
-
+    # NOTE: cosmos_framework attention is BSHD in, BSHD out
     causal_res = attention(
         causal_q.unsqueeze(0),  # [1,N_und,heads,head_dim]
         causal_k.unsqueeze(0),  # [1,N_und,heads,head_dim]
@@ -338,7 +338,7 @@ def build_packed_sequence(
     is_image_batch: bool = False,
     cp_world_size: int = 1,
     video_temporal_causal: bool = False,
-    use_rolling_kv_cache: bool = False,
+    skip_natten_metadata: bool = False,
     vision_token_shapes: list[tuple[int, int, int]] | None = None,
     action_token_shapes: list[tuple[int, ...]] | None = None,
     num_action_tokens_per_supertoken: int = 0,
@@ -386,9 +386,9 @@ def build_packed_sequence(
             null_action_supertokens=null_action_supertokens,
         )
         make_pack = factored_from_joint_sequence
-        # The rolling KV-cache path implements temporal causality in
-        # three_way_attention_with_kv_cache; skip NATTEN metadata.
-        if not use_rolling_kv_cache:
+        # Some memory-driven attention paths implement temporal visibility in
+        # their own attention kernels; skip NATTEN metadata for those paths.
+        if not skip_natten_metadata:
             # Temporal causal: encode (T, S) supertoken layout; spatial NATTEN: encode (H, W) layout.
             if video_temporal_causal:
                 natten_metadata_list = generate_temporal_causal_natten_metadata(
diff --git a/cosmos_framework/model/vfm/mot/attention_test.py b/cosmos_framework/model/vfm/mot/attention_test.py
index 66b58d5..56c2dee 100644
--- a/cosmos_framework/model/vfm/mot/attention_test.py
+++ b/cosmos_framework/model/vfm/mot/attention_test.py
@@ -244,7 +244,7 @@ def forward(self, *args, **kwargs):
                 kwargs["sdpa_func"] = self.sdpa_func
             return self.attention_func(*args, **kwargs)
 
-
+    # NOTE: we should try and maintain only one copy of QKV offsets if they're identical
     # between queries and key/values, since this enables the "don't care" mask, which enables
     # more attention backends in I4 attention.
     if query_factored_1["_causal_seq_offsets"].equal(key_factored_1["_causal_seq_offsets"]) and query_factored_1[
diff --git a/cosmos_framework/model/vfm/mot/cfgp_ar_test.py b/cosmos_framework/model/vfm/mot/cfgp_ar_test.py
index 6ac3238..b0fd2a7 100644
--- a/cosmos_framework/model/vfm/mot/cfgp_ar_test.py
+++ b/cosmos_framework/model/vfm/mot/cfgp_ar_test.py
@@ -1,7 +1,5 @@
-# -----------------------------------------------------------------------------
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# -----------------------------------------------------------------------------
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
 
 """Multi-rank tests for CFGP (CFG Parallelism) in AR inference.
 
diff --git a/cosmos_framework/model/vfm/mot/context_parallel_utils.py b/cosmos_framework/model/vfm/mot/context_parallel_utils.py
index 4bb49e0..96bf607 100644
--- a/cosmos_framework/model/vfm/mot/context_parallel_utils.py
+++ b/cosmos_framework/model/vfm/mot/context_parallel_utils.py
@@ -346,7 +346,7 @@ def context_parallel_attention(
         f"Local query heads ({q_heads_per_rank}) must be divisible by local KV heads ({kv_heads_per_rank})"
     )
 
-
+    # NOTE: q_und_seq, k_und_seq, and v_und_seq may have length 0
     # when doing AR-inference with a KV-cache.
 
     if kv_head_repeats > 1:
diff --git a/cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py b/cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py
index 03f0c3f..909a1d0 100644
--- a/cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py
+++ b/cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py
@@ -121,6 +121,7 @@ def __init__(self, language_model, config: Cosmos3VFMNetworkConfig):
         text_config = config.vlm_config.text_config if hasattr(config.vlm_config, "text_config") else config.vlm_config
         self.hidden_size = text_config.hidden_size
         self.num_heads = text_config.num_attention_heads
+        self.num_kv_heads = text_config.num_key_value_heads
         self.head_dim = text_config.head_dim
         self.num_hidden_layers = text_config.num_hidden_layers
         self.predict_text_tokens = config.predict_text_tokens
@@ -882,7 +883,7 @@ def _encode_sound(
         packed_tokens_sound = packed_tokens_sound.to(target_dtype)  # [total_sound_tokens,sound_dim]
 
         # Project sound tokens + modality embedding
-
+        # NOTE: Sound position info comes from m-RoPE position IDs in the attention layers.
         # No additive position embedding is used (unlike legacy video which keeps one for backward compat).
         packed_tokens_sound = (
             self.sound2llm(packed_tokens_sound) + self.sound_modality_embed
@@ -1054,7 +1055,7 @@ def forward(
             natten_parameter_list=self.natten_parameter_list,
             cp_world_size=self.parallel_dims.cp_size if self.parallel_dims else 1,
             video_temporal_causal=self.video_temporal_causal,
-            use_rolling_kv_cache=memory is not None and memory.uses_rolling_kv_cache,
+            skip_natten_metadata=memory is not None and not memory.requires_natten_metadata(),
             vision_token_shapes=vision_token_shapes,
             action_token_shapes=packed_seq.action.token_shapes if packed_seq.action else None,
             num_action_tokens_per_supertoken=num_action_tokens_per_supertoken,
diff --git a/cosmos_framework/model/vfm/mot/dot_product_attention.py b/cosmos_framework/model/vfm/mot/dot_product_attention.py
index 8a6f46e..687a169 100644
--- a/cosmos_framework/model/vfm/mot/dot_product_attention.py
+++ b/cosmos_framework/model/vfm/mot/dot_product_attention.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
 """
@@ -117,7 +117,6 @@ def cudnn_fused_attn(
     o_quantizer = None
     rng_gen = None
 
-
     # "thd_thd_thd" format requires contiguous tensors.
     # We should benchmark thd_th2d / th3d formats as well.
     q = q.contiguous()
@@ -176,7 +175,7 @@ def cudnn_fused_attn(
         # is_cuda_graph
         args += (False,)
 
-
+    # NOTE: The reason we do this instead of just calling DotProductAttention.forward is that
     # I'd have to create DotProductAttention class and somehow pass it in here, but argument types for these torch.ops are very strict.
     # Moreover, back-propagation would still need additional tweaks to work properly.
     output_tensors = tex.fused_attn_fwd(*args)
@@ -207,7 +206,7 @@ def _get_max_tokens(num_tokens: int) -> int:
     return max_t
 
 
-
+# NOTE: we need register_fake in order to make this operator fully torch.compile compatible.
 # The goal for this function is to return fake tensors of the correct shape and dtype
 # without having to run the actual operator.
 
diff --git a/cosmos_framework/model/vfm/mot/modeling_utils.py b/cosmos_framework/model/vfm/mot/modeling_utils.py
index d774fa6..418daec 100644
--- a/cosmos_framework/model/vfm/mot/modeling_utils.py
+++ b/cosmos_framework/model/vfm/mot/modeling_utils.py
@@ -162,7 +162,6 @@ def __init__(
         dim_w = dim_h
         dim_t = dim - 2 * dim_h
         assert dim == dim_h + dim_w + dim_t, f"bad dim: {dim} != {dim_h} + {dim_w} + {dim_t}"
-
         self.register_buffer(
             "dim_spatial_range",
             torch.arange(0, dim_h, 2)[: (dim_h // 2)].float() / dim_h,
diff --git a/cosmos_framework/model/vfm/mot/unified_3dmrope_utils.py b/cosmos_framework/model/vfm/mot/unified_3dmrope_utils.py
index 9da5578..ce0c334 100644
--- a/cosmos_framework/model/vfm/mot/unified_3dmrope_utils.py
+++ b/cosmos_framework/model/vfm/mot/unified_3dmrope_utils.py
@@ -83,6 +83,8 @@ def get_3d_mrope_ids_vae_tokens(
     temporal_compression_factor: int = 4,
     base_temporal_compression_factor: int | None = None,
     start_frame_offset: int = 0,
+    temporal_positions: torch.Tensor | None = None,
+    actual_temporal_compression_factor: int | None = None,
 ) -> tuple[torch.Tensor, int | float]:
     """Generate 3D mRoPE position IDs for VAE vision tokens (image/video latents).
 
@@ -111,6 +113,11 @@ def get_3d_mrope_ids_vae_tokens(
             defaults to ``temporal_compression_factor`` (typical case where base matches actual).
         start_frame_offset: Offset added to frame indices before FPS scaling.
             Use 1 for action embeddings so they start at frame 1 instead of 0.
+        temporal_positions: Optional explicit temporal coordinates for each latent
+            frame, in source-frame / actual-temporal-compression-factor units.
+            When provided, positions can be fractional and must have shape ``(grid_t,)``.
+        actual_temporal_compression_factor: Temporal compression factor that defines
+            ``temporal_positions``. Defaults to ``temporal_compression_factor``.
 
     Returns:
         Tuple of:
@@ -124,6 +131,7 @@ def get_3d_mrope_ids_vae_tokens(
     # Enabled whenever fps is provided, including grid_t=1 (per-frame AR packs).
     # Callers that want integer positions (e.g. images) pass fps=None.
     fps_modulation_enabled = fps is not None
+    explicit_temporal_positions = temporal_positions is not None
 
     # Default base_temporal_compression_factor to temporal_compression_factor if not specified
     effective_base_tcf = (
@@ -131,8 +139,33 @@ def get_3d_mrope_ids_vae_tokens(
         if base_temporal_compression_factor is not None
         else temporal_compression_factor
     )
+    effective_actual_tcf = (
+        actual_temporal_compression_factor
+        if actual_temporal_compression_factor is not None
+        else temporal_compression_factor
+    )
 
-    if fps_modulation_enabled:
+    if explicit_temporal_positions:
+        assert temporal_positions is not None
+        if temporal_positions.ndim != 1 or temporal_positions.shape[0] != grid_t:
+            raise ValueError(
+                f"temporal_positions must have shape (grid_t,), got {tuple(temporal_positions.shape)} for {grid_t=}."
+            )
+        # Explicit coordinates are in latent-time units. Convert nonzero start-frame
+        # offsets from source-frame units into the same coordinate space.
+        frame_indices = temporal_positions.to(dtype=torch.float32)  # [grid_t]
+        if start_frame_offset != 0:
+            frame_indices = frame_indices + start_frame_offset / effective_actual_tcf  # [grid_t]
+
+        if fps_modulation_enabled:
+            scaled_t = (
+                frame_indices * effective_actual_tcf * (base_fps / effective_base_tcf) / fps + temporal_offset
+            )  # [grid_t]
+        else:
+            scaled_t = frame_indices + temporal_offset  # [grid_t]
+
+        t_index = scaled_t.view(-1, 1).expand(-1, grid_h * grid_w).flatten()  # [grid_t*grid_h*grid_w]
+    elif fps_modulation_enabled:
         # FPS modulation: scale temporal indices to reflect real time
         # tps = tokens per second (fps divided by temporal compression)
         # base_tps = base tokens per second
@@ -147,7 +180,6 @@ def get_3d_mrope_ids_vae_tokens(
 
         # Expand temporal indices for all spatial positions
         t_index = scaled_t.view(-1, 1).expand(-1, grid_h * grid_w).flatten()  # [grid_t*grid_h*grid_w]
-        t_dtype = torch.float32
     else:
         # No FPS modulation: use integer frame indices
         # Apply start_frame_offset for cross-modality alignment (e.g., action tokens start at frame 1)
@@ -158,16 +190,16 @@ def get_3d_mrope_ids_vae_tokens(
             + int(temporal_offset)
             + start_frame_offset
         )
-        t_dtype = torch.long
 
     # Height axis: for each temporal frame, cycles through h values, each repeated w times
+    device = t_index.device
     h_index = (
-        torch.arange(grid_h, dtype=torch.long).view(1, -1, 1).expand(grid_t, -1, grid_w).flatten()
+        torch.arange(grid_h, dtype=torch.long, device=device).view(1, -1, 1).expand(grid_t, -1, grid_w).flatten()
     )  # [grid_t*grid_h*grid_w]
 
     # Width axis: for each temporal frame and height, cycles through w values
     w_index = (
-        torch.arange(grid_w, dtype=torch.long).view(1, 1, -1).expand(grid_t, grid_h, -1).flatten()
+        torch.arange(grid_w, dtype=torch.long, device=device).view(1, 1, -1).expand(grid_t, grid_h, -1).flatten()
     )  # [grid_t*grid_h*grid_w]
 
     if not reset_spatial_indices:
@@ -177,9 +209,9 @@ def get_3d_mrope_ids_vae_tokens(
         w_index = w_index + spatial_offset  # [grid_t*grid_h*grid_w]
 
     # Stack into (3, T*H*W) tensor
-    # Note: When FPS modulation is enabled, temporal axis is float, spatial axes are long
-    # We convert h_index and w_index to the same dtype as t_index for stacking
-    if fps_modulation_enabled:
+    # Note: When FPS modulation or explicit temporal positions are enabled, temporal
+    # axis is float. Convert h_index and w_index to the same dtype for stacking.
+    if fps_modulation_enabled or explicit_temporal_positions:
         mrope_ids = torch.stack(
             [t_index, h_index.to(torch.float32), w_index.to(torch.float32)], dim=0
         )  # [3,grid_t*grid_h*grid_w]
diff --git a/cosmos_framework/model/vfm/mot/unified_mot.py b/cosmos_framework/model/vfm/mot/unified_mot.py
index a03e4aa..4ead628 100644
--- a/cosmos_framework/model/vfm/mot/unified_mot.py
+++ b/cosmos_framework/model/vfm/mot/unified_mot.py
@@ -189,7 +189,7 @@ class _MoTConfigBase(object):
       ``text_config`` access picks it up.
 
     Post-construction overrides via plain ``setattr`` (the
-    ``create_vlm_config`` flow in ``cosmos_framework/configs/base/defaults/vlm.py``)
+    ``create_vlm_config`` flow in ``configs/base/defaults/vlm.py``)
     just update the same plain attributes, so the next property access
     picks up the latest values.  No cache, no ``__setattr__``
     interception, no override bucket — the property rebuild is cheap
@@ -229,6 +229,7 @@ def __init__(
         qk_norm_for_text: bool = True,
         qk_norm_for_diffusion: bool = True,
         include_visual: bool = False,
+        gen_noisy_gating: bool = False,
         text_config_overrides: Mapping[str, Any] | None = None,
     ):
         # Defensive copy so downstream materialization can't mutate the
@@ -237,6 +238,9 @@ def __init__(
         self.qk_norm_for_text = qk_norm_for_text
         self.qk_norm_for_diffusion = qk_norm_for_diffusion
         self.include_visual = include_visual
+        # Noisy top-k gating on the generation-tower MoE blocks (Shazeer 2017).
+        # Gen-tower only; the understanding tower never receives this flag.
+        self.gen_noisy_gating = gen_noisy_gating
         # Plain attribute (not a property) so the ``create_vlm_config``
         # post-construction ``setattr`` flow can replace the whole
         # mapping in one shot; default to ``{}`` so the merge in
@@ -620,7 +624,7 @@ def reasoner_forward(
         in a clean AR loop.
 
         All attention compute is dispatched through
-        ``imaginaire.attention.attention`` (per repo policy) which expects the
+        ``cosmos_framework.model.attention.attention`` (per repo policy) which expects the
         heads-last contiguous layout ``[B, S, H, D]`` and natively handles GQA
         (``H_KV != H``) — no manual head expansion is needed.
 
@@ -653,7 +657,7 @@ def reasoner_forward(
         # q: [B,T,num_heads,head_dim], k: [B,T,num_kv_heads,head_dim]
 
         # The KV cache stores tensors in the same BSHD layout that
-        # ``imaginaire.attention.attention`` expects, with the seq dim at axis 1.
+        # ``cosmos_framework.model.attention.attention`` expects, with the seq dim at axis 1.
         if cache is not None:
             k_full, v_full = cache.update(layer_idx, k, v)
         else:
@@ -684,6 +688,7 @@ def _impl_init(
     layer_types: LayerTypes,
     qk_norm_for_text: bool,
     qk_norm_for_diffusion: bool,
+    gen_noisy_gating: bool = False,
 ):
     """Shared ``__init__`` body for the three MoT text-model variants.
 
@@ -705,6 +710,7 @@ def _impl_init(
                 layer_idx=layer_idx,
                 qk_norm_for_text=qk_norm_for_text,
                 qk_norm_for_diffusion=qk_norm_for_diffusion,
+                gen_noisy_gating=gen_noisy_gating,
             )
         )
 
@@ -881,6 +887,7 @@ def __init__(
         layer_types: LayerTypes,
         qk_norm_for_text: bool,
         qk_norm_for_diffusion: bool,
+        gen_noisy_gating: bool = False,
     ):
         super().__init__()
         self.hidden_size = config.hidden_size
@@ -898,7 +905,8 @@ def __init__(
             and (config.num_experts > 0 and (layer_idx + 1) % config.decoder_sparse_step == 0)
         ):
             self.mlp = Qwen3VLMoeTextSparseMoeBlock(config)
-            self.mlp_moe_gen = Qwen3VLMoeTextSparseMoeBlock(config)
+            # Noisy gating is gen-tower only.
+            self.mlp_moe_gen = Qwen3VLMoeTextSparseMoeBlock(config, noisy_gating=gen_noisy_gating)
         else:
             self.mlp = layer_types.mlp(config)
             self.mlp_moe_gen = layer_types.mlp(config)
@@ -1024,7 +1032,7 @@ def forward(
             ln_out_gen = self.post_attention_layernorm_moe_gen(residual_gen)  # [N_gen,hidden_size]
 
             # UNPAD MLP INPUT ===============
-
+            # NOTE: This is only needed for the MoE auxiliary loss computation and to avoid
             #       artificial expert inbalance due to routing padding tokens.
             gen_len = pack_attn_out["_num_full_tokens"]
             und_len = pack_attn_out["_num_causal_tokens"]
@@ -1147,6 +1155,7 @@ def __init__(
         *,
         qk_norm_for_text: bool,
         qk_norm_for_diffusion: bool,
+        gen_noisy_gating: bool = False,
     ):
         super().__init__(config)
         _impl_init(
@@ -1155,6 +1164,7 @@ def __init__(
             layer_types=LayerTypes("qwen3_vl_moe"),
             qk_norm_for_text=qk_norm_for_text,
             qk_norm_for_diffusion=qk_norm_for_diffusion,
+            gen_noisy_gating=gen_noisy_gating,
         )
 
     def init_taylorseer(self, cache_dic=None, current=None):
@@ -1225,7 +1235,7 @@ class ReasonerKVCache:
     """Per-layer KV cache for the reasoner-tower autoregressive loop.
 
     Tensors are stored in the heads-last BSHD layout that
-    ``imaginaire.attention.attention`` expects::
+    ``cosmos_framework.model.attention.attention`` expects::
 
         keys[layer_idx]:   [B, T, num_kv_heads, head_dim]
         values[layer_idx]: [B, T, num_kv_heads, head_dim]
@@ -1364,7 +1374,8 @@ def _sample_next_token(
     top_p: float | None,
     repetition_penalty: float = 1.0,
     presence_penalty: float = 0.0,
-    seen_mask: torch.Tensor | None = None,  # [B,vocab_size] bool
+    seen_mask: torch.Tensor | None = None,  # [B,vocab_size] bool — prompt ∪ output, for repetition_penalty
+    output_seen_mask: torch.Tensor | None = None,  # [B,vocab_size] bool — output only,    for presence_penalty
     generator: torch.Generator | None = None,
 ) -> torch.Tensor:  # [B]
     """Greedy / multinomial sampling with optional top-k, top-p, and presence/repetition penalties.
@@ -1375,20 +1386,21 @@ def _sample_next_token(
            ``>1.0`` discourages repetition, ``<1.0`` encourages it,
            ``1.0`` is identity.
         2. Presence penalty (OpenAI semantics) — additive shift of every
-           logit at a position seen in history.  ``>0`` discourages,
-           ``<0`` encourages, ``0`` is identity.  Applied once per token
-           regardless of how often it appeared (presence, not frequency).
+           logit at a position seen in **output** (``output_seen_mask``).  ``>0``
+           discourages, ``<0`` encourages, ``0`` is identity.
         3. ``do_sample=False`` short-circuits to argmax.  The two
            penalties above are applied *before* this branch so they
            legitimately shift the greedy argmax — they're logit
            transformations, not sampling-only tricks.
         4. ``do_sample=True``: temperature → top-k → top-p → multinomial.
 
-    ``seen_mask`` is the canonical "has this vocab id appeared in this
-    sample's history" matrix maintained by
-    :func:`_impl_generate_reasoner_text`.  Both penalties default to
-    identity, and the fast path (both off) skips all penalty work and
-    leaves the existing greedy/sampling logic bit-identical.
+    Mask semantics (match vLLM):
+      * ``seen_mask``  is seeded with prompt tokens and updated with each
+        generated token — penalizes prompt ∪ output (HF convention).
+      * ``output_seen_mask`` is updated with each generated token only — penalizes
+        output only.
+    Both penalties default to identity; the fast path (both off) leaves the
+    existing greedy/sampling logic bit-identical.
 
     ``generator`` is the only RNG-consuming primitive in this module:
     when provided, it is threaded into ``torch.multinomial`` so the
@@ -1398,30 +1410,22 @@ def _sample_next_token(
     pre-seed behavior of consuming the device's default RNG and is
     bit-identical to the previous call signature.
     """
-    # Logit-transform stage: repetition + presence penalties.  Both gate
-    # on ``seen_mask`` being present AND a non-identity coefficient, so
-    # the default-off path costs zero extra ops.
-    if seen_mask is not None and (repetition_penalty != 1.0 or presence_penalty != 0.0):
-        if repetition_penalty != 1.0:
-            # CTRL/HF formula: divide positive logits, multiply negative
-            # logits (both by ``penalty``).  Phrasing the scale as a
-            # single ``where`` over a precomputed factor keeps the
-            # masked-update path branchless and lets autograd / inductor
-            # fuse it with the surrounding ops.
-            penalty_factor = torch.where(
-                logits > 0,
-                torch.full_like(logits, 1.0 / repetition_penalty),
-                torch.full_like(logits, repetition_penalty),
-            )
-            logits = torch.where(seen_mask, logits * penalty_factor, logits)
-        if presence_penalty != 0.0:
-            # OpenAI semantics: subtract a constant from every seen
-            # token's logit, once per token (presence, not frequency).
-            logits = torch.where(
-                seen_mask,
-                logits - presence_penalty,
-                logits,
-            )
+    if seen_mask is not None and repetition_penalty != 1.0:
+        # CTRL/HF formula: divide positive logits, multiply negative.
+        penalty_factor = torch.where(
+            logits > 0,
+            torch.full_like(logits, 1.0 / repetition_penalty),
+            torch.full_like(logits, repetition_penalty),
+        )
+        logits = torch.where(seen_mask, logits * penalty_factor, logits)
+    if output_seen_mask is not None and presence_penalty != 0.0:
+        # OpenAI semantics: subtract a constant from every seen
+        # token's logit, once per token (presence, not frequency).
+        logits = torch.where(
+            output_seen_mask,
+            logits - presence_penalty,
+            logits,
+        )
 
     if not do_sample:
         return torch.argmax(logits, dim=-1)
@@ -1590,9 +1594,11 @@ def _impl_generate_reasoner_text(
             — appearing twice costs the same as appearing once.
             Both penalties are applied *before* the ``do_sample``
             argmax/multinomial branch, so they shift the greedy
-            argmax too.  When both are at identity, the per-sample
-            ``seen_mask`` is never allocated and the loop is
-            bit-identical to the un-penalized fast path.
+            argmax too.  When both are at identity, no history mask
+            is allocated and the loop is bit-identical to the
+            un-penalized fast path. Repetition penalty uses prompt ∪
+            output; presence penalty uses output only (OpenAI / vLLM
+            convention).
         seed: Optional integer seed for the sampling RNG.  When provided
             (and ``do_sample=True``), a fresh ``torch.Generator`` is
             allocated on ``input_ids.device`` and seeded once with
@@ -1675,23 +1681,15 @@ def _impl_generate_reasoner_text(
         )  # [B,T_prompt,hidden_size]
     logits = causal_lm.lm_head(hidden[:, -1, :])  # [B,vocab_size]
 
-    # ``seen_mask`` is the per-sample "vocab id has appeared in this
-    # sample's history" matrix consumed by ``_sample_next_token``'s
-    # repetition / presence penalty paths.  Allocate only when at least
-    # one penalty is non-identity so the un-penalized fast path is
-    # bit-identical to the previous behavior (no extra alloc, no scatter,
-    # no per-step writes).  We size from ``logits.size(-1)`` so we don't
-    # have to reach into ``lm_head.weight.shape`` (which would also
-    # work under FSDP but is one extra coupling point).  The mask
-    # captures prompt tokens first so the prefill's own sampling step
-    # already sees the prompt as history — matching HF's
-    # ``RepetitionPenaltyLogitsProcessor`` convention of penalizing
-    # against the full ``input_ids``.
-    apply_penalties = repetition_penalty != 1.0 or presence_penalty != 0.0
+    # seen_mask is seeded with prompt tokens (HF convention).
+    # output_seen_mask stays empty until output tokens accumulate (OpenAI convention).
     seen_mask: torch.Tensor | None = None
-    if apply_penalties:
+    output_seen_mask: torch.Tensor | None = None
+    if repetition_penalty != 1.0:
         seen_mask = torch.zeros(B, logits.size(-1), dtype=torch.bool, device=device)
         seen_mask.scatter_(1, input_ids, True)
+    if presence_penalty != 0.0:
+        output_seen_mask = torch.zeros(B, logits.size(-1), dtype=torch.bool, device=device)
 
     # Build a device-local ``torch.Generator`` only when an explicit
     # seed is supplied.  ``torch.multinomial(generator=None)`` falls
@@ -1717,13 +1715,14 @@ def _impl_generate_reasoner_text(
         repetition_penalty=repetition_penalty,
         presence_penalty=presence_penalty,
         seen_mask=seen_mask,
+        output_seen_mask=output_seen_mask,
         generator=generator,
     )  # [B]
+    # Fold the just-sampled token into both penalty histories.
     if seen_mask is not None:
-        # Fold the just-sampled token into each sample's history so the
-        # next decode step penalizes it too.  Per-sample row writes are
-        # idempotent — writing True over True is a no-op.
         seen_mask.scatter_(1, next_token.unsqueeze(1), True)
+    if output_seen_mask is not None:
+        output_seen_mask.scatter_(1, next_token.unsqueeze(1), True)
 
     # Hoist invariants used by every decode step out of the loop body so we
     # don't pay per-iter Python and allocator overhead for what is in fact
@@ -1804,19 +1803,19 @@ def _impl_generate_reasoner_text(
             repetition_penalty=repetition_penalty,
             presence_penalty=presence_penalty,
             seen_mask=seen_mask,
+            output_seen_mask=output_seen_mask,
             generator=generator,
         )  # [B]
         # Force pad on already-finished samples; finished stays True afterwards.
         # ``pad_tensor`` is hoisted above so we avoid the per-step
         # ``torch.full_like(next_token, pad_token_id)`` allocation.
         next_token = torch.where(finished, pad_tensor, next_token)
+        # Record (post-pad) emitted token in both penalty histories. Finished
+        # samples write pad_token_id, which is dead state and harmless.
         if seen_mask is not None:
-            # Record the (post-pad) emitted token in history.  For
-            # still-running samples this is the actual sampled token;
-            # for already-finished samples it's ``pad_token_id``, which
-            # is harmless because finished samples don't sample anymore
-            # — their row of ``seen_mask`` is dead state from here on.
             seen_mask.scatter_(1, next_token.unsqueeze(1), True)
+        if output_seen_mask is not None:
+            output_seen_mask.scatter_(1, next_token.unsqueeze(1), True)
         if eos_tensor is not None:
             # Vectorized EOS comparison: broadcast ``next_token`` (``[B,1]``)
             # against ``eos_tensor`` (``[E]``) and reduce-any across the
@@ -2000,6 +1999,7 @@ def __init__(self, config: Qwen3VLMoeMoTConfig):
             text_config,
             qk_norm_for_text=config.qk_norm_for_text,
             qk_norm_for_diffusion=config.qk_norm_for_diffusion,
+            gen_noisy_gating=config.gen_noisy_gating,
         )
         self.vocab_size = text_config.vocab_size
         self.lm_head = nn.Linear(text_config.hidden_size, text_config.vocab_size, bias=False)
@@ -2027,6 +2027,10 @@ def init_moe(self) -> None:
             original_name = name.replace("_moe_gen", "").replace("_checkpoint_wrapped_module.", "")
             if original_name in state_dict:
                 param.data.copy_(state_dict[original_name].data)
+            elif "gate_noise" in original_name:
+                # Noisy-gating projection is gen-tower only (the und tower has no
+                # gate_noise counterpart), so keep its zero-init rather than copy.
+                pass
             else:
                 raise ValueError(f"Could not find {original_name} in state_dict for initialization of {name}")
 
@@ -2197,5 +2201,4 @@ def generate_reasoner_text(
         seed: int | None = None,
         return_only_new_tokens: bool = False,
     ) -> torch.Tensor:
-
         raise NotImplementedError("This method is not implemented for Nemotron 3 Dense VL.")
diff --git a/cosmos_framework/model/vfm/omni_mot_model.py b/cosmos_framework/model/vfm/omni_mot_model.py
index 9f71a00..652b930 100644
--- a/cosmos_framework/model/vfm/omni_mot_model.py
+++ b/cosmos_framework/model/vfm/omni_mot_model.py
@@ -4,6 +4,7 @@
 from __future__ import annotations
 
 import collections
+import json
 import time
 from contextlib import contextmanager
 from typing import Any, Callable, Dict, Mapping, Optional, Tuple
@@ -25,6 +26,7 @@
 from cosmos_framework.model.vfm.algorithm.loss.flow_matching import compute_flow_matching_loss
 from cosmos_framework.model.vfm.algorithm.loss.load_balancing import compute_load_balancing_loss
 from cosmos_framework.configs.base.defaults.model_config import OmniMoTModelConfig
+from cosmos_framework.data.vfm.action.action_processing import ActionProcessor, get_action_processing_records
 from cosmos_framework.data.vfm.sequence_packing import (
     PackedSequence,
     SequencePlan,
@@ -69,6 +71,7 @@ def __init__(self, config: OmniMoTModelConfig):
         super().__init__()
         self.config = config
         log.info(f"OmniMoTModel: config {self.config}")
+
         # 0. Set up precision
         self.set_precision()
 
@@ -99,7 +102,6 @@ def set_precision(self) -> None:
         torch.backends.cudnn.allow_tf32 = torch.backends.cuda.matmul.allow_tf32 = False
 
     def set_up_data_key(self) -> None:
-
         self.input_video_key = self.config.input_video_key  # by default it is video key for Video diffusion model
         self.input_image_key = self.config.input_image_key
         self.input_caption_key = self.config.input_caption_key
@@ -143,7 +145,6 @@ def set_up_tokenizers(self) -> None:
         vlm_tokenizer, special_tokens = add_special_tokens(vlm_tokenizer)
         self.vlm_tokenizer = vlm_tokenizer
 
-
         self.llm_special_tokens = special_tokens
         self.llm_special_tokens["eos_token_id"] = vlm_tokenizer.eos_token_id
 
@@ -170,7 +171,6 @@ def set_up_tokenizers(self) -> None:
             self.tokenizer_sound_gen = None
 
 
-
     def build_net(self, dtype: torch.dtype):
         # Build model network and parallelize it.
         with torch.device("meta"):
@@ -178,7 +178,7 @@ def build_net(self, dtype: torch.dtype):
 
             language_model = lazy_instantiate(self.vlm_config.model_instance)
 
-
+            # NOTE: We pass "RF timesteps" to the network in the same scale as the scheduler
             # (i.e., roughly [0, num_train_timesteps]). The MoT network expects to internally
             # rescale timesteps before embedding; avoid hard-coding 1e-3 by computing it from
             # the configured scheduler resolution.
@@ -258,28 +258,64 @@ def load_pretrained_model_if_needed(
         has_resumable_checkpoint: bool,
         has_load_path: bool,
     ) -> None:
-        """Load HF understanding-pathway weights, gated by runtime checkpoint state.
+        """Conditionally seed pretrained understanding/reasoner weights at startup.
+
+        OmniMoT has two weight groups: the understanding/reasoner pathway (the
+        ``language_model`` backbone, e.g. Qwen3-VL / Cosmos-Reason) and the
+        generation pathway (the diffusion MoE experts). This hook runs after the
+        model is built and after DCP has had a chance to restore a checkpoint. It
+        decides (a) whether the understanding weights still need to be seeded from
+        the pretrained HuggingFace source, and (b) whether those weights must be
+        copied into the generation pathway.
 
         Args:
-            has_resumable_checkpoint: A latest_checkpoint.txt exists in the load
-                directory; DCP has already populated the full model. Skip HF load entirely.
-            has_load_path: ``checkpoint.load_path`` is set; DCP has loaded the full
-                model from the warm-start path. Reload HF understanding-pathway weights
-                (e.g. swap Qwen3-VL → Cosmos-Reason) but skip the understanding→generation
-                copy since the generation pathway is already populated by load_path.
-
-        Three cases the gates produce:
-          1. Fresh init (neither): full HF load + understanding→generation copy.
-          2. Warm-start (load_path only): reload understanding, skip the copy.
-          3. Resume (resumable checkpoint): skip everything.
+            has_resumable_checkpoint: A ``latest_checkpoint.txt`` exists in the
+                load directory, i.e. DCP has already restored the full model from a
+                mid-run checkpoint. The understanding weights are normally present
+                in such a checkpoint, so the HF load is skipped -- unless
+                ``exclude_reasoner_weights_from_checkpoint`` is set, in which case
+                those weights were never checkpointed and must be re-seeded here.
+            has_load_path: ``checkpoint.load_path`` is set, i.e. DCP has loaded the
+                full model from a warm-start path. The understanding weights are
+                still re-seeded from HF (e.g. to swap Qwen3-VL -> Cosmos-Reason),
+                but the understanding->generation copy is skipped because the
+                generation pathway was already populated from ``load_path``.
+
+        The gates combine into three startup scenarios:
+          1. Fresh init (neither gate set): seed understanding weights from HF and
+             copy them into the generation pathway.
+          2. Warm-start (``has_load_path`` only): re-seed understanding weights,
+             skip the understanding->generation copy.
+          3. Resume (``has_resumable_checkpoint`` set): skip everything, unless
+             ``exclude_reasoner_weights_from_checkpoint`` forces re-seeding the
+             understanding weights (the copy is still skipped).
         """
+        # A checkpoint of any kind (mid-run resume or warm-start load_path) means
+        # the generation pathway is already populated, so the understanding->
+        # generation copy further below must be skipped.
+        has_checkpoint = has_resumable_checkpoint or has_load_path
+
         pretrained_weights = self.vlm_config.pretrained_weights
-        if not pretrained_weights.enabled:
-            return
-        if has_resumable_checkpoint:
-            log.info("Resumable checkpoint exists; skipping HF understanding-pathway load.")
+
+        if self.config.exclude_reasoner_weights_from_checkpoint and not pretrained_weights.enabled:
+            raise ValueError(
+                "Reasoner weights must be loaded from pretrained checkpoint when "
+                "exclude_reasoner_weights_from_checkpoint is True. However, "
+                "pretrained_weights.enabled is set to False."
+            )
+
+        # Seed understanding weights from HF only when the source is enabled and
+        # either there is no resumable checkpoint to restore them from, or they
+        # were deliberately excluded from the checkpoint (so it cannot contain
+        # them and they must be reloaded from the pretrained source).
+        load_pretrained_weights = pretrained_weights.enabled and (
+            self.config.exclude_reasoner_weights_from_checkpoint or not has_resumable_checkpoint
+        )
+        if not load_pretrained_weights:
             return
 
+        # Load the language_model (understanding/reasoner backbone) safetensors
+        # into the given net, respecting the active parallelism layout.
         def _load_language_model(net: torch.nn.Module):
             load_language_model_safetensors(
                 model=net.language_model,
@@ -289,18 +325,25 @@ def _load_language_model(net: torch.nn.Module):
                 checkpoint_format=pretrained_weights.checkpoint_format,
             )
 
-        log.info(f"Loading understanding pathway weights from {pretrained_weights.backbone_path}")
+        log.info(f"Loading reasoner pathway weights from {pretrained_weights.backbone_path}")
         _load_language_model(self.net)
+        # Keep the EMA copy in sync with the freshly seeded understanding weights.
         if self.config.ema.enabled:
             _load_language_model(self.net_ema)
-        log.info("Successfully loaded understanding pathway weights.")
+        log.info("Successfully loaded reasoner pathway weights.")
 
-        if not self.config.diffusion_expert_config.load_weights_from_pretrained:
-            return
-        if has_load_path:
-            log.info("Warm-start load_path active; skipping understanding→generation copy.")
+        # Copy understanding -> generation only on a truly fresh init: the config
+        # must request it and no checkpoint (resume or warm-start) may have already
+        # populated the generation pathway.
+        load_pretrained_diffusion_weights = (
+            self.config.diffusion_expert_config.load_weights_from_pretrained and not has_checkpoint
+        )
+        if not load_pretrained_diffusion_weights:
+            log.info("Skipping diffusion pathway weights copying.")
             return
 
+        # init_moe() copies the understanding-pathway weights into the generation
+        # (diffusion MoE) experts so generation starts from the pretrained backbone.
         log.info("Copying understanding pathway weights to generation pathway.")
         self.net.language_model.init_moe()
         if self.config.ema.enabled:
@@ -321,7 +364,6 @@ def set_up_model(self):
 
                 self.net_ema_worker = DTensorFastEmaModelUpdater()
 
-
                 s = config.ema.rate
                 self.ema_exp_coefficient = np.roots([1, 7, 16 - s**-2, 12 - s**-2]).real.max()
 
@@ -500,7 +542,7 @@ def _pack_input_sequence(
     ) -> PackedSequence:
         """Wrap ``pack_input_sequence`` with all config-derived args pre-filled.
 
-        Centralises the 9 config-derived positional/embedding args so callers only
+        Centralises the config-derived positional/embedding args so callers only
         supply the four per-call arguments (sequence_plans, text tokens, data, timesteps)
         plus three optional flags.
         """
@@ -519,12 +561,66 @@ def _pack_input_sequence(
             unified_3d_mrope_temporal_modality_margin=self.config.diffusion_expert_config.unified_3d_mrope_temporal_modality_margin,
             enable_fps_modulation=self.config.diffusion_expert_config.enable_fps_modulation,
             base_fps=float(self.config.diffusion_expert_config.base_fps),
+            sound_base_temporal_compression_factor=self.config.diffusion_expert_config.sound_base_temporal_compression_factor,
             temporal_compression_factor=self.tokenizer_vision_gen.temporal_compression_factor,
+            vision_temporal_position_mode=self.config.diffusion_expert_config.vision_temporal_position_mode,
             video_temporal_causal=self.config.video_temporal_causal,
             action_dim=self.config.max_action_dim,
             initial_mrope_temporal_offset=initial_mrope_temporal_offset,
         )
 
+    def _get_temporal_positions_vision(
+        self,
+        raw_state_vision: list[torch.Tensor],
+        x0_tokens_vision: list[torch.Tensor],
+    ) -> list[torch.Tensor] | None:
+        """Return per-item latent temporal positions for vision tokens, or None in 'latent_index' mode."""
+        mode = self.config.diffusion_expert_config.vision_temporal_position_mode
+        if mode == "latent_index":
+            return None
+        if mode != "uniae_source_right_edge":
+            raise ValueError(
+                "Unsupported vision_temporal_position_mode: "
+                f"{mode}. Expected 'latent_index' or 'uniae_source_right_edge'."
+            )
+
+        assert self.tokenizer_vision_gen is not None
+        temporal_positions_vision: list[torch.Tensor] = []
+        for raw_state_vision_i, x0_tokens_vision_i in zip(raw_state_vision, x0_tokens_vision, strict=True):
+            if raw_state_vision_i.dim() == 5:
+                num_pixel_frames = int(raw_state_vision_i.shape[2])
+            elif raw_state_vision_i.dim() == 4:
+                num_pixel_frames = int(raw_state_vision_i.shape[1])
+            else:
+                raise ValueError(
+                    "raw_state_vision items must have shape [B,C,T,H,W] or [C,T,H,W], "
+                    f"got shape {tuple(raw_state_vision_i.shape)}."
+                )
+            num_latent_frames = int(x0_tokens_vision_i.shape[2])
+            frame_h = int(raw_state_vision_i.shape[-2])
+            frame_w = int(raw_state_vision_i.shape[-1])
+            resolution = get_vision_data_resolution((frame_h, frame_w))
+            temporal_positions = self.tokenizer_vision_gen.get_latent_temporal_positions(
+                num_pixel_frames=num_pixel_frames,
+                resolution=resolution,
+                num_latent_frames=num_latent_frames,
+            )  # [T_latent]
+            if temporal_positions is None:
+                raise ValueError(
+                    f"{type(self.tokenizer_vision_gen).__name__} does not support vision_temporal_position_mode={mode}."
+                )
+            if temporal_positions.shape[0] != num_latent_frames:
+                raise ValueError(
+                    "Vision temporal position count must match latent frames: "
+                    f"got {temporal_positions.shape[0]} positions for {num_latent_frames} latent frames."
+                )
+            temporal_positions = temporal_positions.to(
+                device=x0_tokens_vision_i.device,
+                dtype=torch.float32,
+            )  # [T_latent]
+            temporal_positions_vision.append(temporal_positions)
+        return temporal_positions_vision
+
     # ------------------------ training ------------------------
 
     def memory_init_training(
@@ -1121,7 +1217,6 @@ def _get_train_noise_level_vision(
             (timesteps, sigmas): Both [B,1] for TF/base, or [B,T_max] for diffusion_forcing.
         """
 
-
         rectified_flow = self.rectified_flow_image if is_image_batch else self.rectified_flow_video
 
         assert not self.config.rectified_flow_training_config.use_discrete_rf, (
@@ -1860,6 +1955,7 @@ def _get_velocity(
             raw_state_vision=gen_data_clean.raw_state_vision,
             x0_tokens_vision=noise_x_vision,
             fps_vision=gen_data_clean.fps_vision,
+            temporal_positions_vision=gen_data_clean.temporal_positions_vision,
             # Action fields
             raw_state_action=gen_data_clean.raw_state_action if has_action else None,
             x0_tokens_action=noise_x_action if has_action else None,
@@ -2105,76 +2201,6 @@ def _run_classifier_free_guidance(
         else:
             return other_v_list, v_list
 
-    def _build_no_control_inference_state(
-        self,
-        sequence_plans: list[SequencePlan],
-        gen_data_clean: GenerationDataClean,
-    ) -> tuple[list[SequencePlan], GenerationDataClean, list[int]] | None:
-        """Build inference state without control-map vision (for control-CFG).
-
-        Transfer packs [control_map(s), target_clip] per sample. The no-control branch
-        drops the control maps from the vision sequence; the text caption and target
-        clip remain. Returns None when every sample has at most one vision item.
-
-        Also returns ``ctrl_dims_per_sample``: flattened control-token width per sample,
-        used to slice ``noise_x`` and blend velocities on the target suffix.
-        """
-        num_items_per_sample = gen_data_clean.num_vision_items_per_sample
-        if num_items_per_sample is None or all(n <= 1 for n in num_items_per_sample):
-            return None
-
-        assert gen_data_clean.x0_tokens_vision is not None
-
-        new_x0_tokens_vision: list[torch.Tensor] = []
-        new_raw_state_vision: list[torch.Tensor] | None = [] if gen_data_clean.raw_state_vision is not None else None
-        ctrl_dims_per_sample: list[int] = []
-        vis_offset = 0
-        for n_vis in num_items_per_sample:
-            ctrl_dim_i = 0
-            for j in range(n_vis - 1):
-                sh = gen_data_clean.x0_tokens_vision[vis_offset + j].shape
-                ctrl_dim_i += int(torch.tensor(list(sh)).prod().item())
-            ctrl_dims_per_sample.append(ctrl_dim_i)
-            tgt_idx = vis_offset + n_vis - 1
-            new_x0_tokens_vision.append(gen_data_clean.x0_tokens_vision[tgt_idx])
-            if new_raw_state_vision is not None:
-                new_raw_state_vision.append(gen_data_clean.raw_state_vision[tgt_idx])  # type: ignore[index]
-            vis_offset += n_vis
-
-        gdc_nc = GenerationDataClean(
-            batch_size=gen_data_clean.batch_size,
-            is_image_batch=gen_data_clean.is_image_batch,
-            raw_state_vision=new_raw_state_vision,
-            x0_tokens_vision=new_x0_tokens_vision,
-            fps_vision=gen_data_clean.fps_vision,
-            num_vision_items_per_sample=None,
-            raw_state_action=gen_data_clean.raw_state_action,
-            x0_tokens_action=gen_data_clean.x0_tokens_action,
-            action_domain_id=gen_data_clean.action_domain_id,
-            fps_action=gen_data_clean.fps_action,
-            raw_action_dim=gen_data_clean.raw_action_dim,
-            raw_state_sound=gen_data_clean.raw_state_sound,
-            x0_tokens_sound=gen_data_clean.x0_tokens_sound,
-            fps_sound=gen_data_clean.fps_sound,
-        )
-
-        sp_nc = [
-            SequencePlan(
-                has_text=sp.has_text,
-                has_vision=sp.has_vision,
-                condition_frame_indexes_vision=sp.condition_frame_indexes_vision,
-                share_vision_temporal_positions=False,
-                has_action=sp.has_action,
-                condition_frame_indexes_action=sp.condition_frame_indexes_action,
-                action_start_frame_offset=sp.action_start_frame_offset,
-                has_sound=sp.has_sound,
-                condition_frame_indexes_sound=sp.condition_frame_indexes_sound,
-            )
-            for sp in sequence_plans
-        ]
-
-        return sp_nc, gdc_nc, ctrl_dims_per_sample
-
     @torch.no_grad()
     def generate_samples_from_batch(
         self,
@@ -2183,13 +2209,10 @@ def generate_samples_from_batch(
         sampler: Any | None = None,
         guidance: float = 1.5,
         guidance_interval: Optional[list[float]] = None,
-        control_guidance: float = 1.0,
-        control_guidance_interval: Optional[list[float]] = None,
         seed: list[int] | int = 1,
         n_sample: int | None = None,
         has_negative_prompt: bool = False,
         num_steps: int = 35,
-        align_num_steps: int | None = None,
         shift: float = 5.0,
         sigma_max: float = 80.0,
         skip_text_tokens_for_cfg: bool = False,
@@ -2224,11 +2247,6 @@ def generate_samples_from_batch(
             guidance (float): Classifier-free guidance weight.
             guidance_interval (list[float] | None): Optional timestep interval to apply guidance.
                 For the timesteps (ranging between 0-1000) that fall between the interval, we perform CFG, otherwise, we skip the unconditional generation.
-            control_guidance (float): Control-CFG scale for transfer inference. ``1.0`` (default)
-                disables the extra comparison forward; values ``> 1.0`` blend velocities from
-                with-control-map vs without-control-map forwards on the generated clip.
-            control_guidance_interval (list[float] | None): Optional timestep interval to apply
-                control-CFG; ``None`` applies on every step.
             seed (list[int] | int): Random seeds for noise generation. For all new use-cases,
                 we use a list of seeds, one for each sample. The length of the list must match
                 the number of samples. Legacy use-cases use a single integer seed which is
@@ -2237,17 +2255,6 @@ def generate_samples_from_batch(
             n_sample (int | None): Number of samples to generate; defaults to batch size.
             has_negative_prompt (bool): If True, use negative prompt for unconditional branch.
             num_steps (int): Number of sampling steps for the diffusion process.
-            align_num_steps (int | None): FSDP collective-sequence alignment
-                target. Under throughput-style inference each FSDP-shard rank holds
-                a different sample, and ``num_steps`` can diverge across ranks.
-                Since the model is sharded across the dp_shard group, every sampler
-                step issues a param all-gather over that group, so a step-count
-                mismatch deadlocks NCCL. The inference caller all_reduce(MAX)es the
-                local ``num_steps`` over the dp_shard group and passes the result
-                here; ranks with ``num_steps < align_num_steps`` run the deficit as
-                extra *discarded* sampler steps to keep every rank's all-gather
-                count identical. ``None`` (or a value ``<= num_steps``) disables
-                padding.
             shift (float): Time shift parameter for the sampler.
             sigma_max (float): Maximum sigma for the EDM sampler.
             skip_text_tokens_for_cfg (bool): If True, skip text tokens in unconditional branch.
@@ -2298,7 +2305,8 @@ def generate_samples_from_batch(
         Returns:
             Dict with keys:
                 - "vision": List of vision latent tensors (one per sample, variable shapes)
-                - "action": List of action latent tensors or None (only present when action_gen=True and has_action)
+                - "action": List of external-space action tensors or None
+                  (only present when action_gen=True and has_action)
 
         Raises:
             ValueError: If the number of samples does not match the number of noise tensors or seeds.
@@ -2354,28 +2362,32 @@ def generate_samples_from_batch(
 
         assert n_sample == len(seed), f"Number of samples {n_sample} must match number of seeds {len(seed)}"
 
-        no_control_state = None
-        if control_guidance != 1.0:
-            no_control_state = self._build_no_control_inference_state(sequence_plans, gen_data_clean)
-            if no_control_state is None:
-                log.warning(
-                    "control_guidance != 1.0 but no multi-vision sample found; "
-                    "control-CFG disabled (single-branch inference)."
-                )
-
-        # FSDP collective-sequence alignment (throughput-style inference). Each
-        # FSDP-shard rank holds a different sample, and ``velocity_fn`` issues 1
-        # model forward when this rank skips CFG (guidance == 1.0, or a timestep
-        # outside guidance_interval) but 2 forwards otherwise. Mixed-modality
-        # batches put e.g. action samples (guidance=1.0 -> 1 forward) and vision
-        # samples (guidance>1 -> 2 forwards) on different ranks of the same
-        # dp_shard group, so the per-step param all-gather count diverges and NCCL
-        # deadlocks. The per-call CFG decision is MAX-reduced over the dp_shard
-        # group inside ``velocity_fn`` below so every rank runs the same number of
-        # forwards. Scoped to dp_shard for the same reason as the num_steps
-        # alignment (see inference.py): cp/cfgp peers share a sample -> same
-        # guidance -> same decision, and the caller's nesting guard rejects layouts
-        # where that would not hold.
+        # Before building the velocity function used by the sampler, set up
+        # FSDP collective-sequence alignment (throughput-preset inference).
+        #
+        # In throughput-preset inference each rank holds a different sample,
+        # and different samples can diverge on (a) the CFG decision per
+        # step — ``guidance != 1.0`` (and the optional ``guidance_interval``
+        # gate) determines whether ``velocity_fn`` issues 1 or 2 model
+        # forwards — and (b) ``num_steps``. Either divergence makes the
+        # FSDP allgather sequence misalign across ranks, deadlocking NCCL
+        # at the 30-min watchdog timeout.
+        #
+        # We align in two places:
+        #   1. Inside velocity_fn (per call): all_reduce the local CFG
+        #      decision; if ANY rank needs CFG, every rank does both
+        #      forwards (cond + uncond). Ranks whose local decision was
+        #      "no CFG" return ``cond_v`` directly — bit-identical to the
+        #      original no-CFG path (no guidance blend, no normalize_cfg).
+        #   2. Around the sampler call: all_reduce the local num_steps;
+        #      ranks with local < max issue a dummy sampler call with the
+        #      remaining steps to pad the FSDP allgather stream. The
+        #      dummy call's output is discarded; ``latents`` is never
+        #      re-bound.
+        #
+        # Both collectives are scoped to the FSDP shard group (the only
+        # process group whose collective sequence is at risk), so they're
+        # safe under non-trivial parallel layouts.
         if (
             self.parallel_dims is not None
             and self.parallel_dims.dp_shard_mesh is not None
@@ -2388,8 +2400,6 @@ def generate_samples_from_batch(
             _dp_shard_group = None
             _align_device = None
 
-        # Create a velocity function for a single sample (for use with self.sampler).
-
         def velocity_fn(noise_x: list[torch.Tensor], timestep: torch.Tensor) -> list[torch.Tensor]:
             # len(noise_x) == B, noise_x[i] is shape (D)
             # timestep is shape (B, 1)
@@ -2412,102 +2422,44 @@ def _single_velocity_fn(tokens: list[list[int]], skip_text_tokens: bool):
                     skip_text_tokens=skip_text_tokens,
                 )
 
-            # Local CFG decision for THIS rank, honoring guidance_interval.
-            _local_needs_text_cfg = guidance != 1.0
-            if _local_needs_text_cfg and guidance_interval is not None:
+            # Local CFG decision for this rank: guidance != 1.0 and, if guidance_interval is set, timestep inside it.
+            needs_cfg = guidance != 1.0
+            if needs_cfg and guidance_interval is not None:
                 assert len(guidance_interval) == 2, f"guidance_interval must be [lo, hi], got {guidance_interval}"
                 t_lo, t_hi = guidance_interval
-                _local_needs_text_cfg = t_lo < timestep[0].item() < t_hi
-
-            _local_needs_control_cfg = no_control_state is not None
-            if _local_needs_control_cfg and control_guidance_interval is not None:
-                assert len(control_guidance_interval) == 2, (
-                    f"control_guidance_interval must be [lo, hi], got {control_guidance_interval}"
-                )
-                t_lo_c, t_hi_c = control_guidance_interval
-                _local_needs_control_cfg = t_lo_c < timestep[0].item() < t_hi_c
+                needs_cfg = t_lo < timestep[0].item() < t_hi
 
-            # FSDP alignment: if ANY rank in the shard group needs CFG or control-CFG this call,
-            # every rank computes the matching forwards (cheap 1-element all_reduce per
-            # velocity_fn call). Forcing CFG always-on globally would instead
-            # silently ignore the per-timestep guidance_interval gate.
+            # FSDP alignment: if ANY rank in the shard group needs CFG this
+            # call, every rank computes both forwards. Cheap 1-element
+            # all_reduce per velocity_fn call; the alternative (forcing CFG
+            # always-on globally) would silently ignore the per-timestep
+            # ``guidance_interval`` gate.
             if _dp_shard_group is not None:
-                _cfg_t = torch.tensor(
-                    [1 if _local_needs_text_cfg else 0], device=_align_device, dtype=torch.int32
-                )
+                _cfg_t = torch.tensor([1 if needs_cfg else 0], device=_align_device, dtype=torch.int32)
                 torch.distributed.all_reduce(_cfg_t, op=torch.distributed.ReduceOp.MAX, group=_dp_shard_group)
-                _any_needs_text_cfg = bool(_cfg_t.item())
-                _ctrl_t = torch.tensor(
-                    [1 if _local_needs_control_cfg else 0], device=_align_device, dtype=torch.int32
-                )
-                torch.distributed.all_reduce(_ctrl_t, op=torch.distributed.ReduceOp.MAX, group=_dp_shard_group)
-                _any_needs_control_cfg = bool(_ctrl_t.item())
+                _any_needs_cfg = bool(_cfg_t.item())
             else:
-                _any_needs_text_cfg = _local_needs_text_cfg
-                _any_needs_control_cfg = _local_needs_control_cfg
+                _any_needs_cfg = needs_cfg
 
-            if not _any_needs_text_cfg and not _any_needs_control_cfg:
+            if not _any_needs_cfg:
                 return _single_velocity_fn(cond_tokens, skip_text_tokens=False)
 
-            if _any_needs_control_cfg:
-                cond_v_full = _single_velocity_fn(cond_tokens, skip_text_tokens=False)
-                ctrl_dims: list[int] | None = None
-                if no_control_state is not None:
-                    sp_nc, gdc_nc, ctrl_dims = no_control_state
-                    noise_x_nc = [nx[ctrl_dim:] for nx, ctrl_dim in zip(noise_x, ctrl_dims)]
-                    cond_v_nc = self._get_velocity(
-                        net=net,
-                        noise_x=noise_x_nc,
-                        timestep=timestep,
-                        text_tokens=cond_tokens,
-                        sequence_plans=sp_nc,
-                        gen_data_clean=gdc_nc,
-                        skip_text_tokens=False,
-                    )
-                else:
-                    # Another rank in the dp_shard group needs control-CFG, so this
-                    # rank executes a second forward only for FSDP collective alignment.
-                    cond_v_nc = _single_velocity_fn(cond_tokens, skip_text_tokens=False)
-                if _local_needs_control_cfg:
-                    assert ctrl_dims is not None, "local control-CFG requires no_control_state"
-                    cond_v = []
-                    for v_full_i, v_nc_i, ctrl_dim_i in zip(cond_v_full, cond_v_nc, ctrl_dims):
-                        suffix_full = v_full_i[ctrl_dim_i:]
-                        assert suffix_full.shape == v_nc_i.shape, (
-                            f"shape mismatch in control-CFG mix: full suffix {suffix_full.shape} "
-                            f"vs no-control {v_nc_i.shape}"
-                        )
-                        mixed_suffix = v_nc_i + control_guidance * (suffix_full - v_nc_i)
-                        cond_v.append(torch.cat([v_full_i[:ctrl_dim_i], mixed_suffix], dim=0))
-                else:
-                    cond_v = cond_v_full
-
-                if not _any_needs_text_cfg:
-                    return cond_v
-
-                uncond_v = _single_velocity_fn(uncond_tokens, skip_text_tokens=skip_text_tokens_for_cfg)
-                if not _local_needs_text_cfg:
-                    return cond_v
-
-                v_pred = [u_i + guidance * (c_i - u_i) for c_i, u_i in zip(cond_v, uncond_v)]
-            else:
-                # Both forwards happen — needed for FSDP collective alignment
-                # across ranks even if THIS rank's local decision was "no CFG".
-                cond_v, uncond_v = self._run_classifier_free_guidance(
-                    cond_tokens=cond_tokens,
-                    uncond_tokens=uncond_tokens,
-                    skip_text_tokens_for_cfg=skip_text_tokens_for_cfg,
-                    single_velocity_fn=_single_velocity_fn,
-                )
+            cond_v, uncond_v = self._run_classifier_free_guidance(
+                cond_tokens=cond_tokens,
+                uncond_tokens=uncond_tokens,
+                skip_text_tokens_for_cfg=skip_text_tokens_for_cfg,
+                single_velocity_fn=_single_velocity_fn,
+            )
 
-                if not _local_needs_text_cfg:
-                    # This rank didn't actually need CFG (guidance==1.0, or sigma
-                    # outside guidance_interval). Return cond_v directly so the output
-                    # is bit-identical to the no-CFG path; the uncond_v forward ran
-                    # only to keep the FSDP all-gather sequence aligned with peers.
-                    return cond_v
+            if not needs_cfg:
+                # This rank doesn't actually need CFG (guidance==1.0 or timestep
+                # outside guidance_interval). Return cond_v directly so the
+                # output is bit-identical to the original no-CFG path; the
+                # uncond_v forward was only run to keep the FSDP allgather
+                # sequence aligned with peers.
+                return cond_v
 
-                v_pred = [u_i + guidance * (c_i - u_i) for c_i, u_i in zip(cond_v, uncond_v)]
+            v_pred = [u_i + guidance * (c_i - u_i) for c_i, u_i in zip(cond_v, uncond_v)]
 
             if normalize_cfg:
                 v_pred = [
@@ -2517,14 +2469,18 @@ def _single_velocity_fn(tokens: list[list[int]], skip_text_tokens: bool):
 
             return v_pred
 
-        # FSDP collective-sequence alignment (throughput-preset inference). The
-        # inference caller passes the cross-rank MAX of num_steps as
-        # ``align_num_steps``; ranks short of it pad their FSDP all-gather stream
-        # with ``_extra_num_steps`` discarded sampler steps below. See the
-        # ``align_num_steps`` arg docstring for the full rationale.
-        _extra_num_steps = 0
-        if align_num_steps is not None and align_num_steps > num_steps:
-            _extra_num_steps = align_num_steps - num_steps
+        # FSDP collective-sequence alignment (sampler outer loop). See the
+        # large block above the velocity_fn definition for the full
+        # rationale. all_reduce on the local num_steps so every rank knows
+        # the max; below, ranks with local < max issue a dummy sampler call
+        # to pad their FSDP allgather sequence.
+        if _dp_shard_group is not None:
+            _local_steps_t = torch.tensor([num_steps], device=_align_device, dtype=torch.int32)
+            torch.distributed.all_reduce(_local_steps_t, op=torch.distributed.ReduceOp.MAX, group=_dp_shard_group)
+            _max_num_steps = int(_local_steps_t.item())
+        else:
+            _max_num_steps = num_steps
+        _extra_num_steps = _max_num_steps - num_steps
 
         # Run sampler for all samples at once.
         sampler = sampler or self.sampler
@@ -2543,14 +2499,14 @@ def _single_velocity_fn(tokens: list[list[int]], skip_text_tokens: bool):
                 seed=seed,
             )
             if _extra_num_steps > 0:
-                # Dummy sampler call issuing (_extra_num_steps × per-step) FSDP
-                # all-gathers to pad this rank's collective stream. Output is
-                # discarded (``latents`` keeps the real result above); slow ranks
-                # have _extra_num_steps==0 and issue the same all-gather count via
-                # their longer real call.
+                # Dummy sampler call to issue (_extra_num_steps × per-step)
+                # FSDP allgathers; output discarded so `latents` keeps the
+                # real result captured above. Slow ranks have _extra_num_steps==0
+                # here, but they're issuing the SAME number of in-sampler
+                # collectives via their longer real call.
                 log.debug(
-                    f"FSDP alignment: dummy UniPC run with {_extra_num_steps} extra steps "
-                    f"(local={num_steps}, aligned={align_num_steps})"
+                    f"FSDP alignment: dummy sampler run with {_extra_num_steps} "
+                    f"extra steps (local={num_steps}, max={_max_num_steps})"
                 )
                 _ = sampler(
                     velocity_fn,
@@ -2586,30 +2542,50 @@ def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
                 solver_option="2ab",
             )
             if _extra_num_steps > 0:
-                # Pad the FSDP all-gather stream with ``_extra_num_steps`` direct
-                # ``x0_fn`` calls rather than a nested EDM sampler run, which would
-                # add an extra ``sample_clean`` forward (see edm.py) and hit a
-                # ``get_rev_ts(num_steps=0)`` divide-by-zero. Each x0_fn call routes
-                # through the same ``velocity_fn`` closure (one model forward,
-                # discarded). The dummy sigma is mapped into the RF domain
-                # ``sigma/(1+sigma)`` the real EDM loop uses; its exact value is
-                # irrelevant for alignment (collective shape, not sigma, drives it).
+                # Pad the FSDP allgather sequence with ``_extra_num_steps``
+                # direct ``x0_fn`` calls instead of a second EDM sampler
+                # run. Avoids two EDM-specific footguns:
+                #   (1) ``EDMSampler._forward_impl`` always runs an extra
+                #       ``sample_clean`` denoiser forward (see
+                #       ``cosmos_framework/model/vfm/diffusion/samplers/edm.py``).
+                #       A nested sampler call would add one too many
+                #       forwards on fast ranks, since the slow rank's
+                #       single call also pays the ``sample_clean`` cost.
+                #   (2) ``get_rev_ts(..., num_steps=0)`` divides by zero,
+                #       producing NaN sigmas. The fix's ``extra==1`` edge
+                #       case would need num_steps=0 to balance the count.
+                # Direct ``x0_fn`` calls bypass both: each call routes
+                # through the same ``velocity_fn`` closure (so the
+                # per-call CFG all_reduce still aligns ranks), issues
+                # exactly one model forward, and discards its return.
+                # ``latents`` is the catted single tensor at this point;
+                # the dummy sigma value is irrelevant for collective
+                # alignment because the model's allgather sequence is
+                # determined by tensor shapes, not sigma.
                 log.debug(
                     f"FSDP alignment: padding {_extra_num_steps} dummy x0_fn calls "
-                    f"(local={num_steps}, aligned={align_num_steps})"
+                    f"(local={num_steps}, max={_max_num_steps})"
                 )
+                # ``x0_fn`` expects a sigma in the RF domain (the real EDM
+                # loop converts raw sigmas via ``sigmas_L / (1 + sigmas_L)``
+                # at edm.py:174, landing them in ``(0, 1)``). Mirror that
+                # transform here so the dummy call's timestep stays in the
+                # same numerical domain as a real sampler step. The exact
+                # value doesn't matter for collective alignment, only the
+                # domain.
                 _dummy_sigma = latents.new_tensor(sigma_max / (1.0 + sigma_max))
                 for _ in range(_extra_num_steps):
                     _ = x0_fn(latents, _dummy_sigma)
             latents = list(torch.split(latents, chunk_sizes, dim=0))
 
-        # Split flattened latents back into vision, action, and sound
+        # Split flattened latents back into vision latents, external actions, and sound latents
         # Mirror the per-sample logic from _prepare_inference_data:
         # Order: [vision | action (if present) | sound (if present)]
         # action/sound lists are dense (only modality-having samples), so use separate indexes.
         result_vision: list[torch.Tensor] = []
         result_action: list[torch.Tensor] = []
         result_sound: list[torch.Tensor] = []
+        action_processing_records = get_action_processing_records(data_batch)
         idx_vision = 0
         idx_action = 0
         idx_sound = 0
@@ -2624,7 +2600,6 @@ def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
                 vision_shape = gen_data_clean.x0_tokens_vision[idx_vision + j].shape
                 vision_dim = int(torch.prod(torch.tensor(vision_shape)))
                 if j == n_vis - 1:  # the last vision item is the only target for each sample.
-
                     result_vision.append(latents[i][offset : offset + vision_dim].reshape(vision_shape))
                 else:  # the other vision items are the condition inputs that we don't need to return
                     pass
@@ -2636,7 +2611,15 @@ def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
                 assert gen_data_clean.x0_tokens_action is not None
                 action_shape = gen_data_clean.x0_tokens_action[idx_action].shape
                 action_dim = int(torch.prod(torch.tensor(action_shape)))
-                result_action.append(latents[i][offset : offset + action_dim].reshape(action_shape))
+                action_model = latents[i][offset : offset + action_dim].reshape(action_shape)  # [T,D_model]
+                action_record = action_processing_records[i] if i < len(action_processing_records) else None
+                if action_record is None:
+                    raise ValueError(
+                        f"Generated action output for sample {i} cannot be externalized without "
+                        "action_processing_record"
+                    )
+                action_external = ActionProcessor.postprocess_action(action_model, action_record)  # [T,D_raw]
+                result_action.append(action_external)
                 offset += action_dim
                 idx_action += 1
 
@@ -2744,6 +2727,11 @@ def _slice_gen_data_clean(self, gen_data_clean: GenerationDataClean, start: int,
             subset_raw_vision = (
                 gen_data_clean.raw_state_vision[vis_start:vis_end] if gen_data_clean.raw_state_vision else None
             )
+            subset_temporal_positions_vision = (
+                gen_data_clean.temporal_positions_vision[vis_start:vis_end]
+                if gen_data_clean.temporal_positions_vision
+                else None
+            )
             subset_num_items = num_items[start:limit]
         else:
             # Standard single-item mode
@@ -2751,6 +2739,11 @@ def _slice_gen_data_clean(self, gen_data_clean: GenerationDataClean, start: int,
             subset_raw_vision = (
                 gen_data_clean.raw_state_vision[start:limit] if gen_data_clean.raw_state_vision else None
             )
+            subset_temporal_positions_vision = (
+                gen_data_clean.temporal_positions_vision[start:limit]
+                if gen_data_clean.temporal_positions_vision
+                else None
+            )
             subset_num_items = None
         fps_vision = gen_data_clean.fps_vision[start:limit] if gen_data_clean.fps_vision is not None else None
 
@@ -2788,6 +2781,7 @@ def _slice_gen_data_clean(self, gen_data_clean: GenerationDataClean, start: int,
             x0_tokens_action=x0_tokens_action,
             x0_tokens_sound=x0_tokens_sound,
             fps_vision=fps_vision,
+            temporal_positions_vision=subset_temporal_positions_vision,
             fps_action=fps_action,
             fps_sound=fps_sound,
             action_domain_id=action_domain_id,
@@ -2814,7 +2808,6 @@ def get_data_and_condition(self, data_batch: dict[str, torch.Tensor], iteration:
         is_image_batch = self.is_image_batch(data_batch)
         sample_vision_list = data_batch[self.input_image_key if is_image_batch else self.input_video_key]
 
-
         # we should always get this information here during training. If we can read this field
         # from data_batch it means we are in the visualization callback:
         if "num_vision_items_per_sample" not in data_batch:
@@ -2827,7 +2820,6 @@ def get_data_and_condition(self, data_batch: dict[str, torch.Tensor], iteration:
             num_vision_items_per_sample: list[int] | None = (
                 [len(v) for v in sample_vision_list] if has_multiple_vision_per_sample else None
             )
-
             # information is only stored in the GenerationDataClean object which will be discarded
             # outside the training loop. Error will be raised when the data batch is passed to the
             # visualization callbacks.
@@ -2869,6 +2861,11 @@ def get_data_and_condition(self, data_batch: dict[str, torch.Tensor], iteration:
         if frame_size is not None:
             x0_tokens_vision = self._remove_padding_from_latent(x0_tokens_vision, frame_size)
 
+        temporal_positions_vision = self._get_temporal_positions_vision(
+            raw_state_vision=raw_state_vision,
+            x0_tokens_vision=x0_tokens_vision,
+        )
+
         # Action – extract dense action / domain_id without mutating data_batch,
         # so downstream callbacks can still read the original per-sample domain_ids.
         raw_state_action, action_domain_id = self._normalize_action_databatch(data_batch)
@@ -2934,6 +2931,7 @@ def get_data_and_condition(self, data_batch: dict[str, torch.Tensor], iteration:
             x0_tokens_action=x0_tokens_action,
             x0_tokens_sound=x0_tokens_sound,
             fps_vision=fps_vision,
+            temporal_positions_vision=temporal_positions_vision,
             fps_action=fps_action,
             fps_sound=fps_sound,
             action_domain_id=action_domain_id,
@@ -3135,14 +3133,59 @@ def _augment_image_dim_inplace(self, data_batch: dict[str, torch.Tensor], input_
 
     # ------------------ Checkpointing ------------------
 
-    def state_dict(self, prefix: str = "", **kwargs) -> Dict[str, Any]:
-        final_state_dict = self.net.state_dict(prefix=prefix + "net.", **kwargs)
+    def state_dict(
+        self,
+        destination: dict[str, Any] | None = None,
+        prefix: str = "",
+        keep_vars: bool = False,
+    ) -> dict[str, Any]:
+        """Return checkpointable model weights using OmniMoT's flat key layout.
+
+        The regular network is saved under ``net.*`` keys.  When EMA is
+        enabled, the EMA copy is saved under matching ``net_ema.*`` keys so
+        the DCP loader can materialize both trees from one flat state dict.
+        The optional ``prefix`` is prepended before those namespaces, matching
+        the ``torch.nn.Module.state_dict`` convention.
+
+        The full ``torch.nn.Module.state_dict`` signature (``destination``,
+        ``prefix``, ``keep_vars``) is honored so this module behaves correctly
+        when a parent module's ``state_dict`` recurses into it: PyTorch ignores
+        the child return value and expects the entries to be written into the
+        provided ``destination`` mapping.
+
+        If ``exclude_reasoner_weights_from_checkpoint`` is enabled, the
+        understanding/reasoner tower keys are omitted from both regular and
+        EMA state dicts; generation-pathway weights and VFM heads remain
+        checkpointed.
+        """
+        reg_state_dict = self._net_state_dict(
+            self.net,
+            prefix=prefix + "net.",
+            keep_vars=keep_vars,
+        )
+
         if self.config.ema.enabled:
-            ema_state_dict = self.net_ema.state_dict(prefix=prefix + "net_ema.", **kwargs)
-            final_state_dict.update(ema_state_dict)
-        return final_state_dict
+            ema_state_dict = self._net_state_dict(
+                self.net_ema,
+                prefix=prefix + "net_ema.",
+                keep_vars=keep_vars,
+            )
+        else:
+            ema_state_dict = {}
 
-    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True, assign: bool = False):
+        if destination is not None:
+            destination.update(reg_state_dict)
+            destination.update(ema_state_dict)
+            return destination
+
+        return {**reg_state_dict, **ema_state_dict}
+
+    def load_state_dict(
+        self,
+        state_dict: Mapping[str, Any],
+        strict: bool = True,
+        assign: bool = False,
+    ) -> _IncompatibleKeys:
         """
         Loads a state dictionary into the model and optionally its EMA counterpart.
 
@@ -3150,41 +3193,90 @@ def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True, as
             state_dict (Mapping[str, Any]): A dictionary containing separate state
                 dictionaries for the model and potentially for an EMA version of the model
                 under the keys 'net' and 'net_ema', respectively.
-            strict (bool, optional): If True, the method will enforce that the keys in
-                the state dict match exactly those in the model and EMA model (if applicable).
-                Defaults to True.
-            assign (bool, optional): If True and in strict mode, will assign the state dictionary
-                directly rather than matching keys one-by-one. This is typically used when loading
-                parts of state dicts or using customized loading procedures. Defaults to False.
+            strict (bool, optional): Must be False. Missing and unexpected keys are
+                returned to the caller in an `_IncompatibleKeys` object so the DCP
+                wrapper can report them after `set_model_state_dict` completes.
+                Passing True raises ValueError.
+            assign (bool, optional): Must be False. Assign-mode loading is not
+                supported by this checkpoint path; passing True raises ValueError.
+                Defaults to False.
+
+        Returns:
+            _IncompatibleKeys: A tuple containing the missing and unexpected keys.
         """
-        if not strict:
-            raise ValueError("Strict mode is required for OmniMoTModel load_state_dict")
+        # Note that strict must be set to False to avoid facing errors inside the
+        # `set_model_state_dict` function in the parent class. The caller must check
+        # the returned `_IncompatibleKeys` to get the missing and unexpected keys,
+        # and raise errors if needed.
+        if strict:
+            raise ValueError("Strict mode is not supported for OmniMoTModel load_state_dict")
         if assign:
             raise ValueError("Assign mode is not supported for OmniMoTModel load_state_dict")
 
+        missing_keys: list[str] = []
+        unexpected_keys: list[str] = []
+
         _reg_state_dict = collections.OrderedDict()
         _ema_state_dict = collections.OrderedDict()
         for k, v in state_dict.items():
             if k.startswith("net."):
-                _reg_state_dict[k.replace("net.", "")] = v
-            elif k.startswith("net_ema."):
-                _ema_state_dict[k.replace("net_ema.", "")] = v
-
-        state_dict = _reg_state_dict
+                _reg_state_dict[k.removeprefix("net.")] = v
+            elif k.startswith("net_ema.") and self.config.ema.enabled:
+                _ema_state_dict[k.removeprefix("net_ema.")] = v
+            else:
+                # If the key is prefixed with "net_ema." but EMA is not enabled, it
+                # is unexpected. If the key is not prefixed with "net." or "net_ema.",
+                # it is unexpected.
+                unexpected_keys.append(k)
 
-        reg_results: _IncompatibleKeys = self.net.load_state_dict(_reg_state_dict, strict=True, assign=False)
-        missing_keys = reg_results.missing_keys
-        unexpected_keys = reg_results.unexpected_keys
+        reg_results = self._load_net_state_dict(self.net, _reg_state_dict)
+        missing_keys.extend(f"net.{k}" for k in reg_results.missing_keys)
+        unexpected_keys.extend(f"net.{k}" for k in reg_results.unexpected_keys)
 
         if self.config.ema.enabled:
-            ema_results: _IncompatibleKeys = self.net_ema.load_state_dict(_ema_state_dict, strict=True, assign=False)
-            missing_keys += ema_results.missing_keys
-            unexpected_keys += ema_results.unexpected_keys
-        else:
-            assert len(_ema_state_dict) == 0, f"EMA is disabled but EMA state dict is not empty: {len(_ema_state_dict)}"
+            ema_results = self._load_net_state_dict(self.net_ema, _ema_state_dict)
+            missing_keys.extend(f"net_ema.{k}" for k in ema_results.missing_keys)
+            unexpected_keys.extend(f"net_ema.{k}" for k in ema_results.unexpected_keys)
 
         return _IncompatibleKeys(missing_keys=missing_keys, unexpected_keys=unexpected_keys)
 
+    def _net_state_dict(
+        self,
+        net: torch.nn.Module,
+        prefix: str = "",
+        keep_vars: bool = False,
+    ) -> dict[str, Any]:
+        """Return ``net.state_dict(prefix=..., keep_vars=...)``, optionally filtered.
+
+        Reasoner keys are dropped when ``exclude_reasoner_weights_from_checkpoint`` is set.
+        """
+        full_state = net.state_dict(prefix=prefix, keep_vars=keep_vars)
+        if not self.config.exclude_reasoner_weights_from_checkpoint:
+            return full_state
+        return {k: v for k, v in full_state.items() if not _is_reasoner_state_dict_key(k.removeprefix(prefix))}
+
+    def _load_net_state_dict(
+        self,
+        net: torch.nn.Module,
+        state_dict: Mapping[str, Any],
+    ) -> _IncompatibleKeys:
+        """Load ``state_dict`` into ``net`` non-strictly and report key mismatches.
+
+        With ``exclude_reasoner_weights_from_checkpoint`` set, reasoner keys are
+        removed from the incoming mapping before loading and filtered out of the
+        reported missing keys afterwards.
+        """
+        skip_reasoner = self.config.exclude_reasoner_weights_from_checkpoint
+        if skip_reasoner:
+            kept_items = [(k, v) for k, v in state_dict.items() if not _is_reasoner_state_dict_key(k)]
+            state_dict = collections.OrderedDict(kept_items)
+        result: _IncompatibleKeys = net.load_state_dict(state_dict, strict=False, assign=False)
+        missing = result.missing_keys
+        if skip_reasoner:
+            # Reasoner keys were withheld on purpose above, so they are not reported as missing.
+            missing = [k for k in missing if not _is_reasoner_state_dict_key(k)]
+        return _IncompatibleKeys(missing_keys=missing, unexpected_keys=result.unexpected_keys)
+
     # ------------------ public methods ------------------
 
     def ema_beta(self, iteration: int) -> float:
@@ -3236,7 +3328,7 @@ def _extract_upsample_video_specs(
         tensor (``shape[-1]`` for width, ``shape[-2]`` for height), and
         the ``aspect_ratio`` string is reverse-looked-up against the
         canonical ``{IMAGE,VIDEO}_RES_SIZE_INFO`` tables in
-        :mod:`projects.cosmos3.vfm.datasets.utils` — image table for
+        :mod:`cosmos_framework.data.vfm.utils` — image table for
         ``"t2i"``, video table otherwise.  Note these tables are
         ``{res: {ar: (W, H)}}`` (the first entry is *width*); the
         existing logging-only lookup in
@@ -3251,7 +3343,7 @@ def _extract_upsample_video_specs(
         where ``num_frames`` is the temporal dimension
         (``shape[-3]``) of the same vision tensor.  For ``"t2i"`` both
         fields are returned as ``None`` so
-        :func:`projects.cosmos3.vfm.upsampler.prompts.build_user_text`'s
+        :func:`cosmos_framework.model.vfm.upsampler.prompts.build_user_text`'s
         ``t2i``-must-have-no-video-args contract is satisfied.
 
         Args:
@@ -3326,7 +3418,7 @@ def _extract_upsample_video_specs(
             raise ValueError(f"upsample task={task!r}: conditioning_fps must be positive; got {fps_int}.")
         num_frames = int(sample.shape[-3])
         # Integer-floor seconds matches the canonical V4.2 ``M:SS`` rendering
-        # in :func:`projects.cosmos3.vfm.upsampler.prompts._format_duration`,
+        # in :func:`cosmos_framework.model.vfm.upsampler.prompts._format_duration`,
         # which expects an int and rejects fractional seconds.
         duration_secs = max(1, num_frames // fps_int)
         return aspect_ratio, w, h, fps_int, duration_secs
@@ -3837,7 +3929,7 @@ def generate_reasoner_text(
                 ``np.ndarray``, or a CHW / HWC tensor).
             prompt_builder: Optional callback that maps a raw prompt
                 string to a chat-style messages list (e.g.
-                :func:`projects.cosmos3.vfm.upsampler.prompts.build_messages`
+                :func:`cosmos_framework.model.vfm.upsampler.prompts.build_messages`
                 for V4.2 caption upsampling).  When ``None``, prompts are
                 wrapped as ``[{"role": "user", "content": prompt}]`` with
                 no system message.
@@ -4082,7 +4174,7 @@ def upsample_captions(
         prompt-driven branch.  The only thing this method adds on top of
         the generic per-prompt loop is the V4.2 chat-template injection:
         each caption is wrapped via
-        :func:`projects.cosmos3.vfm.upsampler.prompts.build_messages`
+        :func:`cosmos_framework.model.vfm.upsampler.prompts.build_messages`
         (which returns ``[system, user]`` with the user content embedding
         the caption inside the canonical V4.2 template — instructions,
         task constraints, and output JSON schema for the requested task).
@@ -4105,7 +4197,7 @@ def upsample_captions(
           position ids) before kicking off the AR decode loop.
 
         Each raw reasoner output is post-processed by
-        :func:`projects.cosmos3.vfm.upsampler.prompts.clean_response`
+        :func:`cosmos_framework.model.vfm.upsampler.prompts.clean_response`
         before being returned.  The cleaner strips
         ``<think>`` / ``<reasoning>`` / ``<thinking>`` / etc. reasoning
         blocks and any prose preamble that appears before the
@@ -4157,7 +4249,7 @@ def upsample_captions(
             fps: Target frames-per-second for the generated clip.
                 Required for the video tasks (``"t2v"``, ``"i2v"``)
                 and must be ``None`` for ``"t2i"`` — the underlying
-                :func:`projects.cosmos3.vfm.upsampler.prompts.build_user_text`
+                :func:`cosmos_framework.model.vfm.upsampler.prompts.build_user_text`
                 raises ``ValueError`` if a video task is missing
                 ``fps`` or ``duration_secs``.
             duration_secs: Clip duration in whole seconds (rendered as
@@ -4259,25 +4351,32 @@ def _builder(description: str) -> list[dict[str, Any]]:
         # into ``data_batch[self.input_caption_key]`` at the call site.
         cleaned_outputs: list[str] = []
         n_stripped = 0
-        n_fallback = 0
         for raw, original in zip(raw_outputs, captions):
             cleaned_text, clean_info = clean_response(raw)
             if not clean_info["was_clean"]:
                 n_stripped += 1
             if not cleaned_text.strip():
                 cleaned_text = original
-                n_fallback += 1
+
+            # Stamp the actual generation ``duration`` onto the upsampled
+            # JSON object using the duration_secs argument. Only done for
+            # T2V and I2V tasks.
+            if duration_secs is not None:
+                cleaned_text = cleaned_text.removeprefix("```json").removesuffix("```").strip()
+                obj = json.loads(cleaned_text)
+                assert isinstance(obj, dict), f"JSON parsing failed with error: {type(obj)}"
+                obj["duration"] = f"{duration_secs}s"
+                cleaned_text = json.dumps(obj)
+
             cleaned_outputs.append(cleaned_text)
 
         # Stay silent on the canonical all-clean path; only emit
         # telemetry when something actually happened.  Logged per-rank
         # to match the surrounding upsampling logs in
         # :meth:`generate_samples_from_batch` (line ~2218).
-        if n_stripped or n_fallback:
+        if n_stripped:
             log.info(
-                f"upsample_captions(task={task!r}, n={len(raw_outputs)}): "
-                f"thinking-stripped={n_stripped}, "
-                f"empty-clean-fallback={n_fallback}",
+                f"upsample_captions(task={task!r}, n={len(raw_outputs)}): thinking-stripped={n_stripped}",
                 rank0_only=False,
             )
 
@@ -4401,3 +4500,18 @@ def _broadcast_seed(seed: list[int], group: dist.ProcessGroup, rank: int) -> lis
 
     dist.broadcast(seed_tensor, group=group, group_src=0)
     return seed_tensor.tolist()
+
+
+def _is_reasoner_state_dict_key(key: str) -> bool:
+    """Tell whether ``key`` names an understanding/reasoner-tower weight.
+
+    Wrapper segments (``_orig_mod.``, ``_checkpoint_wrapped_module.``) are
+    stripped first.  A key qualifies only when it then starts with
+    ``language_model.`` and contains no ``_moe_gen`` marker; keys carrying
+    ``_moe_gen`` (generation pathway) and every key outside
+    ``language_model`` yield False.
+    """
+    for wrapper_segment in ("_orig_mod.", "_checkpoint_wrapped_module."):
+        key = key.replace(wrapper_segment, "")
+    is_language_model_key = key.startswith("language_model.")
+    return is_language_model_key and "_moe_gen" not in key
diff --git a/cosmos_framework/model/vfm/parallelize_vlm.py b/cosmos_framework/model/vfm/parallelize_vlm.py
index d4f31a4..04dcca3 100644
--- a/cosmos_framework/model/vfm/parallelize_vlm.py
+++ b/cosmos_framework/model/vfm/parallelize_vlm.py
@@ -1,14 +1,15 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """FSDP2 wrapping for Cosmos3 VLM ``HFModel`` instances.
 
 Hosts the single VLM-specific ``parallelize`` entry point used by
-``vlm_model.VLMModel._init_vlm``.  Lives under ``projects/cosmos3/vfm/models/``
+``vlm_model.VLMModel._init_vlm``.  Lives under ``cosmos_framework/model/vfm/``
 so the FSDP wrapping concern sits next to the model class it operates on
 (mirroring the layout of ``models/mot/parallelize_unified_mot.py`` for the
 MoT path).
 
-Pure parallelism plumbing — :class:`~projects.cosmos3.vfm.utils.parallelism.ParallelDims`
+Pure parallelism plumbing — :class:`~cosmos_framework.utils.vfm.parallelism.ParallelDims`
 and its meshes — stays in ``vfm/utils/parallelism.py``.
 """
 
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/__init__.py b/cosmos_framework/model/vfm/tokenizers/audio/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/__init__.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae.py b/cosmos_framework/model/vfm/tokenizers/audio/avae.py
index a23ee80..d28e910 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae.py
@@ -2,9 +2,9 @@
 # SPDX-License-Identifier: OpenMDW-1.1
 
 """
-AVAE (Audio Variational AutoEncoder) Tokenizer.
-
-Ported from BigVGAN (https://github.com/NVIDIA/BigVGAN).
+AVAE (Audio Variational AutoEncoder) Tokenizer for Imaginaire4
+ported from https://invalid_url
+commit hash: 80fbd8cfecb1867cc864e6d4fe0a474d8403a474
 """
 
 import os
@@ -128,7 +128,7 @@ def _load_avae_model(
         )
 
     # Create model directly on device (don't use meta device)
-
+    # NOTE: Unlike WanVAE/FluxVAE, AVAE uses weight_norm extensively in OobleckDecoder
     # and SpectrogramConvNeXtEncoder. After loading the checkpoint, we must call
     # remove_weight_norm() which requires materialized tensors (not meta tensors).
     # Therefore, we create the model directly on the target device instead of using
@@ -358,7 +358,6 @@ def __init__(
             use_object_store = False
 
             # Parent directory is registered in checkpoint_db.
-
             if vae_path_full:
                 vae_dir, vae_name = os.path.split(vae_path_full)
                 vae_dir = download_checkpoint_v2(vae_dir)
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/activations.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/activations.py
index e1eb85e..5120c99 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/activations.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/activations.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Implementation adapted from https://github.com/EdwardDixon/snake under the MIT license.
 
 from typing import List
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/__init__.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/__init__.py
index 5d8b9ed..6bae2ca 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/__init__.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/__init__.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 
 from .act import Activation1d
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/act.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/act.py
index 1d2763e..176a680 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/act.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/act.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 
 import torch.nn as nn
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/filter.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/filter.py
index 714cfac..27c71c7 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/filter.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/filter.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 
 import math
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/resample.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/resample.py
index 025d226..4f1df94 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/resample.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/alias_free_torch/resample.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Adapted from https://github.com/junjun3518/alias-free-torch under the Apache License 2.0
 
 import torch.nn as nn
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/bottlenecks.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/bottlenecks.py
index c7a27f4..36ddacc 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/bottlenecks.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/bottlenecks.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """Bottleneck modules for AVAE tokenizer.
 
 This cleaned-up version only includes VAEBottleneck which is used
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/env.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/env.py
index f0c3a88..2572ca9 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/env.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/env.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
 
 from typing import Any
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/models.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/models.py
index 130acfc..2567a5e 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/models.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/models.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
 
 """AVAE Models.
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/modules.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/modules.py
index 0cb53a3..a7a549e 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/modules.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/modules.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Adapted from https://github.com/jik876/hifi-gan under the MIT license.
 
 """AVAE Modules.
diff --git a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/modules_encodec.py b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/modules_encodec.py
index 0ffee6c..c8183a5 100644
--- a/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/modules_encodec.py
+++ b/cosmos_framework/model/vfm/tokenizers/audio/avae_utils/modules_encodec.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 # Adapted from https://github.com/facebookresearch/encodec under the MIT license.
 
 """Convolutional layers wrappers and utilities."""
diff --git a/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_4x32x32.py b/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_4x32x32.py
index 9aceec2..18b242a 100644
--- a/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_4x32x32.py
+++ b/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_4x32x32.py
@@ -1,11 +1,14 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
+from collections.abc import Sequence
+
 import torch
 
 from cosmos_framework.utils import log
 from cosmos_framework.utils.distributed import get_rank, sync_model_states
 from cosmos_framework.utils.easy_io import easy_io
+from cosmos_framework.data.vfm.utils import VIDEO_RES_SIZE_INFO
 from cosmos_framework.model.vfm.tokenizers.dc_ae.dc_ae_v import (
     DCAEV,
     DCAEVConfig,
@@ -13,7 +16,7 @@
 )
 from cosmos_framework.model.vfm.tokenizers.interface import VideoTokenizerInterface
 
-DEFAULT_MODEL_NAME = "dcae4x32x32_c64_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.1"
+DEFAULT_MODEL_NAME = "dcae4x32x32_c64_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2"
 
 
 class DCAE4x32x32Interface(VideoTokenizerInterface):
@@ -22,7 +25,7 @@ def __init__(
         bucket_name: str = "",
         object_store_credential_path_pretrained: str = "",
         vae_path: str = "",
-        chunk_duration: int = 81,
+        chunk_duration: int = 16,
         model_name: str = DEFAULT_MODEL_NAME,
         spatial_compression_factor: int = 32,
         temporal_compression_factor: int = 4,
@@ -30,15 +33,21 @@ def __init__(
         encode_bucket_multiple: int = 2,  # Placeholder
         device: str = "cuda",
         compilable: bool = True,
+        causal: bool = True,
     ):
+        self._causal = causal
+        assert self._causal, "DCAE4x32x32Interface is a causal tokenizer; causal must be True."
         vae_path_full = f"s3://{bucket_name}/{vae_path}"
         self._spatial_compression_factor = spatial_compression_factor
         self._temporal_compression_factor = temporal_compression_factor
         self.chunk_duration = chunk_duration
+        self.model_name = model_name
+        self.resolutions = None
 
         # Build config (without pretrained_path so DCAEV doesn't try to load itself).
         cfg: DCAEVConfig = dc_ae_v_f32t4_encoder_causal_decoder_chunk_causal_4(model_name, pretrained_path=None)
         cfg.compilable = compilable
+        cfg.encode_temporal_tile_size = chunk_duration
 
         # Instantiate model on meta device to avoid double allocation.
         with torch.device("meta"):
@@ -61,6 +70,7 @@ def __init__(
         self.model.to(dtype=torch.bfloat16)
 
         sync_model_states(self.model)
+        self.model.encoder = self.model.encoder.to(memory_format=torch.channels_last_3d)
         self.is_compiled = False
         self.use_streaming_encode = False
 
@@ -72,11 +82,52 @@ def compile_encode_for_cudagraphs(
         dynamic: bool = False,
         backend: str = "inductor",
     ) -> None:
-
-        self.model.encoder = self.model.encoder.to(memory_format=torch.channels_last_3d)
         self.model.encoder = torch.compile(self.model.encoder, fullgraph=True, mode=mode)
         self.is_compiled = True
 
+    @torch.inference_mode()
+    def compile_encode(
+        self,
+        warmup_resolutions: Sequence[str],
+        output_dir: str | None = None,
+        aspect_ratio: str | None = None,
+        backend: str | None = "inductor",
+        mode: str | None = "reduce-overhead",
+        fullgraph: bool = False,
+        dynamic: bool = False,
+    ) -> None:
+        """Compile the encoder and warm it up on every requested resolution / aspect ratio.
+
+        ``output_dir`` is not used by this method. Arguments are validated before the encoder is
+        compiled, so an invalid ``aspect_ratio`` no longer leaves the tokenizer marked as compiled.
+        """
+        if self.is_compiled:
+            log.warning("Tokenizer is already compiled, skipping compilation.")
+            return
+        if backend is None:
+            raise ValueError("backend must be provided")
+        if aspect_ratio is None:
+            aspect_ratios = list(VIDEO_RES_SIZE_INFO["256"].keys())
+        elif not isinstance(aspect_ratio, str):
+            raise ValueError(f"Aspect ratio {aspect_ratio} must be a string")
+        elif aspect_ratio not in VIDEO_RES_SIZE_INFO["256"]:
+            raise ValueError(f"Aspect ratio {aspect_ratio} not found in predefined aspect ratios")
+        else:
+            aspect_ratios = [aspect_ratio]
+
+        self.compile_encode_for_cudagraphs(mode=mode, fullgraph=fullgraph, dynamic=dynamic, backend=backend)
+        self.resolutions = warmup_resolutions
+        self.aspect_ratios = aspect_ratios
+
+        # Run the encoder twice per (resolution, aspect ratio) shape; the input is allocated directly on the GPU.
+        T = self.chunk_duration - self.model.cfg.num_pad_frames
+        for resolution in warmup_resolutions:
+            for ratio in aspect_ratios:
+                H, W = VIDEO_RES_SIZE_INFO[resolution][ratio]
+                log.info(f"Warming up {resolution} {ratio}")
+                for _ in range(2):
+                    self.model.encode(torch.randn(1, 3, T, H, W, device="cuda", dtype=torch.bfloat16))
+
     @property
     def dtype(self):
         return self.model.dtype
@@ -86,6 +137,12 @@ def reset_dtype(self):
 
     @torch.inference_mode()
     def encode(self, state: torch.Tensor) -> torch.Tensor:
+        if self.resolutions is not None:
+            for resolution in self.resolutions:
+                if tuple(state.shape[3:]) in VIDEO_RES_SIZE_INFO[resolution].values():
+                    break
+            else:
+                raise ValueError(f"State shape {state.shape[2:]} is not in {self.resolutions}")
         in_dtype = state.dtype
         tcf = self._temporal_compression_factor
         # Add padding to the sequence length to make it divisible by
diff --git a/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v.py b/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v.py
index 2d31cae..71daf06 100644
--- a/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v.py
+++ b/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v.py
@@ -688,7 +688,7 @@ def _visit(module: nn.Module) -> None:
                         w,
                         dtype=dtype,
                         device=device,
-                    )
+                    ).to(memory_format=torch.channels_last_3d)
                 )
         elif isinstance(module, ResBlock3d):
             _visit(module.conv1)
@@ -783,7 +783,8 @@ def temporal_tiled_encode(self, x: torch.Tensor) -> torch.Tensor:
 
         row = []
         for i in tqdm(range(0, x.shape[2], overlap_size), desc="Tiled Encode", disable=not self.cfg.verbose):
-            tile = x[:, :, i : i + tile_size, :, :]
+            # Clone is required for compiled tokenizer to avoid recompilation (view has different memory strides).
+            tile = x[:, :, i : i + tile_size, :, :].clone()
             actual_t = tile.shape[2]
             remove_padding = False
             if actual_t < tile_size and self.cfg.compilable:
@@ -897,7 +898,7 @@ def dc_ae_v_f32t4_encoder_causal_decoder_chunk_causal_4(
     elif name in [
         "dcae4x32x32_c64_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2",
     ]:
-        latent_channels, num_pad_frames, temporal_remainder, scaling_factor = 64, 7, 1, 0.7103
+        latent_channels, num_pad_frames, temporal_remainder, scaling_factor = 64, 7, 1, 0.5704
         encoder_width_list = [0, 64, 128, 512, 1024, 1024, 1024]
     elif name in [
         "dcae4x32x32_c96_t120_256p_fps_all_encoder_causal_decoder_chunk_causal_4_nogan_cosmos_pad_7_v0.2",
diff --git a/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v_ops.py b/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v_ops.py
index 4a96748..6f71d99 100644
--- a/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v_ops.py
+++ b/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v_ops.py
@@ -311,7 +311,6 @@ def forward(
             x = F.pad(x, self.custom_padding, mode=self.custom_padding_mode)
 
         if self.causal_chunk_length is not None:
-
             B, C, T, H, W = x.shape
             assert T % self.causal_chunk_length == 0
             assert self.conv.stride[0] == 1
diff --git a/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v_triton_rms_norm.py b/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v_triton_rms_norm.py
index 2e00cbc..8123e6c 100644
--- a/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v_triton_rms_norm.py
+++ b/cosmos_framework/model/vfm/tokenizers/dc_ae/dc_ae_v_triton_rms_norm.py
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-
 import torch
 import triton
 import triton.language as tl
diff --git a/cosmos_framework/model/vfm/tokenizers/flux_vae_8x8.py b/cosmos_framework/model/vfm/tokenizers/flux_vae_8x8.py
index 98a322a..3446646 100644
--- a/cosmos_framework/model/vfm/tokenizers/flux_vae_8x8.py
+++ b/cosmos_framework/model/vfm/tokenizers/flux_vae_8x8.py
@@ -376,8 +376,10 @@ def __init__(
         chunk_duration: int = 1,
         spatial_compression_factor: int = 8,
         temporal_compression_factor: int = 1,
+        causal: bool = True,
     ):
         super().__init__(object_store_credential_path_pretrained=object_store_credential_path_pretrained)
+        self._causal = causal
 
         # Load the Flux VAE model, passing backend_args for S3 support
         vae_path_full = f"s3://{bucket_name}/{vae_path}"
diff --git a/cosmos_framework/model/vfm/tokenizers/interface.py b/cosmos_framework/model/vfm/tokenizers/interface.py
index 639d241..3c023bc 100644
--- a/cosmos_framework/model/vfm/tokenizers/interface.py
+++ b/cosmos_framework/model/vfm/tokenizers/interface.py
@@ -52,6 +52,21 @@ def get_latent_num_frames(self, num_pixel_frames: int) -> int:
     def get_pixel_num_frames(self, num_latent_frames: int) -> int:
         pass
 
+    def get_latent_temporal_positions(
+        self,
+        num_pixel_frames: int,
+        resolution: str | None = None,
+        num_latent_frames: int | None = None,
+    ) -> torch.Tensor | None:
+        """Return per-latent temporal coordinates when the tokenizer has nonuniform time semantics.
+
+        The default ``None`` preserves legacy latent-index RoPE behavior. Tokenizers
+        with boundary or overlap latents can override this to expose one coordinate
+        per latent frame.
+        """
+        del num_pixel_frames, resolution, num_latent_frames  # unused by the base implementation
+        return None  # base class exposes no explicit coordinates
+
     @property
     @abstractmethod
     def spatial_compression_factor(self) -> int:
@@ -87,8 +102,12 @@ def compile_encode(
         warmup_resolutions: Sequence[str],
         output_dir: str,
         aspect_ratio: str | None = None,
+        backend: str | None = None,
+        mode: str | None = None,
+        fullgraph: bool | None = None,
+        dynamic: bool | None = None,
     ) -> None:
-        """AOT-compile the tokenizer for the given resolutions.
+        """Compile the tokenizer for the given resolutions.
 
         Subclasses that support AOT compilation should override this method.
         The default raises ``NotImplementedError``.
@@ -98,6 +117,11 @@ def compile_encode(
             output_dir: Root directory where compiled artifacts are stored
                 (typically ``config.job.path_local``).
             aspect_ratio: If given, only compile this single aspect ratio.
+            --- Only used if the tokenizer does not support AOT compilation ---
+            backend: Backend to use for compilation.
+            mode: Mode to use for compilation.
+            fullgraph: Whether the whole model must be captured as a single graph (no graph breaks).
+            dynamic: Whether to compile with dynamic-shape tracing instead of specializing on input shapes.
         """
         raise NotImplementedError(f"{type(self).__name__} does not support compilation")
 
@@ -106,8 +130,9 @@ def is_chunk_overlap(self):
         return False
 
     @property
-    def is_causal(self):
-        return True
+    def is_causal(self) -> bool:
+        # Subclasses set self._causal in their __init__ via the `causal` constructor argument.
+        return getattr(self, "_causal", True)
 
 
 class AudioTokenizerInterface(ABC):
diff --git a/cosmos_framework/model/vfm/tokenizers/tokenization_qwen2.py b/cosmos_framework/model/vfm/tokenizers/tokenization_qwen2.py
index c85f115..29c648c 100644
--- a/cosmos_framework/model/vfm/tokenizers/tokenization_qwen2.py
+++ b/cosmos_framework/model/vfm/tokenizers/tokenization_qwen2.py
@@ -1,5 +1,4 @@
-# Copyright 2024 The Qwen Team and The HuggingFace Inc. team.
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
 # Source Repository: https://github.com/ByteDance-Seed/Bagel
@@ -173,7 +172,7 @@ def __init__(
                     continue
                 bpe_merges.append(tuple(line.split()))
         self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-
+        # NOTE: the cache can grow without bound and will get really large for long running processes
         # (esp. for texts of language that do not use space between word, e.g. Chinese); technically
         # not a memory leak but appears as one.
         # GPT2Tokenizer has the same problem, so let's be consistent.
diff --git a/cosmos_framework/model/vfm/tokenizers/uniae/__init__.py b/cosmos_framework/model/vfm/tokenizers/uniae/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/tokenizers/uniae/__init__.py
+++ b/cosmos_framework/model/vfm/tokenizers/uniae/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/tokenizers/uniae/frame_math.py b/cosmos_framework/model/vfm/tokenizers/uniae/frame_math.py
new file mode 100644
index 0000000..5e9be83
--- /dev/null
+++ b/cosmos_framework/model/vfm/tokenizers/uniae/frame_math.py
@@ -0,0 +1,326 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+"""Shared UniAE noncausal temporal chunking math."""
+
+from collections.abc import Iterable, Mapping
+
+from cosmos_framework.utils.vfm.data_utils import get_vision_data_resolution
+
+DEFAULT_RESOLUTION_KEYS = ("256", "480")  # default for ``default_keys`` in normalize_resolution_int_mapping
+
+
+def normalize_resolution_int_mapping(
+    value: int | Mapping[str, int],
+    *,
+    name: str,  # label used only in the error messages below
+    default_keys: Iterable[str] = DEFAULT_RESOLUTION_KEYS,  # keys a scalar ``value`` is expanded to
+    required_keys: Iterable[str] | None = None,  # keys that must exist in the result, if given
+) -> dict[str, int]:
+    """Normalize a scalar or resolution-keyed integer config into a string-keyed ``dict[str, int]``."""
+    if isinstance(value, int):  # NOTE: bool is an int subclass, so True/False also take this branch
+        normalized = {str(resolution): int(value) for resolution in default_keys}
+    elif isinstance(value, Mapping):
+        normalized = {str(resolution): int(config_value) for resolution, config_value in value.items()}
+    else:
+        raise TypeError(f"{name} must be an int or a resolution-keyed mapping, got {type(value).__name__}.")
+
+    if not normalized:  # empty mapping, or a scalar combined with empty ``default_keys``
+        raise ValueError(f"{name} must not be empty.")
+    if required_keys is not None:
+        missing = set(required_keys) - set(normalized)
+        if missing:
+            raise ValueError(f"{name} is missing resolution keys {sorted(missing)}.")
+    return normalized
+
+
+def normalize_uniae_chunk_frames(
+    uniae_chunk_frames: int | Mapping[str, int] | None,
+    *,
+    pad_frames: int | None,
+    temporal_compression_factor: int,
+    missing_chunk_message: str = "uniae_chunk_frames must be provided when uniae_pad_frames is set",
+    missing_pad_message: str = "uniae_pad_frames must be provided when uniae_chunk_frames is set",
+    temporal_divisibility_name: str = "temporal_compression_factor",  # name shown in the divisibility error
+) -> int | dict[str, int] | None:
+    """Normalize and validate UniAE full chunk sizes; return ``None`` when chunk and pad frames are both ``None``."""
+    if uniae_chunk_frames is None:  # no chunk size given: pad_frames must be unset as well
+        if pad_frames is not None:
+            raise ValueError(missing_chunk_message)
+        return None
+
+    if pad_frames is None:
+        raise ValueError(missing_pad_message)
+    if pad_frames <= 0:
+        raise ValueError(f"uniae_pad_frames must be positive, got {pad_frames}.")
+
+    if isinstance(uniae_chunk_frames, Mapping):  # mapping form: stringify keys, coerce values to int
+        normalized = {str(resolution): int(chunk_frames) for resolution, chunk_frames in uniae_chunk_frames.items()}
+        if not normalized:
+            raise ValueError("uniae_chunk_frames mapping must not be empty")
+    else:
+        normalized = int(uniae_chunk_frames)
+
+    values = normalized.values() if isinstance(normalized, dict) else [normalized]  # validate both forms alike
+    for chunk_frames in values:
+        if chunk_frames <= 2 * pad_frames:  # padding on both sides must leave at least one frame
+            raise ValueError(
+                f"uniae_chunk_frames must be greater than 2 * uniae_pad_frames, got {chunk_frames=} and {pad_frames=}."
+            )
+        if chunk_frames % temporal_compression_factor != 0:
+            raise ValueError(
+                f"uniae_chunk_frames must be divisible by {temporal_divisibility_name}, "
+                f"got {chunk_frames=} and {temporal_compression_factor=}."
+            )
+    return normalized
+
+
+def get_uniae_chunk_frames(
+    uniae_chunk_frames: int | Mapping[str, int],
+    *,
+    resolution: str | None = None,
+    spatial_shape: tuple[int, int] | None = None,
+    target_resolution_key: str | None = None,
+    missing_resolution_message: str = (
+        "spatial_shape or target resolution must be provided for resolution-keyed UniAE chunks"
+    ),
+) -> int:
+    """Select a scalar UniAE full chunk size from a scalar or resolution-keyed config; raise ``ValueError`` if unresolved."""
+    if isinstance(uniae_chunk_frames, int):  # scalar config applies to every resolution
+        return uniae_chunk_frames
+
+    if target_resolution_key is not None:  # lookup precedence: target key, then resolution, then spatial shape
+        resolved_resolution = target_resolution_key
+    elif resolution is not None:
+        resolved_resolution = resolution
+    elif spatial_shape is not None:
+        resolved_resolution = get_vision_data_resolution(spatial_shape)
+    else:
+        chunk_values = {int(chunk_frames) for chunk_frames in uniae_chunk_frames.values()}
+        if len(chunk_values) == 1:  # every resolution shares one chunk size, so no key is needed
+            return next(iter(chunk_values))
+        raise ValueError(missing_resolution_message)
+
+    if resolved_resolution not in uniae_chunk_frames:
+        raise ValueError(
+            f"Resolution {resolved_resolution!r} not found in uniae_chunk_frames. "
+            f"Available resolutions: {list(uniae_chunk_frames.keys())}"
+        )
+    return int(uniae_chunk_frames[resolved_resolution])
+
+
+def get_uniae_latent_num_frames(
+    num_pixel_frames: int,
+    uniae_chunk_frames: int | Mapping[str, int],
+    *,
+    pad_frames: int,
+    temporal_compression_factor: int,
+    resolution: str | None = None,
+    spatial_shape: tuple[int, int] | None = None,
+    target_resolution_key: str | None = None,
+    missing_resolution_message: str = (
+        "spatial_shape or target resolution must be provided for resolution-keyed UniAE chunks"
+    ),
+    invalid_frame_message_prefix: str = "Video frame count is not valid for UniAE non-causal chunking",
+) -> int:
+    """Return UniAE latent frame count for first-frame-alone plus padded-tail chunking; raise ``ValueError`` if invalid."""
+    if num_pixel_frames < 1:
+        raise ValueError(f"num_pixel_frames must be positive, got {num_pixel_frames}.")
+    if num_pixel_frames == 1:  # a single frame yields a single latent; the chunk config is not consulted
+        return 1
+
+    full_chunk = get_uniae_chunk_frames(
+        uniae_chunk_frames,
+        resolution=resolution,
+        spatial_shape=spatial_shape,
+        target_resolution_key=target_resolution_key,
+        missing_resolution_message=missing_resolution_message,
+    )
+    _validate_full_chunk(full_chunk, pad_frames=pad_frames, temporal_compression_factor=temporal_compression_factor)
+
+    effective_chunk = full_chunk - 2 * pad_frames  # source frames covered by one full chunk
+    latents_per_full_chunk = full_chunk // temporal_compression_factor
+    remaining_frames = num_pixel_frames - 1  # the first frame is counted as its own latent
+    num_full_chunks = remaining_frames // effective_chunk
+    tail_frames = remaining_frames % effective_chunk
+    num_latent_frames = 1 + num_full_chunks * latents_per_full_chunk
+    if tail_frames == 0:
+        return num_latent_frames
+
+    padded_tail_frames = tail_frames + 2 * pad_frames  # the tail chunk is padded on both sides as well
+    if padded_tail_frames % temporal_compression_factor != 0:
+        raise ValueError(
+            f"{invalid_frame_message_prefix}: "
+            f"got {num_pixel_frames=}, {full_chunk=}, {pad_frames=}, {temporal_compression_factor=}."
+        )
+    return num_latent_frames + padded_tail_frames // temporal_compression_factor
+
+
+def get_uniae_pixel_num_frames(
+    num_latent_frames: int,
+    uniae_chunk_frames: int | Mapping[str, int],
+    *,
+    pad_frames: int,
+    temporal_compression_factor: int,
+    resolution: str | None = None,
+    spatial_shape: tuple[int, int] | None = None,
+    target_resolution_key: str | None = None,
+    missing_resolution_message: str = (
+        "spatial_shape or target resolution must be provided for resolution-keyed UniAE chunks"
+    ),
+) -> int:
+    """Return pixel frame count represented by a valid UniAE latent frame count; raise ``ValueError`` otherwise."""
+    if num_latent_frames < 1:
+        raise ValueError(f"num_latent_frames must be positive, got {num_latent_frames}.")
+    if num_latent_frames == 1:  # a single latent maps back to a single frame; the chunk config is not consulted
+        return 1
+
+    full_chunk = get_uniae_chunk_frames(
+        uniae_chunk_frames,
+        resolution=resolution,
+        spatial_shape=spatial_shape,
+        target_resolution_key=target_resolution_key,
+        missing_resolution_message=missing_resolution_message,
+    )
+    _validate_full_chunk(full_chunk, pad_frames=pad_frames, temporal_compression_factor=temporal_compression_factor)
+
+    effective_chunk = full_chunk - 2 * pad_frames  # source frames covered by one full chunk
+    latents_per_full_chunk = full_chunk // temporal_compression_factor
+    remaining_latents = num_latent_frames - 1  # the first latent stands for the first frame alone
+    num_full_chunks = remaining_latents // latents_per_full_chunk
+    tail_latents = remaining_latents % latents_per_full_chunk
+    num_pixel_frames = 1 + num_full_chunks * effective_chunk
+    if tail_latents == 0:
+        return num_pixel_frames
+
+    tail_frames = tail_latents * temporal_compression_factor - 2 * pad_frames  # strip padding from the tail chunk
+    if tail_frames <= 0:
+        raise ValueError(
+            "UniAE latent count does not map to a positive noncausal tail: "
+            f"got {num_latent_frames=}, {full_chunk=}, {pad_frames=}, {temporal_compression_factor=}."
+        )
+    return num_pixel_frames + tail_frames
+
+
+def get_uniae_latent_temporal_positions(
+    num_pixel_frames: int,
+    uniae_chunk_frames: int | Mapping[str, int],
+    *,
+    pad_frames: int,
+    temporal_compression_factor: int,
+    resolution: str | None = None,
+    spatial_shape: tuple[int, int] | None = None,
+    target_resolution_key: str | None = None,
+    missing_resolution_message: str = (
+        "spatial_shape or target resolution must be provided for resolution-keyed UniAE chunks"
+    ),
+    num_latent_frames: int | None = None,  # optional expected count, cross-checked before returning
+) -> list[float]:
+    """Return one temporal coordinate per UniAE latent, in source-frame index / temporal_compression_factor units."""
+    if num_pixel_frames < 1:
+        raise ValueError(f"num_pixel_frames must be positive, got {num_pixel_frames}.")
+    if num_pixel_frames == 1:
+        temporal_positions = [0.0]
+    else:
+        full_chunk = get_uniae_chunk_frames(
+            uniae_chunk_frames,
+            resolution=resolution,
+            spatial_shape=spatial_shape,
+            target_resolution_key=target_resolution_key,
+            missing_resolution_message=missing_resolution_message,
+        )
+        _validate_full_chunk(full_chunk, pad_frames=pad_frames, temporal_compression_factor=temporal_compression_factor)
+
+        effective_chunk = full_chunk - 2 * pad_frames
+        temporal_positions = [0.0]  # frame 0 is its own latent at position 0
+        source_start = 1
+        while source_start < num_pixel_frames:
+            source_end = min(source_start + effective_chunk, num_pixel_frames)  # exclusive end; last chunk may be short
+            chunk_source_frames = (  # source indices of the chunk, with each edge frame repeated pad_frames times
+                [source_start] * pad_frames + list(range(source_start, source_end)) + [source_end - 1] * pad_frames
+            )
+            if len(chunk_source_frames) % temporal_compression_factor != 0:
+                raise ValueError(
+                    "UniAE frame count is not valid for noncausal chunking: "
+                    f"got {num_pixel_frames=}, {full_chunk=}, {pad_frames=}, {temporal_compression_factor=}."
+                )
+            temporal_positions.extend(  # one position per group: source index of the group's last frame / tcf
+                chunk_source_frames[i + temporal_compression_factor - 1] / temporal_compression_factor
+                for i in range(0, len(chunk_source_frames), temporal_compression_factor)
+            )
+            source_start = source_end
+
+    expected_latent_frames = get_uniae_latent_num_frames(  # independent count used as a consistency check
+        num_pixel_frames,
+        uniae_chunk_frames,
+        pad_frames=pad_frames,
+        temporal_compression_factor=temporal_compression_factor,
+        resolution=resolution,
+        spatial_shape=spatial_shape,
+        target_resolution_key=target_resolution_key,
+        missing_resolution_message=missing_resolution_message,
+        invalid_frame_message_prefix="UniAE frame count is not valid for noncausal chunking",
+    )
+    if num_latent_frames is not None and num_latent_frames != expected_latent_frames:
+        raise ValueError(
+            "UniAE latent temporal position count does not match encoded latent frames: "
+            f"got {num_latent_frames=}, expected {expected_latent_frames} for {num_pixel_frames=}."
+        )
+    if len(temporal_positions) != expected_latent_frames:  # guards against the two helpers drifting apart
+        raise ValueError(
+            "UniAE latent temporal position helper produced an inconsistent count: "
+            f"got {len(temporal_positions)}, expected {expected_latent_frames}."
+        )
+    return temporal_positions
+
+
+def align_uniae_num_video_frames(
+    num_video_frames: int,
+    uniae_chunk_frames: int | Mapping[str, int],
+    *,
+    pad_frames: int,
+    temporal_compression_factor: int,
+    resolution: str | None = None,
+    spatial_shape: tuple[int, int] | None = None,
+    target_resolution_key: str | None = None,
+    missing_resolution_message: str = (
+        "spatial_shape or target resolution must be provided for resolution-keyed UniAE chunks"
+    ),
+) -> int:
+    """Trim a video frame count down to the nearest valid UniAE noncausal count (0 for non-positive input)."""
+    if num_video_frames < 1:
+        return 0
+
+    full_chunk = get_uniae_chunk_frames(
+        uniae_chunk_frames,
+        resolution=resolution,
+        spatial_shape=spatial_shape,
+        target_resolution_key=target_resolution_key,
+        missing_resolution_message=missing_resolution_message,
+    )
+    _validate_full_chunk(full_chunk, pad_frames=pad_frames, temporal_compression_factor=temporal_compression_factor)
+
+    effective_chunk = full_chunk - 2 * pad_frames
+    target_r = (-2 * pad_frames) % temporal_compression_factor  # tail residue making tail + 2 * pad divisible by tcf
+    remainder = (num_video_frames - 1) % effective_chunk  # tail frames left after the first frame and full chunks
+    if remainder != 0 and remainder % temporal_compression_factor != target_r:
+        delta = (remainder - target_r) % temporal_compression_factor  # frames to drop to reach the target residue
+        if remainder - delta < 0:  # tail is too short to reach the target residue: drop the whole tail
+            delta = remainder
+        num_video_frames -= delta
+    return num_video_frames
+
+
+def _validate_full_chunk(
+    full_chunk: int,
+    *,
+    pad_frames: int,
+    temporal_compression_factor: int,
+) -> None:  # raises ValueError when the chunk size is unusable; returns nothing otherwise
+    if full_chunk % temporal_compression_factor != 0:  # a full chunk must hold a whole number of latents
+        raise ValueError(
+            "full_chunk must be divisible by temporal compression factor, "
+            f"got {full_chunk=} and {temporal_compression_factor=}."
+        )
+    if full_chunk <= 2 * pad_frames:  # padding on both sides must leave at least one frame
+        raise ValueError(f"full_chunk must be greater than 2 * pad_frames, got {full_chunk=} and {pad_frames=}.")
diff --git a/cosmos_framework/model/vfm/tokenizers/uniae/noncausal_4x16x16.py b/cosmos_framework/model/vfm/tokenizers/uniae/noncausal_4x16x16.py
index 7c1ba17..57b168e 100644
--- a/cosmos_framework/model/vfm/tokenizers/uniae/noncausal_4x16x16.py
+++ b/cosmos_framework/model/vfm/tokenizers/uniae/noncausal_4x16x16.py
@@ -1,9 +1,9 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-"""UniAE S1 tokenizer wrapper for diffusion training (4x16x16 compression).
+"""UniAE S3 tokenizer wrapper for diffusion training (4x16x16 compression).
 
-Wraps the UniAE sparse autoencoder with DenseAutoencoderRuntime (SDPA compiled)
+Wraps the UniAE sparse autoencoder with DenseAutoencoderRuntime (batched backend)
 to provide a VideoTokenizerInterface compatible with diffusion model training.
 
 Usage:
@@ -13,10 +13,12 @@
         vae_pth="s3://bucket0/pretrained/tokenizers/video/cosmos/...",
         object_store_credential_path_pretrained="credentials/gcp_checkpoint.secret",
     )
-    latents = vae.encode(video)   # [B, 3, T, H, W] -> [B, 48, T//4, H//16, W//16]
-    recon = vae.decode(latents)   # [B, 48, T//4, H//16, W//16] -> [B, 3, T, H, W]
+    latents = vae.encode(video)   # [B, 3, T, H, W] -> [B, 48, ceil(T/4), H//16, W//16]
+    recon = vae.decode(latents)   # [B, 48, T_p, H//16, W//16] -> [B, 3, 4*T_p, H, W]
 """
 
+from collections.abc import Mapping, Sequence
+
 import torch
 
 from cosmos_framework.utils import log
@@ -25,32 +27,38 @@
 from cosmos_framework.model.tokenizer.models.dense_runtime import DenseAutoencoderRuntime
 from cosmos_framework.model.tokenizer.models.sparse_autoencoder import AutoencoderKL
 from cosmos_framework.model.vfm.tokenizers.interface import VideoTokenizerInterface
+from cosmos_framework.model.vfm.tokenizers.uniae.frame_math import (
+    get_uniae_latent_num_frames,
+    get_uniae_latent_temporal_positions,
+    get_uniae_pixel_num_frames,
+    normalize_resolution_int_mapping,
+)
+from cosmos_framework.utils.vfm.data_utils import get_vision_data_resolution
 
-# S1 architecture config (avoids importing cosmos_framework/configs/base which pulls in loss deps)
-_S1_ARCH = dict(
+# S3 architecture config (avoids importing configs/base which pulls in loss deps)
+_S3_ARCH = dict(
     patch_size=(4, 16, 16),
     in_channels=3072,
     out_channels=3072,
+    # Encoder
     encoder_model_channels=1152,
     encoder_num_blocks=27,
     encoder_num_heads=16,
     encoder_mlp_channels=4304,
-    encoder_attn_mode="full",
-    encoder_window_size=None,
     encoder_pe_mode="joint",
     encoder_qk_rms_norm=False,
     encoder_use_bias=True,
     encoder_use_rms_norm=False,
+    # Decoder
     decoder_model_channels=1152,
     decoder_num_blocks=27,
     decoder_num_heads=16,
     decoder_mlp_channels=4304,
-    decoder_attn_mode="full",
-    decoder_window_size=None,
     decoder_pe_mode="joint",
     decoder_qk_rms_norm=True,
     decoder_use_bias=False,
     decoder_use_rms_norm=True,
+    # Common settings
     use_decoder=True,
     quantizer_type="rq",
     quantizer_codebook_size=65536,
@@ -66,17 +74,20 @@
     inference_kv_cache_size=0,
     use_quantizer=False,
     use_dual_latent=False,
-    use_text_alignment=True,
-    use_post_text_alignment=True,
+    use_text_alignment=False,
+    use_post_text_alignment=False,
 )
 
 
 class UniAEVAE:
-    """UniAE S1 VAE wrapper for diffusion training.
+    """UniAE S3 VAE wrapper for diffusion training.
 
     Loads the UniAE sparse autoencoder checkpoint, wraps it with
-    DenseAutoencoderRuntime (SDPA backend for compile-friendly inference),
+    DenseAutoencoderRuntime (batched backend for compile-friendly inference),
     and provides encode/decode in the standard [B, C, T, H, W] format.
+
+    Latents are normalized per-channel using statistics computed from 10K images
+    and 10K videos: ``normalized = (latent - mean) / std``.
     """
 
     def __init__(
@@ -86,30 +97,109 @@ def __init__(
         object_store_credential_path_pretrained: str = "",
         dtype: torch.dtype = torch.bfloat16,
         device: str = "cuda",
-        backend: str = "sdpa",
+        backend: str = "batched",
+        pad_frames: int = 1,
+        pixel_trim: bool = True,
+        chunk_size: int | Mapping[str, int] = 16,
+        encode_chunk_batch_size: int | Mapping[str, int] = 1,
     ):
+        chunk_size = normalize_resolution_int_mapping(chunk_size, name="chunk_size")
+        if any(chunk_frames % 4 != 0 for chunk_frames in chunk_size.values()):
+            raise ValueError("chunk_size values must be multiples of 4.")
+        if any(chunk_frames <= 2 * pad_frames for chunk_frames in chunk_size.values()):
+            raise ValueError(
+                f"chunk_size values must be greater than 2 * pad_frames, got {chunk_size=} and {pad_frames=}."
+            )
+        encode_chunk_batch_size = normalize_resolution_int_mapping(
+            encode_chunk_batch_size,
+            name="encode_chunk_batch_size",
+            default_keys=chunk_size.keys(),
+            required_keys=chunk_size.keys(),
+        )
+        if any(batch_size < 1 for batch_size in encode_chunk_batch_size.values()):
+            raise ValueError("encode_chunk_batch_size values must be >= 1.")
+        self.chunk_size = chunk_size
+        self.encode_chunk_batch_size = encode_chunk_batch_size
         self.dtype = dtype
         self.device = device
         self.z_dim = z_dim
+        self._pad_frames = pad_frames
+        self._pixel_trim = pixel_trim
         self._spatial_compression_factor = 16
         self._temporal_compression_factor = 4
 
+        # Per-channel latent normalization stats — loaded from a file paired with the
+        # tokenizer checkpoint: <ckpt_stem>_latent_norm.pt (same directory, same bucket).
+        # Storing stats alongside the checkpoint prevents silent divergence when the
+        # checkpoint is updated.
+        if not vae_pth:
+            raise ValueError("vae_pth must be provided to load latent normalization stats")
+        vae_pth_str = str(vae_pth)
+        # Derive stats path: strip .pt suffix, append _latent_norm.pt
+        if vae_pth_str.endswith(".pt"):
+            norm_pth = vae_pth_str[:-3] + "_latent_norm.pt"
+        else:
+            norm_pth = vae_pth_str + "_latent_norm.pt"
+        if norm_pth.startswith("s3://"):
+            norm_backend_args = {
+                "backend": "s3",
+                "s3_credential_path": object_store_credential_path_pretrained,
+            }
+        else:
+            norm_backend_args = None
+        norm_stats = easy_io.load(norm_pth, backend_args=norm_backend_args, map_location="cpu", weights_only=False)
+        mean = norm_stats["mean"].to(dtype=dtype, device=device)
+        std = norm_stats["std"].to(dtype=dtype, device=device)
+        self._latent_mean = mean.view(1, z_dim, 1, 1, 1)
+        self._latent_inv_std = (1.0 / std).view(1, z_dim, 1, 1, 1)
+
         # make compatible with meta device
         autoencoder = AutoencoderKL(
-            **_S1_ARCH,
+            **_S3_ARCH,
             latent_channels=z_dim,
             quantizer_feature_dim=z_dim,
         )
-
         autoencoder.eval()
         autoencoder.to(device=device, dtype=dtype)
 
         # Load checkpoint
+        if vae_pth and get_rank() == 0:
+            if str(vae_pth).startswith("s3://"):
+                backend_args = {"backend": "s3", "s3_credential_path": object_store_credential_path_pretrained}
+            else:
+                backend_args = None
+            state_dict = easy_io.load(vae_pth, backend_args=backend_args, map_location="cpu", weights_only=False)
+            if "model" in state_dict:
+                model_state = state_dict["model"]
+            elif "state_dict" in state_dict:
+                model_state = state_dict["state_dict"]
+            else:
+                model_state = state_dict
+            # Checkpoint may be saved from a wrapper with a 'network.' prefix — strip it.
+            if any(k.startswith("network.") for k in model_state):
+                model_state = {
+                    k[len("network.") :] if k.startswith("network.") else k: v for k, v in model_state.items()
+                }
+            missing, unexpected = autoencoder.load_state_dict(model_state, strict=False)
+            if missing:
+                log.warning(f"Missing keys: {len(missing)} (e.g., {missing[:3]})")
+            if unexpected:
+                log.warning(f"Unexpected keys: {len(unexpected)} (e.g., {unexpected[:3]})")
+            log.info(f"Loaded checkpoint from {vae_pth}")
+        elif vae_pth:
+            autoencoder.to_empty(device=device)
         if vae_pth:
-            self._load_checkpoint(autoencoder, vae_pth, object_store_credential_path_pretrained, device)
+            sync_model_states(autoencoder)
 
         # Wrap with dense runtime for fast inference
-        self.dense_runtime = DenseAutoencoderRuntime.from_autoencoder(autoencoder, backend=backend)
+        self.dense_runtime = DenseAutoencoderRuntime.from_autoencoder(
+            autoencoder,
+            backend=backend,
+            pad_frames=self._pad_frames,
+            pixel_trim=self._pixel_trim,
+            # Pass the smallest chunk size so the runtime can verify that padding does not exceed it.
+            chunk_size=min(chunk_size.values()),
+        )
         self.dense_runtime.eval()
 
         # Freeze all parameters
@@ -117,59 +207,9 @@ def __init__(
             param.requires_grad = False
 
         log.info(
-            f"UniAE S1 loaded: {self.count_param() / 1e6:.1f}M params, "
-            f"backend={backend}, dtype={dtype}, device={device}"
+            f"UniAE loaded: {self.count_param() / 1e6:.1f}M params, backend={backend}, dtype={dtype}, device={device}"
         )
 
-    def _load_checkpoint(self, model, pretrained_path, credential_path, device):
-        """Load checkpoint from local path or S3."""
-        if get_rank() == 0:
-            if pretrained_path.startswith("s3://"):
-                backend_args = {
-                    "backend": "s3",
-                    "s3_credential_path": credential_path,
-                }
-            else:
-                backend_args = None
-
-            ckpt = easy_io.load(
-                pretrained_path,
-                backend_args=backend_args,
-                map_location=device,
-            )
-
-            # Handle different checkpoint formats
-            if isinstance(ckpt, dict):
-                if "model" in ckpt:
-                    state_dict = ckpt["model"]
-                elif "state_dict" in ckpt:
-                    state_dict = ckpt["state_dict"]
-                elif "network" in ckpt:
-                    state_dict = ckpt["network"]
-                else:
-                    state_dict = ckpt
-            else:
-                state_dict = ckpt
-
-            # Strip common prefixes
-            cleaned = {}
-            for k, v in state_dict.items():
-                for prefix in ["network.", "module.", "model."]:
-                    if k.startswith(prefix):
-                        k = k[len(prefix) :]
-                cleaned[k] = v
-
-            missing, unexpected = model.load_state_dict(cleaned, strict=False)
-            if missing:
-                log.warning(f"Missing keys: {len(missing)} (e.g., {missing[:3]})")
-            if unexpected:
-                log.warning(f"Unexpected keys: {len(unexpected)} (e.g., {unexpected[:3]})")
-            log.info(f"Loaded checkpoint from {pretrained_path}")
-        else:
-            model.to_empty(device=device)
-
-        sync_model_states(model)
-
     def count_param(self) -> int:
         return sum(p.numel() for p in self.dense_runtime.parameters())
 
@@ -177,45 +217,58 @@ def count_param(self) -> int:
     def encode(self, video: torch.Tensor) -> torch.Tensor:
         """Encode image or video to latent space.
 
-        For images (T=1 or 4D input), the input is repeated to 4 frames
-        since the non-causal tokenizer requires a full temporal patch.
+        Boundary padding and latent trimming are handled by DenseAutoencoderRuntime
+        via pad_frames and pixel_trim. Non-image inputs use UniAE's noncausal
+        first-frame-alone chunking: the first source frame forms its own latent
+        and the remaining frames are encoded in resolution-specific chunks with
+        pad_frames replicated on both sides.
 
         Args:
-            video: [B, 3, T, H, W] or [B, 3, H, W] (image) in range [-1, 1]
+            video: [B, 3, T, H, W] or [B, 3, H, W] (image) in range [-1, 1].
+                   For videos, (T - 1) must either fill whole content chunks or
+                   leave a tail whose frame count plus 2 * pad_frames is divisible
+                   by the temporal compression factor.
 
         Returns:
-            latent: [B, z_dim, T//4, H//16, W//16]
-                    For single-image input, T//4 = 1.
+            latent: [B, z_dim, ceil(T/4), H//16, W//16]
+                    For single-image input, ceil(T/4) = 1.
         """
-        # Handle image input: [B, C, H, W] -> [B, C, 4, H, W]
-        is_image = video.ndim == 4
-        if is_image:
+        # Handle image input: [B, C, H, W] -> [B, C, 1, H, W].
+        # Do NOT expand here — pass 1 frame so encode_moments detects is_image=True
+        # and handles the temporal padding internally without noncausal chunking.
+        if video.ndim == 4:
             video = video.unsqueeze(2)
-            video = torch.nn.functional.pad(
-                video, (0, 0, 0, 0, 0, self._temporal_compression_factor - 1), mode="constant", value=0.0
-            )
 
         B, C, T, H, W = video.shape
-        tc = self._temporal_compression_factor
-
-        # For non-causal tokenizer, repeat last frame to fill last temporal patch
-        if T % tc != 0:
-            pad_t = tc - T % tc
-            last_frame = video[:, :, -1:].expand(-1, -1, pad_t, -1, -1)
-            video = torch.cat([video, last_frame], dim=2)
-            T = T + pad_t
 
+        res_key = get_vision_data_resolution((H, W))
+        if res_key not in self.chunk_size:
+            raise ValueError(
+                f"Unsupported resolution key '{res_key}' for input shape ({H}, {W}). "
+                f"Supported keys: {list(self.chunk_size.keys())}"
+            )
+        full_chunk_size = self.chunk_size[res_key]
+        chunk_size = full_chunk_size - 2 * self._pad_frames
+        encode_chunk_batch_size = self.encode_chunk_batch_size[res_key]
         # Convert to channels-last [B, T, H, W, C] for dense runtime
         video_cl = video.permute(0, 2, 3, 4, 1).contiguous().to(dtype=self.dtype)
 
-        # Encode: returns [B, T_p, H_p, W_p, 2*z_dim] moments
-        moments = self.dense_runtime.encode(video_cl, sample_posterior=False)
-
-        # Take mean (first half of channels) for deterministic encoding
-        mean, logvar = moments.chunk(2, dim=-1)
+        # Encode with UniAE's content chunk size; dense_runtime adds pad_frames at
+        # noncausal chunk boundaries and trims boundary latents internally.
+        # Returns [B, T_p, H_p, W_p, 2*z_dim].
+        moments = self.dense_runtime.encode(
+            video_cl,
+            sample_posterior=False,
+            chunk_raw_frames=chunk_size,
+            encode_chunk_batch_size=encode_chunk_batch_size,
+        )
 
-        # Convert to [B, z_dim, T_p, H_p, W_p]
-        return mean.permute(0, 4, 1, 2, 3).contiguous()
+        # Take mean for deterministic encoding; convert to [B, z_dim, T_p, H_p, W_p]
+        mean, _ = moments.chunk(2, dim=-1)
+        latent = mean.permute(0, 4, 1, 2, 3).contiguous()
+        # Normalize per-channel: (z - mean) * inv_std
+        latent = (latent - self._latent_mean) * self._latent_inv_std
+        return latent
 
     @torch.inference_mode()
     def decode(self, latent: torch.Tensor) -> torch.Tensor:
@@ -227,21 +280,80 @@ def decode(self, latent: torch.Tensor) -> torch.Tensor:
         Returns:
             video: [B, 3, T, H, W] in range [-1, 1]
         """
+        # Denormalize per-channel: z / inv_std + mean
+        latent = latent / self._latent_inv_std + self._latent_mean
         # Convert to channels-last [B, T_p, H_p, W_p, z_dim]
         latent_cl = latent.permute(0, 2, 3, 4, 1).contiguous().to(dtype=self.dtype)
 
-        # Decode: returns [B, T, H, W, C] channels-last
-        decoded = self.dense_runtime.decode(latent_cl)
+        # Use the resolution-specific encoder chunk size so each chunk is decoded
+        # independently with correct boundary trimming.  Decoding all latents at once
+        # would apply trim only at the outer edges, producing wrong pixel counts for
+        # multi-chunk videos.  Derive resolution from latent spatial dims.
+        _, _, H_p, W_p = latent.shape[1:]
+        res_key = get_vision_data_resolution(
+            (H_p * self._spatial_compression_factor, W_p * self._spatial_compression_factor)
+        )
+        if res_key not in self.chunk_size:
+            raise ValueError(
+                f"Unsupported resolution key '{res_key}' for latent shape ({H_p}, {W_p}). "
+                f"Supported keys: {list(self.chunk_size.keys())}"
+            )
+        chunk_raw_frames = self.chunk_size[res_key]
+        decoded = self.dense_runtime.decode(latent_cl, chunk_raw_frames=chunk_raw_frames)
 
         # Convert to [B, C, T, H, W] and clamp
         video = decoded.permute(0, 4, 1, 2, 3).contiguous()
         return video.clamp(-1, 1).float()
 
-    def get_latent_num_frames(self, num_pixel_frames: int) -> int:
-        return num_pixel_frames // self._temporal_compression_factor
+    def get_latent_num_frames(self, num_pixel_frames: int, resolution: str | None = None) -> int:
+        return get_uniae_latent_num_frames(
+            num_pixel_frames,
+            self.chunk_size,
+            pad_frames=self._pad_frames,
+            temporal_compression_factor=self._temporal_compression_factor,
+            resolution=resolution,
+            missing_resolution_message=(
+                f"resolution must be provided when UniAE uses mixed encode_chunk_frames; got chunk_size={self.chunk_size}."
+            ),
+            invalid_frame_message_prefix="UniAE frame count is not valid for noncausal chunking",
+        )
+
+    def get_pixel_num_frames(self, num_latent_frames: int, resolution: str | None = None) -> int:
+        return get_uniae_pixel_num_frames(
+            num_latent_frames,
+            self.chunk_size,
+            pad_frames=self._pad_frames,
+            temporal_compression_factor=self._temporal_compression_factor,
+            resolution=resolution,
+            missing_resolution_message=(
+                f"resolution must be provided when UniAE uses mixed encode_chunk_frames; got chunk_size={self.chunk_size}."
+            ),
+        )
 
-    def get_pixel_num_frames(self, num_latent_frames: int) -> int:
-        return num_latent_frames * self._temporal_compression_factor
+    def get_latent_temporal_positions(
+        self,
+        num_pixel_frames: int,
+        resolution: str | None = None,
+        num_latent_frames: int | None = None,
+    ) -> torch.Tensor:
+        """Return UniAE latent temporal coordinates in source-frame / tcf units.
+
+        UniAE keeps noncausal padded boundary latents. Those latents should not be
+        assigned uniformly increasing temporal IDs, because each latent summarizes
+        the right edge of its padded temporal patch.
+        """
+        positions = get_uniae_latent_temporal_positions(
+            num_pixel_frames,
+            self.chunk_size,
+            pad_frames=self._pad_frames,
+            temporal_compression_factor=self._temporal_compression_factor,
+            resolution=resolution,
+            missing_resolution_message=(
+                f"resolution must be provided when UniAE uses mixed encode_chunk_frames; got chunk_size={self.chunk_size}."
+            ),
+            num_latent_frames=num_latent_frames,
+        )
+        return torch.tensor(positions, dtype=torch.float32)  # [T_latent]
 
 
 class UniAEVAEInterface(VideoTokenizerInterface):
@@ -252,14 +364,39 @@ def __init__(
         bucket_name: str = "",
         object_store_credential_path_pretrained: str = "",
         vae_path: str = "",
-        encode_chunk_frames: int = 16,
+        encode_chunk_frames: int | Mapping[str, int] = 16,
+        encode_chunk_batch_size: int | Mapping[str, int] = 1,
         spatial_compression_factor: int = 16,
         temporal_compression_factor: int = 4,
+        pad_frames: int = 0,
+        pixel_trim: bool = True,
+        backend: str = "batched_with_padding",
+        causal: bool = False,
     ):
         super().__init__(object_store_credential_path_pretrained)
+        self._causal = causal
+        assert not self._causal, "UniAEVAEInterface is a non-causal tokenizer; causal must be False."
         self._spatial_compression_factor = spatial_compression_factor
         self._temporal_compression_factor = temporal_compression_factor
+        encode_chunk_frames = normalize_resolution_int_mapping(encode_chunk_frames, name="encode_chunk_frames")
+        if any(chunk_frames % temporal_compression_factor != 0 for chunk_frames in encode_chunk_frames.values()):
+            raise ValueError("encode_chunk_frames values must be multiples of temporal_compression_factor.")
+        if any(chunk_frames <= 2 * pad_frames for chunk_frames in encode_chunk_frames.values()):
+            raise ValueError(
+                f"encode_chunk_frames values must be greater than 2 * pad_frames, "
+                f"got {encode_chunk_frames=} and {pad_frames=}."
+            )
         self.encode_chunk_frames = encode_chunk_frames
+        encode_chunk_batch_size = normalize_resolution_int_mapping(
+            encode_chunk_batch_size,
+            name="encode_chunk_batch_size",
+            default_keys=encode_chunk_frames.keys(),
+            required_keys=encode_chunk_frames.keys(),
+        )
+        if any(batch_size < 1 for batch_size in encode_chunk_batch_size.values()):
+            raise ValueError("encode_chunk_batch_size values must be >= 1.")
+        self.encode_chunk_batch_size = encode_chunk_batch_size
+        # Unused attribute; always set to False here.
         self.use_streaming_encode = False
 
         vae_full_path = vae_path
@@ -269,7 +406,13 @@ def __init__(
         self.vae = UniAEVAE(
             vae_pth=vae_full_path,
             object_store_credential_path_pretrained=object_store_credential_path_pretrained,
+            pad_frames=pad_frames,
+            pixel_trim=pixel_trim,
+            backend=backend,
+            chunk_size=self.encode_chunk_frames,
+            encode_chunk_batch_size=self.encode_chunk_batch_size,
         )
+        self.is_compiled = False
 
     def reset_dtype(self):
         pass
@@ -295,15 +438,49 @@ def compile_encode_for_cudagraphs(
         self.vae.dense_runtime._encode_chunk_core = torch.compile(
             self.vae.dense_runtime._encode_chunk_core, **compile_kwargs
         )
+        self.is_compiled = True
+
+    @torch.inference_mode()
+    def compile_encode(
+        self,
+        warmup_resolutions: Sequence[str],
+        output_dir: str | None = None,
+        aspect_ratio: str | None = None,
+        backend: str | None = "inductor",
+        mode: str | None = "reduce-overhead",
+        fullgraph: bool = False,
+        dynamic: bool = False,
+    ) -> None:
+        """Compile encode; warmup_resolutions, output_dir and aspect_ratio are accepted but unused."""
+        if self.is_compiled:
+            log.warning("Tokenizer is already compiled, skipping compilation.")
+            return
+
+        if backend is None:
+            raise ValueError("backend must be provided")
+
+        self.compile_encode_for_cudagraphs(mode=mode, fullgraph=fullgraph, dynamic=dynamic, backend=backend)
 
     def decode(self, latent: torch.Tensor) -> torch.Tensor:
         return self.vae.decode(latent)
 
-    def get_latent_num_frames(self, num_pixel_frames: int) -> int:
-        return self.vae.get_latent_num_frames(num_pixel_frames)
+    def get_latent_num_frames(self, num_pixel_frames: int, resolution: str | None = None) -> int:
+        return self.vae.get_latent_num_frames(num_pixel_frames, resolution=resolution)
 
-    def get_pixel_num_frames(self, num_latent_frames: int) -> int:
-        return self.vae.get_pixel_num_frames(num_latent_frames)
+    def get_pixel_num_frames(self, num_latent_frames: int, resolution: str | None = None) -> int:
+        return self.vae.get_pixel_num_frames(num_latent_frames, resolution=resolution)
+
+    def get_latent_temporal_positions(
+        self,
+        num_pixel_frames: int,
+        resolution: str | None = None,
+        num_latent_frames: int | None = None,
+    ) -> torch.Tensor:
+        return self.vae.get_latent_temporal_positions(
+            num_pixel_frames=num_pixel_frames,
+            resolution=resolution,
+            num_latent_frames=num_latent_frames,
+        )
 
     @property
     def spatial_compression_factor(self):
@@ -315,20 +492,25 @@ def temporal_compression_factor(self):
 
     @property
     def spatial_resolution(self):
-        return 512
+        raise NotImplementedError(
+            "spatial_resolution is deprecated for UniAEVAEInterface (resolution is input-dependent). "
+            "Will be removed in a future MR."
+        )
 
     @property
     def pixel_chunk_duration(self):
-        return self.encode_chunk_frames
+        raise NotImplementedError(
+            "pixel_chunk_duration is deprecated for UniAEVAEInterface (chunk size is resolution-dependent). "
+            "Use encode_chunk_frames[res_key] directly. Will be removed in a future MR."
+        )
 
     @property
     def latent_chunk_duration(self):
-        return self.encode_chunk_frames // self._temporal_compression_factor
+        raise NotImplementedError(
+            "latent_chunk_duration is deprecated for UniAEVAEInterface (chunk size is resolution-dependent). "
+            "Use encode_chunk_frames[res_key] // temporal_compression_factor. Will be removed in a future MR."
+        )
 
     @property
     def latent_ch(self) -> int:
-        return 48
-
-    @property
-    def is_causal(self):
-        return False
+        return self.vae.z_dim
diff --git a/cosmos_framework/model/vfm/tokenizers/wan2pt1_vae_4x8x8.py b/cosmos_framework/model/vfm/tokenizers/wan2pt1_vae_4x8x8.py
index d838bd7..542c90f 100644
--- a/cosmos_framework/model/vfm/tokenizers/wan2pt1_vae_4x8x8.py
+++ b/cosmos_framework/model/vfm/tokenizers/wan2pt1_vae_4x8x8.py
@@ -757,7 +757,10 @@ def __init__(
         use_channels_last_memory_format: bool = False,
         spatial_compression_factor: int = 8,
         temporal_compression_factor: int = 4,
+        causal: bool = True,
     ):
+        self._causal = causal
+        assert self._causal, "Wan2pt1VAEInterface is a causal tokenizer; causal must be True."
         vae_path_full = f"s3://{bucket_name}/{vae_path}"
         self.keep_decoder_cache = keep_decoder_cache
         self.keep_encoder_cache = keep_encoder_cache
diff --git a/cosmos_framework/model/vfm/tokenizers/wan2pt2_vae_4x16x16.py b/cosmos_framework/model/vfm/tokenizers/wan2pt2_vae_4x16x16.py
index 954f05a..12dc6d5 100644
--- a/cosmos_framework/model/vfm/tokenizers/wan2pt2_vae_4x16x16.py
+++ b/cosmos_framework/model/vfm/tokenizers/wan2pt2_vae_4x16x16.py
@@ -1,8 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
-
 import os
 import time
 from collections.abc import Callable, Generator, Mapping, Sequence
@@ -1323,6 +1321,7 @@ def __init__(
         # with older configurations.
         temporal_window: int | None = None,
         encode_bucket_multiple: int | None = None,
+        causal: bool = True,
     ):
         # Remove temporal_window and encode_bucket_multiple once they have been
         # removed from the uploaded HuggingFace checkpoint.
@@ -1367,6 +1366,8 @@ def __init__(
 
         self._spatial_compression_factor = spatial_compression_factor
         self._temporal_compression_factor = temporal_compression_factor
+        self._causal = causal
+        assert self._causal, "Wan2pt2VAEInterface is a causal tokenizer; causal must be True."
 
     @property
     def dtype(self) -> torch.dtype:
@@ -1417,6 +1418,8 @@ def compile_encode(
         warmup_resolutions: Sequence[str],
         output_dir: str,
         aspect_ratio: str | None = None,
+        # Extra torch.compile keyword arguments are accepted and ignored.
+        **kwargs,
     ) -> None:
         """AOT-compile the tokenizer's chunk-level encode for every resolution.
 
diff --git a/cosmos_framework/model/vfm/upsampler/__init__.py b/cosmos_framework/model/vfm/upsampler/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/upsampler/__init__.py
+++ b/cosmos_framework/model/vfm/upsampler/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/upsampler/prompts.py b/cosmos_framework/model/vfm/upsampler/prompts.py
index 42289da..527fe27 100644
--- a/cosmos_framework/model/vfm/upsampler/prompts.py
+++ b/cosmos_framework/model/vfm/upsampler/prompts.py
@@ -1,5 +1,6 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
 """Canonical V4.2 upsampler prompt templates — inference-team entry point.
 
 Standalone module. No file I/O, no non-stdlib imports. The canonical templates
@@ -19,10 +20,31 @@
         resolution_w=1280, resolution_h=720,
     )
 
-    # Text-to-image (image parameters required; fps/duration omitted)
+    # Text-to-image: the v4.2 default is the EXPRESSIVE body — v4.2 structure
+    # plus a "fill plausibly, leave empties only when truly non-applicable"
+    # rule.  Use without a version override unless you want a different t2i
+    # variant (see below).
+    user_text = build_user_text(
+        task="t2i", description="A photo of a corgi",
+        aspect_ratio="1,1", resolution_w=960, resolution_h=960,
+    )
+
+    # Text-to-image, original v4.2 baseline body (kept for A/B comparisons
+    # against the expressive default — e.g. UGB baseline MR-366).
     user_text = build_user_text(
         task="t2i", description="A photo of a corgi",
         aspect_ratio="1,1", resolution_w=960, resolution_h=960,
+        version="v4.2-original",
+    )
+
+    # Text-to-image, anti-hallucination variant: adds source-anchoring +
+    # rewrite-reflex suppression + person-attribute silence rules. Use
+    # when the source caption is sparse and the upsampler must NOT invent
+    # specifics absent from the source.
+    user_text = build_user_text(
+        task="t2i", description="A photo of a corgi",
+        aspect_ratio="1,1", resolution_w=960, resolution_h=960,
+        version="v4.2-constrained",
     )
 
     # Image-to-video (video parameters required)
@@ -325,6 +347,229 @@
 }
 </output_json_template>"""
 
+_TEMPLATE_T2I_V4_2_CONSTRAINED = r"""<instructions>
+You are a prompt upsampler for a text-to-image model. This instructions block governs the response. Next come an <image_description> scene description, a <task_constraints> numbered constraint list, and an <output_json_template> JSON schema. Produce exactly one fenced JSON object. The object fully populates every field in the template, uses valid JSON, and strictly satisfies each numbered task constraint without omission or deviation.
+</instructions>
+
+<image_description>
+{description}
+</image_description>
+
+<task_constraints>
+1. **SOURCE ANCHORING (highest priority).** Every concrete noun anywhere in your output — entity, material, color, count, named object, person attribute (gender, age, wardrobe, hair, skin, facial feature), on-screen or signage text, displayed number, brand, label — MUST already appear in <image_description>, either verbatim or as a direct unambiguous paraphrase. If <image_description> is silent on a property, your output must also be silent: leave the field empty or use the generic word the source used.
+
+   The rule is categorical, not phrase-specific. Apply it to any input by recognizing the category. Pattern templates:
+   - If source names an OBJECT generically (e.g. "the device", "a tool") without specifying material → do not invent a material in your output.
+   - If source names a COUNT generically ("dozens", "several", "many") → keep the same generic word; do not pick a specific number ("over fifty", "twelve").
+   - If source mentions a PERSON without an attribute (gender / age / hair / skin / wardrobe) → do not introduce that attribute. Use the same generic word the source used.
+   - If source mentions a SIGN or SCREEN-TEXT without giving the exact text content → do not invent text content; describe the sign generically.
+   - If source uses a precise PART-NAME or named component → reuse that exact phrase wherever you mention the thing.
+   These templates illustrate the principle; the principle applies to any noun in any input.
+
+2. **REWRITE-REFLEX SUPPRESSION.** Do NOT rewrite a source-anchored phrase into a different specific. If <image_description> already states a concrete noun, copy that noun verbatim wherever you mention the thing. Do NOT substitute a synonym, do NOT category-shift (e.g. "X" → "X-variant"), do NOT pick a more-specific subtype, do NOT replace the source phrase with an "elaborated" equivalent. Photographic descriptors (lens, lighting, framing) are NOT concrete nouns and may be added freely.
+
+3. **CHAIN-LOCK across fields.** Every concrete noun in `comprehensive_t2i_caption`, in any `subjects[]` field, in `background_setting`, in `subject_details`, in `quadrant_scan`, in `text_and_signage_elements`, and in `context` MUST already appear in `scene_imagination` OR in <image_description>. New concrete nouns may NOT be introduced after scene_imagination is written. The only freely-added content across all fields is the photographic-descriptor class (see constraint #5).
+
+4. **PERSON-ATTRIBUTE SILENCE (HARD RULE for human / humanoid subjects).** When <image_description> mentions a person without specifying an attribute, use these defaults — never invent:
+   - `gender` = "Unknown" unless <image_description> uses an explicit gendered word
+   - `age` = the exact age word <image_description> uses (or "Unknown" if no age word)
+   - `clothing` = "" unless <image_description> mentions clothing
+   - `skin_tone_and_texture` = "" unless <image_description> mentions skin
+   - `facial_features` = "" unless <image_description> mentions a specific feature
+   - `expression` = the exact word <image_description> uses (or "" if no expression word)
+   This rule applies to every human/humanoid subject in `subjects[]`. Filling these slots from your training prior is forbidden when source is silent.
+
+5. **PHOTOGRAPHIC DESCRIPTORS — freely add.** Only the following classes may be invented (because they describe HOW the image is captured, not WHAT is in it):
+   - camera framing / angle / lens / focal length / depth of field
+   - lighting quality / direction (only when source mentions or implies lighting)
+   - composition (rule-of-thirds, leading lines, symmetry, negative space)
+   - rendering style (photoreal / illustration / cartoon — only when source implies)
+   - atmospheric quality (haze, contrast, mood, color palette)
+   - generic shadow / reflection / specular behavior
+
+6. **Order of generation.** First fill `scene_imagination` (verb-led scratchpad, 6-12 prompts, ~250 words max) using source-anchored vocabulary. Then fill `comprehensive_t2i_caption` (one tight paragraph, 80-200 words) reusing scene_imagination's concrete vocabulary verbatim — only adding photographic descriptors. Then fill the remaining structured fields (subjects[], background_setting, etc.), all of which inherit vocabulary from constraint #3.
+
+7. **Output-parameter copy.** Copy these values byte-for-byte:
+   - aspect_ratio: "{aspect_ratio}"
+   - resolution: {"W": {resolution_w}, "H": {resolution_h}}
+
+8. **Internal consistency.** Lighting / setting / time-of-day / framing / mood must be mutually consistent.
+
+9. **Schema completeness.** Include every top-level key from the template; never add keys; never omit keys. Permitted empties: "", 0, [], {}. No null. If any subject is human/humanoid: number_of_hands=2, number_of_fingers=10. If all non-human: both = 0.
+
+10. **subject_details density.** `subject_details` non-empty with 2-5 source-anchored attribute keys. Never `{}`.
+
+11. **Output format.** ONLY one JSON object inside a ```json code fence. No prose outside.
+</task_constraints>
+
+<output_json_template>
+{
+  "scene_imagination": "Per #1, #2, #6 — verb-led scratchpad with source-anchored concrete nouns only; under ~250 words",
+  "comprehensive_t2i_caption": "Per #3, #6 — same concrete vocabulary as scene_imagination plus photographic descriptors only",
+  "subjects": [
+    {
+      "description": "Per #1, #3 — source-anchored only",
+      "appearance_details": "Per #1, #3 — source-anchored only",
+      "relationship": "how this subject relates to others",
+      "location": "where in frame",
+      "relative_size": "size within frame",
+      "orientation": "direction subject faces relative to camera",
+      "pose": "body position and posture",
+      "clothing": "Per #4 — '' if source-silent or non-human",
+      "expression": "Per #4 — '' if source-silent or non-human",
+      "gender": "Per #4 — 'Unknown' if source-silent",
+      "age": "Per #4 — source's word verbatim; 'Unknown' if no age word",
+      "skin_tone_and_texture": "Per #4 — '' if source-silent or non-human",
+      "facial_features": "Per #4 — '' if source-silent or non-human",
+      "number_of_subjects": "int; total in this subject's group; 0 if N/A",
+      "number_of_arms": "int",
+      "number_of_legs": "int",
+      "number_of_hands": "int",
+      "number_of_fingers": "int"
+    }
+  ],
+  "subject_details": { "<key_name>": "Per #1, #3 — source-anchored attribute" },
+  "background_setting": "Per #1, #3 — source-anchored",
+  "lighting": { "conditions": "...", "direction": "...", "shadows": "...", "illumination_effect": "..." },
+  "aesthetics": { "composition": "...", "color_scheme": "...", "mood_atmosphere": "...", "patterns": "" },
+  "cinematography": { "framing": "...", "camera_angle": "...", "depth_of_field": "...", "focus": "...", "lens_focal_length": "..." },
+  "style_medium": "rendering style per source",
+  "artistic_style": "broader style only if source implies",
+  "context": "Per #1, #3 — source-anchored",
+  "text_and_signage_elements": [
+    { "text": "exact source text; entry omitted if source-silent on specific text content", "category": "...", "appearance": "...", "spatial": "...", "context": "..." }
+  ],
+  "quadrant_scan": { "top_left": "Per #1, #3", "top_right": "Per #1, #3", "bottom_left": "Per #1, #3", "bottom_right": "Per #1, #3", "absolute_center": "Per #1, #3" },
+  "resolution": "Per #7",
+  "aspect_ratio": "Per #7"
+}
+</output_json_template>"""
+
+
+_TEMPLATE_T2I_V4_2_EXPRESSIVE = r"""<instructions>
+You are a prompt upsampler for a text-to-image model. Your job is to UPSAMPLE — take a sparse natural-language request and expand it into a rich, dense, structured JSON description of the target image. This instructions block governs the response. Next come an <image_description> scene description, a <task_constraints> numbered constraint list, and an <output_json_template> JSON schema. Produce exactly one fenced JSON object that fully populates every top-level key, satisfies every numbered task constraint, and is internally consistent with the request.
+
+The output is always DENSE. Even when the request is brief, infer plausible, scene-consistent details for every field. Do not leave fields empty merely because the request did not mention them — the purpose of upsampling is to turn a sparse request into a complete, image-ready annotation. Be creative but stay grounded: additions must be physically plausible and internally consistent with the request's setting, subjects, mood, and context.
+</instructions>
+
+<image_description>
+{description}
+</image_description>
+
+<task_constraints>
+1. **Scene imagination first.** Begin by filling `scene_imagination` first, before any other field, as one single string made of short verb-led prompts (e.g., focus:, define:, refine:, visualize:, analyze:). Write ~6-12 prompts, under ~250 words total. Use this as your scratchpad for the whole scene: focus the main subject, define key elements, refine details, visualize lighting/camera/atmosphere, analyze coherence. Every later field must be consistent with what you wrote here. (Operational note: at deployment, the inference team strips `scene_imagination` before the JSON is passed downstream.)
+
+2. **Comprehensive T2I caption — pinned 2nd, dense, downstream-actionable.** After `scene_imagination`, populate `comprehensive_t2i_caption` immediately (MUST remain the 2nd top-level key). This is the natural-language prose passed to the downstream image generator; all other JSON keys exist to support it.
+
+   - **Density**: 80-200 words as a SINGLE tight paragraph (1-3 sentences). Not a one-line synopsis; not a list.
+   - **Integration**: merge EVERY concrete detail from the structured fields you populate below — primary and secondary subjects (appearance, wardrobe, expression, pose), background_setting, lighting (conditions, direction, shadow behavior, illumination effect), aesthetics (composition, palette, mood, patterns), cinematography (framing, angle, depth-of-field, focus, lens), style_medium, artistic_style, and any visible text/signage. Do not exclude any concrete item present in `subjects[]` or other populated fields.
+   - **Phrasing**: be immediate and literal. Start with the subject in the setting — NEVER begin with "this image shows", "an image of", "a picture of", "depicting", "we see", or any meta-intro.
+     DO  : "A young woman in a crimson dress stands at the rim of a moonlit canyon..."
+     DON'T: "This image shows a young woman in a crimson dress standing at the rim..."
+   - **Specificity**: use visually-executable adjectives. Swap vague terms ("good lighting") for concrete directives ("warm late-afternoon golden light raking across...").
+
+3. **Output-parameter copy.** Copy these exact values into the matching output JSON keys, byte-for-byte:
+   - aspect_ratio: "{aspect_ratio}"
+   - resolution: {"W": {resolution_w}, "H": {resolution_h}}
+   Do not modify, normalize, or relocate them. (T2I has no duration or fps.)
+
+4. **Internal consistency.** Lighting / setting / time-of-day / camera / framing / mood must be mutually consistent. No contradictions (e.g. "harsh noon sun" with "dim candlelit interior" unless justified).
+
+5. **Faithfulness.** Do not contradict the provided image description. Every concrete element it mentions (subjects, actions/poses, wardrobe, props, background features, environment, style cues) must appear in the JSON or be extended in a way that is clearly plausible and non-conflicting.
+
+6. **EXPRESSIVE DENSITY (highest priority for empties).** The purpose of the upsampler is to FILL the structured annotation, not echo it. Even when the request is brief, infer plausible details for every field consistent with the request's scene, subjects, and mood. Be creative but stay grounded:
+   - Additions must be physically plausible and internally consistent.
+   - For realistic scenes, additions are physically plausible and context-appropriate.
+   - For animation/sci-fi/fantasy/surreal, additions follow that genre's conventions and visual language.
+   - Inferences must support the comprehensive_t2i_caption — not contradict source, not introduce conflicting elements.
+
+7. **Schema completeness and permitted empties.** Include every top-level key from the template exactly once. Never add keys beyond the template. Populate every field with specific, image-grounded detail. Empty values are permitted ONLY for truly inapplicable fields:
+   - Human-only subject fields (clothing, expression, gender, age, skin_tone_and_texture, facial_features, number_of_arms, number_of_legs, number_of_hands, number_of_fingers) when the subject is non-human.
+   - `text_and_signage_elements = []` when no visible text or signage is present.
+   - `aesthetics.patterns = ""` when there are no notable repeating patterns.
+   - `subject_details = {}` when no image-specific structured attributes apply.
+   The only permitted empty literals are exactly: `""`, `0`, `[]`, `{}`. Do not use `null`.
+   - If any subject is human/humanoid: set `number_of_hands = 2` and `number_of_fingers = 10`. If all subjects are non-human, set both to 0.
+
+8. **subject_details density (T2I-only).** Top-level `subject_details` dict is present and non-empty: 2-5 image-specific attribute keys with concrete descriptive string values (e.g. `"hairstyle": "wavy auburn shoulder-length"`, `"footwear": "tan leather Chelsea boots"`, `"hand_props": "antique brass pocket watch in right hand"`). Vary keys per image; never reuse `subjects[].*` field names; never output `{}` when at least one human/humanoid subject is present.
+
+9. **Output format.** Return ONLY the single JSON object, wrapped inside a ```json code fence. No prose, explanations, comments, or text outside the fence.
+</task_constraints>
+
+<output_json_template>
+{
+  "scene_imagination": "single string; verb-led scratchpad (focus:, define:, refine:, visualize:, analyze:); ~6-12 prompts; under ~250 words",
+  "comprehensive_t2i_caption": "Per task constraint #2 — dense single paragraph, 80-200 words, integrates every concrete item from structured fields, starts with subject-in-setting, no meta intros",
+  "subjects": [
+    {
+      "description": "full visual description of the subject (appearance, identifying features, distinctive traits)",
+      "appearance_details": "secondary visual details (accessories, textures, surface character)",
+      "relationship": "how this subject relates to others or to the scene",
+      "location": "where in frame (e.g., 'Center foreground', 'Top right')",
+      "relative_size": "size within frame (e.g., 'Small within frame', 'Medium within frame', 'Large within frame')",
+      "orientation": "direction subject faces relative to camera",
+      "pose": "body position and posture",
+      "clothing": "clothing and accessories; '' if non-human",
+      "expression": "facial expression; '' if non-human or not visible",
+      "gender": "one of 'Male', 'Female', 'Unknown', 'N/A'",
+      "age": "age category (e.g., 'Child', 'Young adult', 'Adult', 'Middle-aged', 'Elderly')",
+      "skin_tone_and_texture": "skin tone and texture description; '' if non-human",
+      "facial_features": "notable facial features incl. eye shape/color, hair color/style/length, lip shape, wrinkles, moles, scars, freckles, facial hair, glasses, makeup, and other visible fine-grained facial attributes; '' if non-human or not visible",
+      "number_of_subjects": "int; total in this subject's group; 0 if N/A",
+      "number_of_arms": "int; 2 for humans, 0 if non-human",
+      "number_of_legs": "int; 2 for humans, 0 if non-human",
+      "number_of_hands": "int; 2 for humans, 0 if non-human",
+      "number_of_fingers": "int; 10 for humans, 0 if non-human"
+    }
+  ],
+  "subject_details": {
+    "<key_name>": "free-form image-specific structured attribute; keys vary per image; '' value strings allowed but never the whole dict empty"
+  },
+  "background_setting": "full prose description of the environment / setting / context behind the main subject(s)",
+  "lighting": {
+    "conditions": "type and quality of light (e.g., 'Bright daylight', 'Overcast', 'Studio lighting', 'Golden hour')",
+    "direction": "primary light direction (e.g., 'top-lit', 'front-lit', 'side-lit from right')",
+    "shadows": "shadow character (e.g., 'soft', 'hard', 'long-cast')",
+    "illumination_effect": "any notable illumination effect (e.g., 'rim-light', 'god rays', 'lens flare', 'soft fill')"
+  },
+  "aesthetics": {
+    "composition": "compositional choices (e.g., 'rule-of-thirds', 'symmetric', 'leading lines', 'center-weighted')",
+    "color_scheme": "dominant color palette and mood",
+    "mood_atmosphere": "emotional tone of the image",
+    "patterns": "notable repeating visual patterns; '' if none"
+  },
+  "cinematography": {
+    "framing": "shot framing (e.g., 'wide', 'medium', 'close-up')",
+    "camera_angle": "camera angle (e.g., 'eye-level', 'high-angle', 'Dutch angle')",
+    "depth_of_field": "depth-of-field choice (e.g., 'shallow', 'deep', 'uniform focus')",
+    "focus": "what is in sharp focus (e.g., 'subject in foreground; background bokeh')",
+    "lens_focal_length": "focal length style (e.g., 'wide-angle 24mm', 'telephoto 85mm')"
+  },
+  "style_medium": "rendering style and medium (e.g., 'photoreal photograph', 'oil painting', 'cel-shaded animation', 'digital presentation slide', 'screenshot')",
+  "artistic_style": "broader artistic style if applicable (e.g., 'noir', 'pastoral painterly', 'cyberpunk')",
+  "context": "broader narrative or situational context (brief)",
+  "text_and_signage_elements": [
+    {
+      "text": "the exact text/sign content",
+      "category": "one of 'physical_in_scene', 'scene_sign', 'ui_text', 'body_text', 'caption', 'logo', 'label'",
+      "appearance": "how the text appears (font style, color, size, weight)",
+      "spatial": "where in the image the text appears",
+      "context": "narrative or situational context for the text"
+    }
+  ],
+  "quadrant_scan": {
+    "top_left": "what is in the top-left region",
+    "top_right": "what is in the top-right region",
+    "bottom_left": "what is in the bottom-left region",
+    "bottom_right": "what is in the bottom-right region",
+    "absolute_center": "what is in the dead-center of the frame"
+  },
+  "resolution": "Per task constraint #3",
+  "aspect_ratio": "Per task constraint #3"
+}
+</output_json_template>"""
+
+
 _TEMPLATE_I2V_V4_2 = r"""<instructions>
 Your function is to operate as a prompt upsampler for an image-to-video model. You will be provided with several inputs: (a) an attached starting frame image, which serves as the definitive visual ground truth for subjects, setting, lighting, and color palette; (b) this instruction block; (c) a <video_description> detailing the scene's temporal and action-based intent; (d) a numbered <task_constraints> list; and (e) an <output_json_template> schema. Your sole output is one fenced JSON object. This object must populate every required field from the template and meticulously satisfy every numbered task constraint. Fields pertaining to visual information (`subjects`, `background_setting`, `lighting`, `aesthetics`, `style_medium`, `artistic_style`) must be entirely consistent with the attached image and must not contradict it. Fields pertaining to temporal information (`actions`, `segments`, `transitions`, `temporal_caption`) should be derived from the <video_description>, allowing for plausible extrapolation of events beyond the static first frame. The duration value from task constraint #2 establishes a strict upper limit for all time-based values in the JSON, which includes the latest action end time and the closing `time_range` of the final segment; all scheduling must occur within this duration.
 
@@ -612,12 +857,22 @@
 }
 
 
+# Registry of every (version, task) → template body. `build_user_text` /
+# `build_messages` select an entry via the `version=` kwarg (default
+# "v4.2"). For t2i, the v4.2 default body is the EXPRESSIVE variant (also
+# addressable as ("v4.2-expressive", "t2i")); the original v4.2 body stays
+# addressable as ("v4.2-original", "t2i") for baseline / A-B comparisons.
+# The constrained anti-hallucination variant is ("v4.2-constrained", "t2i").
+# Transfer has a structured variant at ("v4.2-structured", "transfer").
 CANONICAL_TEMPLATES: dict[tuple[str, str], str] = {
     ("v4.2", "t2v"): _TEMPLATE_T2V_V4_2,
-    ("v4.2", "t2i"): _TEMPLATE_T2I_V4_2,
+    ("v4.2", "t2i"): _TEMPLATE_T2I_V4_2_EXPRESSIVE,
     ("v4.2", "i2v"): _TEMPLATE_I2V_V4_2,
     ("v4.2", "transfer"): _TEMPLATE_TRANSFER_V4_2,
     ("v4.2-structured", "transfer"): _TEMPLATE_TRANSFER_STRUCTURED_V4_2,
+    ("v4.2-expressive", "t2i"): _TEMPLATE_T2I_V4_2_EXPRESSIVE,
+    ("v4.2-original", "t2i"): _TEMPLATE_T2I_V4_2,
+    ("v4.2-constrained", "t2i"): _TEMPLATE_T2I_V4_2_CONSTRAINED,
 }
 
 
@@ -908,7 +1163,7 @@ def is_upsampled_prompt(prompt: str) -> bool:
     the native upsampler again.
 
     Used by inference callers (e.g.
-    ``cosmos_framework.inference.OmniInference._iter_predictions``) to decide
+    ``cosmos_framework.inference.OmniInference._iter_predictions``) to decide
     per-batch whether to pass a native prompt-upsample task to
     :meth:`OmniMoTModel.generate_samples_from_batch`.  Two motivating
     cases produce already-upsampled prompts:
diff --git a/cosmos_framework/model/vfm/utils/__init__.py b/cosmos_framework/model/vfm/utils/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/utils/__init__.py
+++ b/cosmos_framework/model/vfm/utils/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/utils/data_and_condition.py b/cosmos_framework/model/vfm/utils/data_and_condition.py
index 32650a3..44aaa85 100644
--- a/cosmos_framework/model/vfm/utils/data_and_condition.py
+++ b/cosmos_framework/model/vfm/utils/data_and_condition.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """
 Unified data and condition interface where we save the tokenized states and/or
 noised latent states for diffusion/flow-matching training.
@@ -25,6 +26,7 @@ class GenerationDataClean:
     raw_state_vision: list[torch.Tensor] | None = None  # raw state in pixel space
     x0_tokens_vision: list[torch.Tensor] | None = None  # tokenized latent state
     fps_vision: torch.Tensor | None = None
+    temporal_positions_vision: list[torch.Tensor] | None = None  # one [T] tensor per vision latent item
 
     # Image editing: number of vision items per sample.
     # When set, x0_tokens_vision is a flat list of individually-encoded image latents
diff --git a/cosmos_framework/model/vfm/utils/memory.py b/cosmos_framework/model/vfm/utils/memory.py
index 0487882..b2671fa 100644
--- a/cosmos_framework/model/vfm/utils/memory.py
+++ b/cosmos_framework/model/vfm/utils/memory.py
@@ -93,11 +93,10 @@ def is_gen_only(self) -> bool:
         Used for autoregressive frame-by-frame generation of video.
         """
 
-    @property
-    def uses_rolling_kv_cache(self) -> bool:
-        """Whether this memory uses the rolling KV-cache / compile-safe path.
+    def requires_natten_metadata(self) -> bool:
+        """Whether the packed-sequence builder should create NATTEN metadata.
 
-        When ``True``, the network skips NATTEN metadata computation because
-        temporal causality is handled inside three-way attention instead.
+        Memory paths whose attention implementation handles temporal
+        visibility itself return ``False``.
         """
-        return False
+        return True
diff --git a/cosmos_framework/model/vfm/utils/safetensors_loader.py b/cosmos_framework/model/vfm/utils/safetensors_loader.py
index 19ae6b3..15b8774 100644
--- a/cosmos_framework/model/vfm/utils/safetensors_loader.py
+++ b/cosmos_framework/model/vfm/utils/safetensors_loader.py
@@ -988,7 +988,7 @@ def load_language_model(
 
     if tie_embeddings:
         # The `*ForCausalLM` classes in
-        # `projects/cosmos3/vfm/models/mot/unified_mot.py` override
+        # `cosmos_framework/model/vfm/mot/unified_mot.py` override
         # `get_input_embeddings` (canonical HF idiom) to return the inner
         # `model.embed_tokens`, so this call returns a real `nn.Embedding`
         # rather than raising `NotImplementedError`.
@@ -1200,7 +1200,7 @@ def load_vfm_model(
     r"""Load a complete Cosmos3 VFM checkpoint (safetensors) into a Cosmos3VFMNetwork.
 
     Loads the *entire* state of a
-    :class:`~projects.cosmos3.vfm.models.mot.cosmos3_vfm_network.Cosmos3VFMNetwork`
+    :class:`~cosmos_framework.model.vfm.mot.cosmos3_vfm_network.Cosmos3VFMNetwork`
     in one shot:
 
     - the language tower (``language_model.*``), which carries the
diff --git a/cosmos_framework/model/vfm/utils/safetensors_loader_test.py b/cosmos_framework/model/vfm/utils/safetensors_loader_test.py
index 151db3c..657bccc 100644
--- a/cosmos_framework/model/vfm/utils/safetensors_loader_test.py
+++ b/cosmos_framework/model/vfm/utils/safetensors_loader_test.py
@@ -1,7 +1,6 @@
-# -----------------------------------------------------------------------------
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# -----------------------------------------------------------------------------
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
 """
 Unit tests for safetensors_loader helpers and load_vlm_model.
 
@@ -64,12 +63,12 @@ def _make_safetensors(tmp_path: Path, tensors: dict[str, torch.Tensor]) -> Path:
     return ckpt_dir
 
 
-
+# NOTE on ``parallel_dims`` in ``load_vlm_model`` tests:
 #
 # The single-rank CPU fallback is reached by passing ``parallel_dims=None``
 # (the documented escape hatch — see ``load_vlm_model`` docstring). All
 # end-to-end tests below use that path; multi-rank behavior is covered in
-# the GPU-marked tests under ``projects/cosmos3/vfm/models/mot/``.
+# the GPU-marked tests under ``cosmos_framework/model/vfm/mot/``.
 #
 # Do NOT introduce a "fake" ``ParallelDims`` MagicMock fixture for this
 # fallback: ``MagicMock.__getitem__`` returns another MagicMock rather than
diff --git a/cosmos_framework/model/vfm/vlm/__init__.py b/cosmos_framework/model/vfm/vlm/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/vlm/__init__.py
+++ b/cosmos_framework/model/vfm/vlm/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/__init__.py b/cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/__init__.py
+++ b/cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/nemotron_3_dense_vl_test.py b/cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/nemotron_3_dense_vl_test.py
deleted file mode 100644
index 4a5bcf0..0000000
--- a/cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/nemotron_3_dense_vl_test.py
+++ /dev/null
@@ -1,256 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: OpenMDW-1.1
-
-"""Component-level tests for the Nemotron 3 Dense VL text backbone modules.
-
-CPU-only, no GPU or credentials needed — covers the config, RMSNorm, MLP, and
-rotary-embedding building blocks.
-
-Usage:
-    pytest cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/nemotron_3_dense_vl_test.py -s -v
-"""
-
-import torch
-
-from cosmos_framework.model.vfm.vlm.nemotron_3_dense_vl.configuration_nemotron_3_dense_vl import (
-    Nemotron3DenseVLTextConfig,
-)
-from cosmos_framework.model.vfm.vlm.nemotron_3_dense_vl.nemotron_3_dense_vl import (
-    MultiModalRotaryEmbedding,
-    Nemotron3DenseVLMLP,
-    Nemotron3DenseVLPreTrainedModel,
-    Nemotron3DenseVLRMSNorm,
-    apply_rotary_pos_emb_partial,
-    rotate_half,
-)
-
-
-CONFIG_JSON = "cosmos_framework/model/vfm/vlm/nemotron_3_dense_vl/configs/Nemotron-2B-Dense-VL.json"
-
-
-def _make_small_config(**overrides) -> Nemotron3DenseVLTextConfig:
-    """Build a small config suitable for fast CPU tests."""
-    defaults = dict(
-        vocab_size=256,
-        hidden_size=64,
-        intermediate_size=128,
-        num_hidden_layers=2,
-        num_attention_heads=4,
-        head_dim=16,
-        num_key_value_heads=2,
-        max_position_embeddings=512,
-        mlp_hidden_act="relu2",
-    )
-    defaults.update(overrides)
-    return Nemotron3DenseVLTextConfig(**defaults)
-
-
-# ---------------------------------------------------------------------------
-# Component-level tests (CPU-only, no credentials)
-# ---------------------------------------------------------------------------
-
-
-class TestNemotron3DenseVLTextConfig:
-    def test_defaults(self) -> None:
-        cfg = Nemotron3DenseVLTextConfig()
-        assert cfg.vocab_size == 131072
-        assert cfg.hidden_size == 2048
-        assert cfg.intermediate_size == 9216
-        assert cfg.num_hidden_layers == 28
-        assert cfg.num_attention_heads == 16
-        assert cfg.head_dim == 128
-        assert cfg.num_key_value_heads == 8
-        assert cfg.mlp_hidden_act == "relu2"
-        assert cfg.mlp_bias is False
-        assert cfg.attention_bias is False
-        assert cfg.enable_rope is True
-        assert cfg.enable_mrope is True
-        assert cfg.mrope_section == [24, 20, 20]
-        assert cfg.rope_theta == 100_000_000.0
-        assert cfg.tie_word_embeddings is False
-
-    def test_rms_norm_eps_alias(self) -> None:
-        cfg = Nemotron3DenseVLTextConfig(layer_norm_epsilon=1e-6)
-        assert cfg.rms_norm_eps == 1e-6
-
-    def test_from_json_file(self) -> None:
-        cfg = Nemotron3DenseVLTextConfig.from_json_file(CONFIG_JSON)
-        assert cfg.vocab_size == 131072
-        assert cfg.hidden_size == 2048
-        assert cfg.num_hidden_layers == 28
-        assert cfg.mlp_hidden_act == "relu2"
-        assert cfg.mrope_section == [24, 20, 20]
-
-    def test_custom_overrides(self) -> None:
-        cfg = Nemotron3DenseVLTextConfig(
-            hidden_size=512,
-            num_hidden_layers=4,
-            num_attention_heads=8,
-            head_dim=64,
-        )
-        assert cfg.hidden_size == 512
-        assert cfg.num_hidden_layers == 4
-        assert cfg.num_attention_heads == 8
-        assert cfg.head_dim == 64
-
-
-class TestNemotron3DenseVLRMSNorm:
-    def test_output_shape(self) -> None:
-        norm = Nemotron3DenseVLRMSNorm(hidden_size=64, eps=1e-5)
-        x = torch.randn(2, 10, 64)
-        out = norm(x)
-        assert out.shape == x.shape
-
-    def test_dtype_preservation(self) -> None:
-        norm = Nemotron3DenseVLRMSNorm(hidden_size=32)
-        x_fp16 = torch.randn(1, 5, 32, dtype=torch.float16)
-        out = norm(x_fp16)
-        assert out.dtype == torch.float16
-
-    def test_unit_weight_is_identity_for_normalized(self) -> None:
-        """With weight=1 and input already unit-norm, output should closely match input."""
-        norm = Nemotron3DenseVLRMSNorm(hidden_size=16)
-        x = torch.randn(1, 1, 16)
-        rms = x.pow(2).mean(-1, keepdim=True).sqrt()
-        x_unit = x / rms
-        out = norm(x_unit)
-        assert torch.allclose(out.float(), x_unit.float(), atol=1e-4)
-
-    def test_extra_repr(self) -> None:
-        norm = Nemotron3DenseVLRMSNorm(hidden_size=64, eps=1e-6)
-        s = norm.extra_repr()
-        assert "(64,)" in s
-        assert "1e-06" in s
-
-
-class TestNemotron3DenseVLMLP:
-    def test_output_shape(self) -> None:
-        cfg = _make_small_config()
-        mlp = Nemotron3DenseVLMLP(cfg)
-        x = torch.randn(2, 10, cfg.hidden_size)
-        out = mlp(x)
-        assert out.shape == x.shape
-
-    def test_relu2_activation_is_nonnegative(self) -> None:
-        """relu(x)^2 is always >= 0."""
-        cfg = _make_small_config()
-        mlp = Nemotron3DenseVLMLP(cfg)
-        x = torch.randn(4, 8, cfg.hidden_size)
-        intermediate = mlp.act_fn(mlp.up_proj(x))
-        assert (intermediate >= 0).all()
-
-    def test_no_bias_by_default(self) -> None:
-        cfg = _make_small_config(mlp_bias=False)
-        mlp = Nemotron3DenseVLMLP(cfg)
-        assert mlp.up_proj.bias is None
-        assert mlp.down_proj.bias is None
-
-    def test_with_bias(self) -> None:
-        cfg = _make_small_config(mlp_bias=True)
-        mlp = Nemotron3DenseVLMLP(cfg)
-        assert mlp.up_proj.bias is not None
-        assert mlp.down_proj.bias is not None
-
-
-class TestRotateHalf:
-    def test_output_shape(self) -> None:
-        x = torch.randn(2, 4, 8)
-        out = rotate_half(x)
-        assert out.shape == x.shape
-
-    def test_self_inverse_with_negation(self) -> None:
-        """rotate_half(rotate_half(x)) == -x."""
-        x = torch.randn(3, 5, 16)
-        out = rotate_half(rotate_half(x))
-        assert torch.allclose(out, -x)
-
-
-class TestApplyRotaryPosEmbPartial:
-    def test_full_rotation(self) -> None:
-        """When rot_dim == head_dim, all channels are rotated."""
-        seq_len, n_heads, head_dim = 10, 4, 16
-        q = torch.randn(seq_len, n_heads, head_dim)
-        k = torch.randn(seq_len, n_heads, head_dim)
-        cos = torch.randn(seq_len, head_dim)
-        sin = torch.randn(seq_len, head_dim)
-
-        q_out, k_out = apply_rotary_pos_emb_partial(q, k, cos, sin, unsqueeze_dim=1)
-        assert q_out.shape == q.shape
-        assert k_out.shape == k.shape
-
-    def test_partial_rotation_passthrough(self) -> None:
-        """When rot_dim < head_dim, the remainder channels pass through unchanged."""
-        seq_len, n_heads, head_dim = 8, 2, 32
-        rot_dim = 16
-        q = torch.randn(seq_len, n_heads, head_dim)
-        k = torch.randn(seq_len, n_heads, head_dim)
-        cos = torch.randn(seq_len, rot_dim)
-        sin = torch.randn(seq_len, rot_dim)
-
-        q_out, k_out = apply_rotary_pos_emb_partial(q, k, cos, sin, unsqueeze_dim=1)
-
-        assert torch.allclose(q_out[..., rot_dim:], q[..., rot_dim:])
-        assert torch.allclose(k_out[..., rot_dim:], k[..., rot_dim:])
-
-    def test_zero_angle_is_identity(self) -> None:
-        """With cos=1, sin=0, the rotated output should equal the input."""
-        seq_len, n_heads, head_dim = 6, 2, 16
-        q = torch.randn(seq_len, n_heads, head_dim)
-        k = torch.randn(seq_len, n_heads, head_dim)
-        cos = torch.ones(seq_len, head_dim)
-        sin = torch.zeros(seq_len, head_dim)
-
-        q_out, k_out = apply_rotary_pos_emb_partial(q, k, cos, sin, unsqueeze_dim=1)
-        assert torch.allclose(q_out, q, atol=1e-6)
-        assert torch.allclose(k_out, k, atol=1e-6)
-
-
-class TestMultiModalRotaryEmbedding:
-    def test_output_shapes(self) -> None:
-        cfg = _make_small_config()
-        rope = MultiModalRotaryEmbedding(cfg)
-        seq_len = 12
-        x = torch.randn(1, seq_len, cfg.hidden_size)
-        position_ids = torch.arange(seq_len).unsqueeze(0)
-
-        cos, sin = rope(x, position_ids)
-        assert cos.shape[-1] == cfg.head_dim
-        assert sin.shape[-1] == cfg.head_dim
-
-    def test_mrope_3d_position_ids(self) -> None:
-        """With 3D position_ids (3, batch, seq_len) the mrope interleaving path runs."""
-        cfg = _make_small_config()
-        rope = MultiModalRotaryEmbedding(cfg)
-        seq_len = 8
-        x = torch.randn(1, seq_len, cfg.hidden_size)
-        position_ids = torch.arange(seq_len).unsqueeze(0).unsqueeze(0).expand(3, 1, -1)
-
-        cos, sin = rope(x, position_ids)
-        assert cos.shape[-1] == cfg.head_dim
-        assert sin.shape[-1] == cfg.head_dim
-
-    def test_init_weights(self) -> None:
-        cfg = _make_small_config()
-        rope = MultiModalRotaryEmbedding(cfg)
-        orig_inv_freq = rope.inv_freq.clone()
-        rope.init_weights(buffer_device=None)
-        assert torch.allclose(rope.inv_freq, orig_inv_freq)
-
-    def test_deterministic(self) -> None:
-        cfg = _make_small_config()
-        rope = MultiModalRotaryEmbedding(cfg)
-        seq_len = 10
-        x = torch.randn(1, seq_len, cfg.hidden_size)
-        pos = torch.arange(seq_len).unsqueeze(0)
-        cos1, sin1 = rope(x, pos)
-        cos2, sin2 = rope(x, pos)
-        assert torch.allclose(cos1, cos2)
-        assert torch.allclose(sin1, sin2)
-
-
-class TestNemotron3DenseVLPreTrainedModel:
-    def test_config_class(self) -> None:
-        assert Nemotron3DenseVLPreTrainedModel.config_class == Nemotron3DenseVLTextConfig
-
-    def test_base_model_prefix(self) -> None:
-        assert Nemotron3DenseVLPreTrainedModel.base_model_prefix == "model"
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl/__init__.py b/cosmos_framework/model/vfm/vlm/qwen3_vl/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl/__init__.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl/configs/__init__.py b/cosmos_framework/model/vfm/vlm/qwen3_vl/configs/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl/configs/__init__.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl/configs/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl/configuration_qwen3_vl.py b/cosmos_framework/model/vfm/vlm/qwen3_vl/configuration_qwen3_vl.py
index 9eaa380..cec3721 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl/configuration_qwen3_vl.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl/configuration_qwen3_vl.py
@@ -1,18 +1,4 @@
-# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
 # Source Repository: https://github.com/huggingface/transformers
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl/qwen3_vl.py b/cosmos_framework/model/vfm/vlm/qwen3_vl/qwen3_vl.py
index 46f6f98..6a4d772 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl/qwen3_vl.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl/qwen3_vl.py
@@ -1,18 +1,4 @@
-# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
 # Source Repository: https://github.com/huggingface/transformers
@@ -33,6 +19,21 @@
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_outputs import BaseModelOutputWithPast, ModelOutput
 from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+
+# "default" rope type was removed from ROPE_INIT_FUNCTIONS in transformers>=5.x
+if "default" not in ROPE_INIT_FUNCTIONS:
+
+    def _default_rope_init(config, device=None, **kwargs):  # extra **kwargs are accepted but unused
+        base = config.rope_theta
+        head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)  # 1.0 keeps dim == head_dim
+        dim = int(head_dim * partial_rotary_factor)  # rotary dimension (truncated to int)
+        inv_freq = 1.0 / (
+            base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)
+        )
+        return inv_freq, 1.0  # NOTE(review): 1.0 is presumably the attention scaling factor - confirm
+
+    ROPE_INIT_FUNCTIONS["default"] = _default_rope_init
 from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from transformers.processing_utils import Unpack
 from transformers.utils import is_torchdynamo_compiling
@@ -320,11 +321,7 @@ def __init__(self, config: Qwen3VLTextConfig):
         self.original_max_seq_len = config.max_position_embeddings
 
         self.config = config
-        rope_type = self.rope_type
-        if rope_type not in ROPE_INIT_FUNCTIONS and rope_type == "default":
-            # transformers>=5 renamed "default" RoPE entry to "proportional".
-            rope_type = "proportional"
-        self.rope_init_fn = ROPE_INIT_FUNCTIONS[rope_type]
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
 
         self.mrope_section = (
             config.rope_scaling.get("mrope_section", [24, 20, 20]) if config.rope_scaling is not None else [24, 20, 20]
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py b/cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py
index 5a8ecff..e39ad3f 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py
@@ -1,9 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-# Core masking functions extracted from transformers.masking_utils for BAGEL compatibility
-# Original Copyright 2025 HuggingFace Inc. team. Licensed under the Apache License, Version 2.0
-
 from typing import Any, Callable, ClassVar, Optional, cast
 
 import torch
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl/video_processing_qwen3_vl.py b/cosmos_framework/model/vfm/vlm/qwen3_vl/video_processing_qwen3_vl.py
index 717ef0a..a118f34 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl/video_processing_qwen3_vl.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl/video_processing_qwen3_vl.py
@@ -1,18 +1,4 @@
-# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
 # Source Repository: https://github.com/huggingface/transformers
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/__init__.py b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/__init__.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/configs/__init__.py b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/configs/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/configs/__init__.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/configs/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/moe.py b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/moe.py
index fc52a9a..5794457 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/moe.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/moe.py
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-
 from typing import Callable
 
 import torch
@@ -104,6 +103,7 @@ def forward(
         sentinel = torch.tensor([num_tokens], device=hidden_states.device)  # for padding slots
         token_indices_ext = torch.cat([token_indices_sorted, sentinel])
         combined_indices = token_indices_ext[permuted_indices.long()]
+        combined_indices = combined_indices.unsqueeze(-1).expand(-1, dim)
 
         # Pad scores with a zero sentinel so padding slots contribute nothing
         scores_ext = torch.cat([topk_scores_sorted, topk_scores_sorted.new_zeros(1)])
@@ -111,7 +111,7 @@ def forward(
 
         # Single gather (with a zero-padded sentinel row)
         input_padded = torch.cat([hidden_states, hidden_states.new_zeros(1, dim)])
-        routed_input = input_padded.index_select(dim=0, index=combined_indices)
+        routed_input = input_padded.gather(dim=0, index=combined_indices)
 
         # Run experts
         routed_output = _run_experts_grouped_mm(
@@ -124,7 +124,7 @@ def forward(
         )
 
         output_padded = torch.zeros_like(input_padded)
-        output_padded.index_add_(dim=0, index=combined_indices, source=routed_output)
+        output_padded.scatter_add_(dim=0, index=combined_indices, src=routed_output)
         return output_padded[:-1]
 
     def _reorder_tokens(
@@ -219,8 +219,9 @@ def forward(
                 assert weighted_output.dtype == hidden_states.dtype
                 next_states.index_add_(0, token_idx, weighted_output)
         else:
-            hidden_states = hidden_states.unsqueeze(0).expand(
-                self.num_experts, -1, -1
+            hidden_states = hidden_states.repeat(self.num_experts, 1)  # [num_experts*num_tokens,hidden_size]
+            hidden_states = hidden_states.view(
+                self.num_experts, -1, self.hidden_size
             )  # [num_experts,num_tokens,hidden_size]
             gate_up = torch.bmm(hidden_states, self.gate_up_proj)  # [num_experts,num_tokens,2*moe_intermediate_size]
             gate, up = gate_up.chunk(
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/moe_test.py b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/moe_test.py
index d31545b..fdb455c 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/moe_test.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/moe_test.py
@@ -12,26 +12,18 @@
 from cosmos_framework.model.vfm.vlm.qwen3_vl_moe.moe import create_text_experts
 
 
-def run_moe(
-    mod: nn.Module,
-    hidden_states: torch.Tensor,
-    topk_scores: torch.Tensor,
-    expert_indices: torch.Tensor,
-    num_tokens_per_expert: torch.Tensor,
-):
+def run_moe(mod: nn.Module, hidden_states: torch.Tensor, topk_scores: torch.Tensor, expert_indices: torch.Tensor):
     num_warmup_iterations = 10
     num_timing_iterations = 100
 
     for _ in range(num_warmup_iterations):
         with torch.no_grad():
-            output = mod(hidden_states, topk_scores, expert_indices, num_tokens_per_expert)
-    torch.cuda.synchronize()
+            output = mod(hidden_states, topk_scores, expert_indices)
 
     start_time = time.time()
     for _ in range(num_timing_iterations):
         with torch.no_grad():
-            output = mod(hidden_states, topk_scores, expert_indices, num_tokens_per_expert)
-    torch.cuda.synchronize()
+            output = mod(hidden_states, topk_scores, expert_indices)
     end_time = time.time()
 
     time_taken = (end_time - start_time) / num_timing_iterations
@@ -54,7 +46,7 @@ def main():
     control = create_text_experts(config, implementation_type="naive")
     exp = create_text_experts(config, implementation_type="grouped_mm")
 
-    control.init_weights(torch.device("cpu"))
+    control.init_weights()
     exp.load_state_dict(control.state_dict())
 
     control = control.to(device="cuda", dtype=torch.bfloat16)
@@ -66,36 +58,31 @@ def main():
         dtype=torch.bfloat16,
         device="cuda",
     )
-    topk_scores = torch.rand(
+    topk_scores = torch.randn(
         num_tokens,
         config.num_experts_per_tok,
         dtype=torch.bfloat16,
         device="cuda",
     )
     topk_scores = topk_scores / topk_scores.sum(dim=-1, keepdim=True)
-    expert_indices = torch.stack(
-        [torch.randperm(config.num_experts, device="cuda")[: config.num_experts_per_tok] for _ in range(num_tokens)]
-    ).to(torch.int64)
-    num_tokens_per_expert = torch.histc(
-        expert_indices.to(dtype=torch.int32).view(-1),
-        bins=config.num_experts,
-        min=0,
-        max=config.num_experts - 1,
+    expert_indices = torch.randint(
+        0,
+        config.num_experts,
+        (num_tokens, config.num_experts_per_tok),
+        dtype=torch.int64,
+        device="cuda",
     )
 
     print(
         f"hidden_states: {hidden_states.norm().detach().cpu().item()} {hidden_states.shape} {hidden_states.dtype} {hidden_states.device}"
     )
 
-    control_output, control_time_taken = run_moe(
-        control, hidden_states, topk_scores, expert_indices, num_tokens_per_expert
-    )
-    exp_output, exp_time_taken = run_moe(exp, hidden_states, topk_scores, expert_indices, num_tokens_per_expert)
+    control_output, control_time_taken = run_moe(control, hidden_states, topk_scores, expert_indices)
+    exp_output, exp_time_taken = run_moe(exp, hidden_states, topk_scores, expert_indices)
 
     diff = (control_output.detach().cpu() - exp_output.detach().cpu()).norm() / control_output.detach().cpu().norm()
     print(f"Diff: {diff}")
     print(f"Speedup: {control_time_taken / exp_time_taken}")
-    torch.testing.assert_close(control_output, exp_output, rtol=1e-2, atol=1e-2)
 
 
 if __name__ == "__main__":
diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/qwen3_vl_moe.py b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/qwen3_vl_moe.py
index d9c01f6..bd02d70 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/qwen3_vl_moe.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl_moe/qwen3_vl_moe.py
@@ -94,13 +94,22 @@ def extra_repr(self):
 
 
 class Qwen3VLMoeTextSparseMoeBlock(nn.Module):
-    def __init__(self, config):
+    def __init__(self, config, noisy_gating: bool = False):
         super().__init__()
         self.config = config
         self.hidden_size = config.hidden_size
         self.num_experts = config.num_experts
         self.top_k = config.num_experts_per_tok
         self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False)
+        # Noisy top-k gating (Shazeer 2017): a second projection produces a
+        # per-token, per-expert noise magnitude. During training the top-k
+        # selection is made on clean_logits + N(0,1) * softplus(gate_noise(x)),
+        # which keeps under-used experts in play and fights routing collapse.
+        # Gen-tower only; the und tower constructs this block with
+        # noisy_gating=False so it has no gate_noise parameter.
+        self.noisy_gating = noisy_gating
+        if noisy_gating:
+            self.gate_noise = nn.Linear(config.hidden_size, config.num_experts, bias=False)
         self.experts = create_text_experts(config, implementation_type="grouped_mm")
 
         # ── Heatmap tracking ──────────────────────────────────────────────────────
@@ -236,10 +245,25 @@ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, LBLMetadat
         num_tokens = hidden_states.shape[0]
 
         router_logits = self.gate(hidden_states)  # [num_tokens,num_experts]
+        # Clean router distribution. Always used for monitoring (entropy/stability
+        # buffers) and the load-balancing-loss probability term so those stay
+        # comparable regardless of whether noisy gating is enabled.
         routing_weights = torch.nn.functional.softmax(
             router_logits, dim=-1, dtype=torch.float32
         )  # [num_tokens,num_experts]
-        expert_weights, expert_indices = torch.topk(routing_weights, self.top_k, dim=-1)
+
+        # Noisy top-k gating: only the expert *selection* (and the combine
+        # weights over the selected experts) sees the noise. When noise is off
+        # or at eval time, selection_weights == routing_weights, so behavior is
+        # identical to plain top-k gating.
+        if self.noisy_gating and self.training:
+            noise_std = torch.nn.functional.softplus(self.gate_noise(hidden_states))  # [num_tokens,num_experts]
+            noisy_logits = router_logits + torch.randn_like(router_logits) * noise_std
+            selection_weights = torch.nn.functional.softmax(noisy_logits, dim=-1, dtype=torch.float32)
+        else:
+            selection_weights = routing_weights
+
+        expert_weights, expert_indices = torch.topk(selection_weights, self.top_k, dim=-1)
         # expert_weights: [num_tokens,top_k], expert_indices: [num_tokens,top_k]
 
         expert_weights = expert_weights / expert_weights.sum(dim=-1, keepdim=True)  # [num_tokens,top_k]
@@ -385,6 +409,10 @@ def init_weights(self, buffer_device: torch.device | None = None):
         nn.init.normal_(self.gate.weight, mean=0.0, std=std)
         nn.init.normal_(self.experts.gate_up_proj, mean=0.0, std=std)
         nn.init.normal_(self.experts.down_proj, mean=0.0, std=std)
+        if self.noisy_gating:
+            # Zero-init so the initial per-expert noise std is softplus(0)=ln(2)
+            # uniformly, giving symmetric exploration before gate_noise learns.
+            nn.init.zeros_(self.gate_noise.weight)
 
 
 def rotate_half(x):
diff --git a/cosmos_framework/model/vfm/vlm_model.py b/cosmos_framework/model/vfm/vlm_model.py
index 163904e..e3d311a 100644
--- a/cosmos_framework/model/vfm/vlm_model.py
+++ b/cosmos_framework/model/vfm/vlm_model.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """VLMModel: config-instantiable ImaginaireModel for VLM training.
 
 Config usage (in vfm/configs/base/vlm/defaults/model.py):
@@ -124,7 +125,7 @@ def _get_overlay_config(model_type: str) -> tuple[list[str], Callable[[str], boo
 
 def _get_vision_encoder_modules(model: nn.Module, model_type: str) -> list:
     if model_type in _QWEN_VL_TYPES:
-
+        # NOTE: intentional semantic change from `model_utils.get_model_vision_encoder`,
         # which returns only [patch_embed, blocks]. Qwen3-VL adds a learnable `pos_embed`
         # (nn.Embedding — see qwen3_vl.py Qwen3VLVisionModel); leaving it trainable while
         # freezing the rest of the vision encoder contradicts the intent of
@@ -420,6 +421,7 @@ def _init_vlm(self, config: VLMModelConfig, checkpoint) -> None:
             # model.language_model.*, so no temp-dir remap is needed.
             # Mirrors legacy vlm/train.py:221-233 semantics.
             llm_path = policy.backbone.pretrained_weights.backbone_path
+
             if llm_path:
                 overlay_skip_patterns, is_lm_key = _get_overlay_config(hf_model.hf_config.model_type)
                 llm_local_path = maybe_download_hf_model_from_s3(
diff --git a/cosmos_framework/tools/flops/qwen3_vl.py b/cosmos_framework/tools/flops/qwen3_vl.py
index e4663cc..c84a244 100644
--- a/cosmos_framework/tools/flops/qwen3_vl.py
+++ b/cosmos_framework/tools/flops/qwen3_vl.py
@@ -516,7 +516,7 @@ def compute_qwen3vl_flops(
         flops_breakdown["vision_encoder"] = 0
 
     # Embedding layer FLOPs
-
+    # NOTE: Only text tokens need embeddings; visual tokens are already embedded by the vision encoder.
     text_tokens = total_tokens - visual_tokens
     if include_embeddings:
         # Embedding lookup: typically counted as 0 or hidden_size operations per token
diff --git a/cosmos_framework/tools/visualize/video.py b/cosmos_framework/tools/visualize/video.py
index 889bb7e..3c7680f 100644
--- a/cosmos_framework/tools/visualize/video.py
+++ b/cosmos_framework/tools/visualize/video.py
@@ -13,6 +13,8 @@
 
 
 def save_video(grid, video_name, fps=30):
+    # ffmpegcv was removed because of a licensing issue;
+    # imageio is used instead.
     import imageio
 
     grid = (grid * 255).astype(np.uint8)
diff --git a/cosmos_framework/trainer/__init__.py b/cosmos_framework/trainer/__init__.py
index 878d499..5a63f22 100644
--- a/cosmos_framework/trainer/__init__.py
+++ b/cosmos_framework/trainer/__init__.py
@@ -224,7 +224,6 @@ def train(
             model_ddp = model
         else:
             raise ValueError(f"Unknown distributed parallelism mode: {self.config.trainer.distributed_parallelism}")
-
         log.info("Starting training...")
         sm_carveout = int(os.environ.get("GROUPED_MM_SM_CARVEOUT", "0"))
         if sm_carveout:
diff --git a/cosmos_framework/utils/__init__.py b/cosmos_framework/utils/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/utils/__init__.py
+++ b/cosmos_framework/utils/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/utils/callback.py b/cosmos_framework/utils/callback.py
index 0f2a219..b940692 100644
--- a/cosmos_framework/utils/callback.py
+++ b/cosmos_framework/utils/callback.py
@@ -391,7 +391,7 @@ def on_training_step_end(
         loss: torch.Tensor,
         iteration: int = 0,
     ) -> None:
-
+        # FIXME - this is not correct when using gradient accumulation since self.start_iteration_time is updated every batch
         # but this is only called when the optimizer is updated, so it's only the time for the last batch.
         self.elapsed_iteration_time += time.time() - self.start_iteration_time
 
diff --git a/cosmos_framework/utils/checkpoint_db.py b/cosmos_framework/utils/checkpoint_db.py
index e036ffb..580f92f 100644
--- a/cosmos_framework/utils/checkpoint_db.py
+++ b/cosmos_framework/utils/checkpoint_db.py
@@ -148,6 +148,8 @@ def _hf_download(cmd_args: list[str]) -> str:
     is_rank0 = os.environ.get("RANK", "0") == "0"
     cmd = [
         "uvx",
+        "--with",
+        "click",
         f"hf@{HF_VERSION}",
         "download",
         "--format=json",
@@ -291,7 +293,6 @@ class CheckpointConfig(pydantic.BaseModel):
     """Config for checkpoint on S3."""
     hf: CheckpointHf
     """Config for checkpoint on Hugging Face."""
-
     post_download: Callable[[str], None] | None = pydantic.Field(default=None, exclude=True)
     """Optional callback invoked with the local path after a successful download.
 
@@ -424,6 +425,11 @@ def download_checkpoint(checkpoint_uri: str, *, check_exists: bool = True) -> st
     - HuggingFace URI: hf://org/repo/path/to/file.pth
     - Local path: /path/to/checkpoint
     """
+    # Local-path short-circuit: if the URI exists on disk, return it as-is
+    # without consulting the registry. Prevents the registry from rewriting
+    # a known basename (e.g. Wan2.2_VAE.pth) into an s3:// URI we can't open.
+    if os.path.exists(checkpoint_uri):
+        return checkpoint_uri
     if INTERNAL:
         return checkpoint_uri
     if (checkpoint := CheckpointConfig.maybe_from_uri(checkpoint_uri)) is not None:
@@ -441,11 +447,6 @@ def download_checkpoint_v2(checkpoint_uri: str, *, check_exists: bool = True) ->
 
     Similar to 'download_checkpoint', but unknown S3 URIs are passed through.
     """
-    # Local-path short-circuit: if the URI exists on disk, return it as-is
-    # without consulting the registry. Prevents the registry from rewriting
-    # a known basename (e.g. Wan2.2_VAE.pth) into an s3:// URI we can't open.
-    if os.path.exists(checkpoint_uri):
-        return checkpoint_uri
     if INTERNAL:
         return checkpoint_uri
     if (checkpoint := CheckpointConfig.maybe_from_uri(sanitize_uri(checkpoint_uri))) is not None:
diff --git a/cosmos_framework/utils/checkpointer.py b/cosmos_framework/utils/checkpointer.py
index 5561e01..71cd202 100644
--- a/cosmos_framework/utils/checkpointer.py
+++ b/cosmos_framework/utils/checkpointer.py
@@ -39,9 +39,6 @@ def __init__(self, config_checkpoint: CheckpointConfig, config_job: JobConfig, c
         """
         # Set the callback functions.
         self.callbacks = callbacks
-
-
-
         self.checkpoint_dir_local = f"{config_job.path_local}/checkpoints"
         self.checkpoint_dir_object_store = f"{config_job.path}/checkpoints"
         self.save_to_object_store = config_checkpoint.save_to_object_store.enabled
diff --git a/cosmos_framework/utils/config.py b/cosmos_framework/utils/config.py
index 441d1b8..c59d689 100644
--- a/cosmos_framework/utils/config.py
+++ b/cosmos_framework/utils/config.py
@@ -346,7 +346,7 @@ class NVTXConfig:
 @make_freezable
 @attrs.define(slots=False)
 class StragglerDetectionConfig:
-    """Config for the Straggler detection tool."""
+    """Config for Straggler detection tool: https://invalid_url"""
 
     # Enable the Straggler Detection.
     enabled: bool = False
@@ -512,7 +512,6 @@ def validate(self) -> None:
         distributed.broadcast(job_name_tensor, 0)
         self.job.name = job_name_tensor.cpu().numpy().tobytes().decode("utf-8")
 
-
         assert self.job.project != ""
         assert self.job.group != ""
         assert self.job.name != ""
@@ -551,7 +550,7 @@ def load_config(config_path: str, opts: list[str], enable_one_logger: bool = Fal
 
 
 def _load_py_config(config_path: str, opts: list[str], validate: bool = True) -> Config:
-
+    # NOTE: imported inside the function to avoid a circular dependency.
     from cosmos_framework.utils.config_helper import get_config_module, override
 
     t1 = time.monotonic_ns()
diff --git a/cosmos_framework/utils/device.py b/cosmos_framework/utils/device.py
index e87674d..7bc2f88 100644
--- a/cosmos_framework/utils/device.py
+++ b/cosmos_framework/utils/device.py
@@ -85,20 +85,16 @@ def gpu0_has_80gb_or_less():
 
 class Device:
 
-
     _nvml_affinity_elements = math.ceil(os.cpu_count() / 64)  # type: ignore
 
     def __init__(self, device_idx: int):
-
         super().__init__()
         self.handle = pynvml.nvmlDeviceGetHandleByIndex(device_idx)
 
     def get_name(self) -> str:
-
         return pynvml.nvmlDeviceGetName(self.handle)
 
     def get_cpu_affinity(self) -> list[int]:
-
         affinity_string = ""
         for j in pynvml.nvmlDeviceGetCpuAffinity(self.handle, Device._nvml_affinity_elements):
             # assume nvml returns list of 64 bit ints
diff --git a/cosmos_framework/utils/distributed.py b/cosmos_framework/utils/distributed.py
index 9f95223..64e0de7 100644
--- a/cosmos_framework/utils/distributed.py
+++ b/cosmos_framework/utils/distributed.py
@@ -54,7 +54,7 @@ def init() -> int | None:
         timeout_timedelta = timedelta(seconds=int(timeout_seconds))
         dist.init_process_group(backend="nccl", init_method="env://", timeout=timeout_timedelta)
         log.critical(
-            f"Initialized distributed runtime with local rank {local_rank} with timeout {timeout_seconds}",
+            f"Initialized distributed training with local rank {local_rank} with timeout {timeout_seconds}",
             rank0_only=False,
         )
     # Increase the L2 fetch granularity for faster speed.
@@ -65,7 +65,7 @@ def init() -> int | None:
         p_value = ctypes.cast((ctypes.c_int * 1)(), ctypes.POINTER(ctypes.c_int))
         _libcudart.cudaDeviceSetLimit(ctypes.c_int(0x05), ctypes.c_int(128))
         _libcudart.cudaDeviceGetLimit(p_value, ctypes.c_int(0x05))
-    log.info(f"Distributed setup with {get_world_size()} GPUs.")
+    log.info(f"Training with {get_world_size()} GPUs.")
 
 
 def get_rank(group: Optional[dist.ProcessGroup] = None) -> int:
diff --git a/cosmos_framework/utils/easy_io/backends/base_backend.py b/cosmos_framework/utils/easy_io/backends/base_backend.py
index 1eb5009..94484bb 100644
--- a/cosmos_framework/utils/easy_io/backends/base_backend.py
+++ b/cosmos_framework/utils/easy_io/backends/base_backend.py
@@ -70,7 +70,7 @@ def isfile(self, filepath: Union[str, Path]) -> bool:
         pass
 
     @abstractmethod
-    def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> Union[str, Path]:
+    def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> str:
         pass
 
     @abstractmethod
diff --git a/cosmos_framework/utils/easy_io/backends/boto3_backend.py b/cosmos_framework/utils/easy_io/backends/boto3_backend.py
index ce4ba5f..86b1ab0 100644
--- a/cosmos_framework/utils/easy_io/backends/boto3_backend.py
+++ b/cosmos_framework/utils/easy_io/backends/boto3_backend.py
@@ -284,7 +284,7 @@ def join_path(
         self,
         filepath: Union[str, Path],
         *filepaths: Union[str, Path],
-    ) -> Union[str, Path]:
+    ) -> str:
         r"""Concatenate all file paths.
 
         Join one or more filepath components intelligently. The return value
@@ -294,7 +294,7 @@ def join_path(
             filepath (str or Path): Path to be concatenated.
 
         Returns:
-            str or Path: The result after concatenation.
+            str: The result after concatenation.
 
         Examples:
             >>> backend = Boto3Backend()
diff --git a/cosmos_framework/utils/easy_io/backends/http_backend.py b/cosmos_framework/utils/easy_io/backends/http_backend.py
index 593c9ac..32be908 100644
--- a/cosmos_framework/utils/easy_io/backends/http_backend.py
+++ b/cosmos_framework/utils/easy_io/backends/http_backend.py
@@ -112,7 +112,7 @@ def isdir(self, filepath: Union[str, Path]) -> bool:
     def isfile(self, filepath: Union[str, Path]) -> bool:
         raise NotImplementedError(f"isfile not supported in {self.name}")
 
-    def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> Union[str, Path]:
+    def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> str:
         raise NotImplementedError(f"join_path not supported in {self.name}")
 
     @contextmanager
diff --git a/cosmos_framework/utils/easy_io/backends/local_backend.py b/cosmos_framework/utils/easy_io/backends/local_backend.py
index 7599314..886d6ab 100644
--- a/cosmos_framework/utils/easy_io/backends/local_backend.py
+++ b/cosmos_framework/utils/easy_io/backends/local_backend.py
@@ -187,7 +187,7 @@ def isfile(self, filepath: Union[str, Path]) -> bool:
         """
         return osp.isfile(filepath)
 
-    def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> Union[str, Path]:
+    def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> str:
         r"""Concatenate all file paths.
 
         Join one or more filepath components intelligently. The return value
@@ -197,7 +197,7 @@ def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) ->
             filepath (str or Path): Path to be concatenated.
 
         Returns:
-            str or Path: The result of concatenation. Returns a Path if any input is a Path.
+            str: The result of concatenation.
 
         Examples:
             >>> backend = LocalBackend()
diff --git a/cosmos_framework/utils/easy_io/backends/msc_backend.py b/cosmos_framework/utils/easy_io/backends/msc_backend.py
index 72ac654..7c3ddf8 100644
--- a/cosmos_framework/utils/easy_io/backends/msc_backend.py
+++ b/cosmos_framework/utils/easy_io/backends/msc_backend.py
@@ -23,6 +23,7 @@
 
 # {scheme}://
 _URL_PREFIX_REGEX = r"[a-zA-Z0-9+.-]*:\/\/"
+_DEFAULT_MAX_ATTEMPTS = 50
 
 
 def _get_telemetry_config_from_msc_secret() -> Optional[dict[str, Any]]:
@@ -126,7 +127,7 @@ def _get_telemetry_config_from_msc_secret() -> Optional[dict[str, Any]]:
                     },
                     # Progressive enhancement for Slurm environments.
                     #
-                    # https://slurm.schedmd.com/prolog_epilog.html#environment_variables
+                    # https://invalid_url
                     {
                         "type": "environment_variables",
                         "options": {
@@ -267,6 +268,11 @@ def __init__(
                             "base_path": "",
                             "endpoint_url": legacy_boto3_config["endpoint_url"],
                             "region_name": legacy_boto3_config["region_name"],
+                            "retries": {
+                                "mode": "standard",
+                                "total_max_attempts": _DEFAULT_MAX_ATTEMPTS,
+                            }
+                            | legacy_boto3_config.get("retries", {}),
                         },
                     }
 
@@ -554,7 +560,7 @@ def join_path(
         self,
         filepath: Union[str, Path],
         *filepaths: Union[str, Path],
-    ) -> Union[str, Path]:
+    ) -> str:
         r"""Concatenate all file paths.
 
         Join one or more filepath components intelligently. The return value
@@ -564,7 +570,7 @@ def join_path(
             filepath (str or Path): Path to be concatenated.
 
         Returns:
-            str or Path: The result after concatenation.
+            str: The result after concatenation.
 
         Examples:
             >>> backend = MSCBackend()
diff --git a/cosmos_framework/utils/easy_io/easy_io.py b/cosmos_framework/utils/easy_io/easy_io.py
index 1963764..521ba64 100644
--- a/cosmos_framework/utils/easy_io/easy_io.py
+++ b/cosmos_framework/utils/easy_io/easy_io.py
@@ -137,7 +137,6 @@ def get_file_backend(
         prefix = ""
 
     if enable_singleton:
-
         unique_key = f"{prefix}:{json.dumps(backend_args)}"
         if unique_key in backend_instances:
             return backend_instances[unique_key]
@@ -424,7 +423,7 @@ def join_path(
         backend_key (str, optional): The key to get the backend from register.
 
     Returns:
-        str or Path: The result of concatenation. Returns a Path if any input is a Path.
+        str: The result of concatenation.
 
     Examples:
         >>> filepath1 = '/path/of/dir1'
diff --git a/cosmos_framework/utils/easy_io/file_client.py b/cosmos_framework/utils/easy_io/file_client.py
index 4a33328..650489f 100644
--- a/cosmos_framework/utils/easy_io/file_client.py
+++ b/cosmos_framework/utils/easy_io/file_client.py
@@ -375,7 +375,7 @@ def isfile(self, filepath: Union[str, Path]) -> bool:
         """
         return self.client.isfile(filepath)
 
-    def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> Union[str, Path]:
+    def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) -> str:
         r"""Concatenate all file paths.
 
         Join one or more filepath components intelligently. The return value
@@ -385,7 +385,7 @@ def join_path(self, filepath: Union[str, Path], *filepaths: Union[str, Path]) ->
             filepath (str or Path): Path to be concatenated.
 
         Returns:
-            str or Path: The result of concatenation. Returns a Path if any input is a Path.
+            str: The result of concatenation.
         """
         return self.client.join_path(filepath, *filepaths)
 
diff --git a/cosmos_framework/utils/easy_io/handlers/imageio_video_handler.py b/cosmos_framework/utils/easy_io/handlers/imageio_video_handler.py
index 0f98593..fdc944f 100644
--- a/cosmos_framework/utils/easy_io/handlers/imageio_video_handler.py
+++ b/cosmos_framework/utils/easy_io/handlers/imageio_video_handler.py
@@ -112,8 +112,12 @@ def dump_to_fileobj(
         file: IO[bytes],
         format: str = "mp4",  # pylint: disable=redefined-builtin
         fps: int = 17,
-        quality: int = 5,
+        quality: int | None = 5,
         ffmpeg_params=None,
+        crf: int | None = None,
+        codec: str = "libx264",
+        preset: str = "medium",
+        pix_fmt: str = "yuv420p",
         **kwargs,
     ):
         """
@@ -124,50 +128,49 @@ def dump_to_fileobj(
             file (IO[bytes]): A file-like object to which the video data will be written.
             format (str): Format of the video file (default 'mp4').
             fps (int): Frames per second of the output video (default 17).
-            quality (int): Quality of the video (0-10, default 5).
+            quality (int | None): Quality of the video (0-10, default 5). With the libx264 codec,
+                imageio-ffmpeg converts this to a ``-crf`` value. Not forwarded when ``crf`` is set.
             ffmpeg_params (list): Additional parameters to pass to ffmpeg.
-
+            crf (int | None): Constant Rate Factor for H.264 (0-51, lower = higher quality / larger
+                file). When set, ``-crf`` and ``-preset`` are passed to ffmpeg explicitly and
+                ``quality`` is not forwarded, giving direct control over the rate factor. Defaults
+                to ``None``, in which case the ``quality``-based path is used instead.
+            codec (str): Video codec, used only on the CRF path (default 'libx264').
+            preset (str): x264 speed/efficiency preset, used only on the CRF path (default 'medium').
+            pix_fmt (str): Pixel format, used only on the CRF path (default 'yuv420p' for broad
+                playback compatibility).
         """
         if isinstance(obj, torch.Tensor):
             assert obj.dtype == torch.uint8, "Tensor must be of type uint8"
             obj = obj.cpu().numpy()
         h, w = obj.shape[1:-1]
 
-        # Encode as universally playable H.264: yuv420p chroma + a finite CRF.
-        #
-        # We deliberately bypass imageio-ffmpeg's `quality` knob here. At its top
-        # setting (quality=10, used for action rollouts) it requests *lossless*
-        # x264, and lossless x264 is only available under the "High 4:4:4
-        # Predictive" profile. The result is a file whose pixels are yuv420p but
-        # whose profile header advertises 4:4:4 — a mismatch that most players,
-        # browsers, and hardware decoders render as an all-black video.
-        #
-        # Forcing `-crf 18` (visually lossless) keeps the encode out of lossless
-        # mode, so x264 emits a standard "High" profile that plays everywhere.
-        compat_output_params = ["-pix_fmt", "yuv420p", "-crf", "18"]
-
         # Default ffmpeg params that ensure width and height are set
         default_ffmpeg_params = ["-s", f"{w}x{h}"]
 
-        # Use provided ffmpeg_params if any, otherwise use defaults
-        final_ffmpeg_params = ffmpeg_params if ffmpeg_params is not None else default_ffmpeg_params
-        final_ffmpeg_params = list(final_ffmpeg_params) + compat_output_params
-
-        mimsave_kwargs = {
-            "fps": fps,
-            "macro_block_size": 1,
-            "codec": "libx264",
-            # Output pixel format is set via `-pix_fmt` in `compat_output_params`
-            # below; we don't pass `pixelformat` here to avoid a duplicate
-            # `-pix_fmt` on the ffmpeg command line.
-            "ffmpeg_params": final_ffmpeg_params,
-            "output_params": ["-f", "mp4"],
-        }
+        if crf is not None:
+            # Explicit CRF rate control. For libx264, ``quality`` is itself turned into a ``-crf`` value,
+            # so it is not forwarded here. NOTE(review): confirm imageio's default quality adds no second ``-crf``.
+            mimsave_kwargs = {
+                "fps": fps,
+                "codec": codec,
+                "pixelformat": pix_fmt,
+                "macro_block_size": 1,
+                "ffmpeg_params": (ffmpeg_params or []) + ["-crf", str(crf), "-preset", preset] + default_ffmpeg_params,
+                "output_params": ["-f", "mp4"],
+            }
+        else:
+            # Use provided ffmpeg_params if any, otherwise use defaults
+            final_ffmpeg_params = ffmpeg_params if ffmpeg_params is not None else default_ffmpeg_params
+            mimsave_kwargs = {
+                "fps": fps,
+                "quality": quality,
+                "macro_block_size": 1,
+                "ffmpeg_params": final_ffmpeg_params,
+                "output_params": ["-f", "mp4"],
+            }
         # Update with any other kwargs
         mimsave_kwargs.update(kwargs)
-        # Drop the caller's `quality` so it can't reintroduce lossless x264 and the
-        # broken 4:4:4 profile; our explicit `-crf` governs quality instead.
-        mimsave_kwargs.pop("quality", None)
         log.debug(f"mimsave_kwargs: {mimsave_kwargs}")
 
         imageio.mimsave(file, obj, format, **mimsave_kwargs)
diff --git a/cosmos_framework/utils/easy_io/handlers/registry_utils.py b/cosmos_framework/utils/easy_io/handlers/registry_utils.py
index d49c38a..3d9f783 100644
--- a/cosmos_framework/utils/easy_io/handlers/registry_utils.py
+++ b/cosmos_framework/utils/easy_io/handlers/registry_utils.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 from cosmos_framework.utils.flags import TRAINING
 from cosmos_framework.utils.easy_io.handlers.base import BaseFileHandler
 from cosmos_framework.utils.easy_io.handlers.byte_handler import ByteHandler
diff --git a/cosmos_framework/utils/ema.py b/cosmos_framework/utils/ema.py
index 5ae9c4d..85bb817 100644
--- a/cosmos_framework/utils/ema.py
+++ b/cosmos_framework/utils/ema.py
@@ -104,7 +104,7 @@ class EMAModelTracker(torch.nn.Module):
     The EMA weights are registered as buffers, which are extractable as state dicts. The names follow those of the
     regular weights, except all "." are replaced with "-" (limitation of register_buffer()). This is similar to SDXL's
     implementation of EMA. There are no optimizable parameters.
-    TODO: multi-EMA weights.
+    TODO(snah): multi-EMA weights.
 
     Attributes:
         collected_params (list): temporarily stores the regular weights while in EMA mode.
diff --git a/cosmos_framework/utils/env_parsers/cred_env_parser.py b/cosmos_framework/utils/env_parsers/cred_env_parser.py
index 04810d9..68e7ede 100644
--- a/cosmos_framework/utils/env_parsers/cred_env_parser.py
+++ b/cosmos_framework/utils/env_parsers/cred_env_parser.py
@@ -9,7 +9,7 @@ class CredentialEnvParser(EnvParser):
     APP_ENV = String(default="")
     PROD_FT_AWS_CREDS_ACCESS_KEY_ID = String(default="")
     PROD_FT_AWS_CREDS_SECRET_ACCESS_KEY = String(default="")
-    PROD_FT_AWS_CREDS_ENDPOINT_URL = String(default="https://s3.us-west-2.amazonaws.com")
+    PROD_FT_AWS_CREDS_ENDPOINT_URL = String(default="https://invalid_url")
     PROD_FT_AWS_CREDS_REGION_NAME = String(default="us-west-2")
 
     PROD_S3_CHECKPOINT_ACCESS_KEY_ID = String(default="")
@@ -33,7 +33,7 @@ class CredentialEnvParser(EnvParser):
     PROD_TEAM_DIR_REGION_NAME = String(default="")
 
     PICASSO_AUTH_MODEL_REGISTRY_API_KEY = String(default="")
-    PICASSO_API_ENDPOINT_URL = String(default="https://invalid")
+    PICASSO_API_ENDPOINT_URL = String(default="https://invalid_url")
 
 
 CRED_ENVS = CredentialEnvParser()
diff --git a/cosmos_framework/utils/lazy_config/__init__.py b/cosmos_framework/utils/lazy_config/__init__.py
index 34c3c42..e677b35 100644
--- a/cosmos_framework/utils/lazy_config/__init__.py
+++ b/cosmos_framework/utils/lazy_config/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
 import os
@@ -15,7 +15,7 @@
 PLACEHOLDER = None
 
 
-class LazyDict(DictConfig):
+class LazyDict(DictConfig):  # NOTE: to differentiate between LazyDict & DictConfig
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
diff --git a/cosmos_framework/utils/lazy_config/file_io.py b/cosmos_framework/utils/lazy_config/file_io.py
index 5acf76d..1cb9650 100644
--- a/cosmos_framework/utils/lazy_config/file_io.py
+++ b/cosmos_framework/utils/lazy_config/file_io.py
@@ -1,7 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-
 from iopath.common.file_io import HTTPURLHandler, OneDrivePathHandler, PathHandler
 from iopath.common.file_io import PathManager as PathManagerBase
 
diff --git a/cosmos_framework/utils/lazy_config/lazy.py b/cosmos_framework/utils/lazy_config/lazy.py
index a2af0e7..f1ae6a2 100644
--- a/cosmos_framework/utils/lazy_config/lazy.py
+++ b/cosmos_framework/utils/lazy_config/lazy.py
@@ -110,7 +110,7 @@ def _patch_import():
     old_import = builtins.__import__
 
     def find_relative_file(original_file, relative_import_path, level):
-
+        # NOTE: "from . import x" is not handled, because it would then be unclear
         # if such import should produce `x` as a python module or DictConfig.
         # This can be discussed further if needed.
         relative_import_err = """
@@ -322,10 +322,7 @@ def is_serializable(item):
                 return False
 
         # For classes / functions / bound methods we want the importable dotted
-        # path, not `repr(obj)` — the latter yields strings like
-        # `<class 'cosmos.X'>` or `<function f at 0x…>` which break any
-        # downstream consumer that calls hydra.utils.instantiate on the loaded
-        # YAML (e.g. cosmos_framework.scripts.export_model).
+        # path, not `repr(obj)`, which yields `<class '…'>` / `<function f at 0x…>` strings that cannot be re-instantiated.
         from cosmos_framework.utils.lazy_config.registry import convert_target_to_string
 
         def _to_safe_string(value):
diff --git a/cosmos_framework/utils/misc.py b/cosmos_framework/utils/misc.py
index d3d9308..3022e0a 100644
--- a/cosmos_framework/utils/misc.py
+++ b/cosmos_framework/utils/misc.py
@@ -66,7 +66,6 @@ def to(
     assert device is not None or dtype is not None or memory_format is not None, (
         "at least one of device, dtype, memory_format should be specified"
     )
-
     if isinstance(data, torch.Tensor):
         if (
             memory_format == torch.channels_last
@@ -542,7 +541,8 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 
 class StragglerDetectorV2:
-    """StragglerDetectorV2 is a class that allows you to easily integrate the "straggler" tool.
+    """StragglerDetectorV2 is a class that allows you to easily integrate "straggler" tool:
+    https://invalid_url.
 
     This tool detects stragglers using low-level CUPTI tool, which can gather kernel execution time with very low overhead.
     The execution times are compared across different ranks, as well as to the execution time of the exact same kernels in the past.
@@ -579,9 +579,9 @@ def __init__(
     def initialize(self):
         if self.enabled:
             if not straggler:
-
                 raise RuntimeError(
-                    "Please install the `straggler` package before using StragglerDetectionV2."
+                    "Please install straggler package before using StragglerDetectionV2. "
+                    "Package can be installed from here: https://invalid_url"
                 )
 
             straggler.Detector.initialize(
diff --git a/cosmos_framework/utils/object_store.py b/cosmos_framework/utils/object_store.py
index a79ae8e..58cd2b1 100644
--- a/cosmos_framework/utils/object_store.py
+++ b/cosmos_framework/utils/object_store.py
@@ -11,24 +11,14 @@
 from typing import TYPE_CHECKING, Any, Callable, Optional
 from urllib.parse import urlparse
 
-import boto3
 import numpy as np
 import torch
 import yaml
-from botocore.config import Config
 from PIL import Image
 
-import cosmos_framework.utils.easy_io.backends.auto_auth as auto
 from cosmos_framework.utils import distributed, log
 from cosmos_framework.utils.easy_io import easy_io
 
-GLOBAL_S3_CONFIG = Config(
-    retries={"max_attempts": 20, "mode": "adaptive"},
-    connect_timeout=10,
-    read_timeout=60,
-    request_checksum_calculation="when_required",
-    response_checksum_validation="when_required",
-)
 Image.MAX_IMAGE_PIXELS = None
 
 if TYPE_CHECKING:
@@ -41,26 +31,18 @@ class ObjectStore:
     **Deprecated**. Use `easy_io` directly instead.
 
     Attributes:
-        client (botocore.client.S3): Object store client object.
         easy_io_backend: easy_io backend.
         bucket (str): Object store bucket name.
     """
 
     def __init__(self, config_object_storage: ObjectStoreConfig):
-
-        #       extracts the easy_io backend instead of the boto3 S3 client.
-        with auto.open_auth(config_object_storage.credentials, "r") as file:
-            object_storage_config = auto.json_load_auth(file)
-            self.client = Boto3Wrapper(
-                "s3",
-                **object_storage_config,
-            )
         self.easy_io_backend = easy_io.get_file_backend(
             backend_args={
                 "backend": "s3",
                 "s3_credential_path": config_object_storage.credentials,
                 "path_mapping": None,
-            }
+            },
+            enable_singleton=True,
         )
         self.bucket = config_object_storage.bucket
 
@@ -158,7 +140,6 @@ def save_object(
         """
         assert type is not None or save_func is not None
         with io.BytesIO() as buffer:
-
             # Write to buffer for common data types.
             if type == "torch":
                 torch.save(object, buffer)
@@ -199,31 +180,6 @@ def object_exists(self, key: str) -> bool:
         return self.easy_io_backend.exists(filepath=self._translate_key(key=key))
 
 
-class Boto3Wrapper:
-    """
-    This class serves as a wrapper around boto3.client in order to make boto3.client serializable. It's required to use
-    spawn method of creating DataLoader workers, which is in turn required to avoid segfaults when using Triton, e.g.
-    for torch.compile or custom kernels.
-    """
-
-    def __init__(self, *args, **kwargs):
-        self._args = args
-        self._kwargs = kwargs
-        self.client = None
-
-    def __setstate__(self, state):
-        self.__dict__ = state
-
-    def __getattr__(self, item):
-        is_worker = torch.utils.data.get_worker_info() is not None
-        client = (
-            boto3.client(*self._args, **self._kwargs, config=GLOBAL_S3_CONFIG) if self.client is None else self.client
-        )
-        if is_worker:
-            self.client = client
-        return getattr(client, item)
-
-
 def sync_s3_dir_to_local(
     s3_dir: str,
     s3_credential_path: str,
@@ -241,7 +197,7 @@ def sync_s3_dir_to_local(
             ALL distributed workers using `distributed.barrier()`. Defaults to True.
         cache_dir (str, optional): The cache folder to sync the S3 directory to.
             If None, the environment variable `IMAGINAIRE_CACHE_DIR` (defaulting
-            to "~/.cache/imaginaire") will be used.
+            to "~/.cache/cosmos_framework") will be used.
         local_rank_sync (bool, optional): Whether to synchronize download across
             workers within the same node using a node-level barrier. This is useful
             when the cache directory is not shared across nodes. Defaults to False.
@@ -275,7 +231,7 @@ def sync_s3_dir_to_local(
 
     # If the local directory is not specified, use the default cache directory
     cache_dir = (
-        os.environ.get("IMAGINAIRE_CACHE_DIR", os.path.expanduser("~/.cache/imaginaire"))
+        os.environ.get("IMAGINAIRE_CACHE_DIR", os.path.expanduser("~/.cache/cosmos_framework"))
         if cache_dir is None
         else cache_dir
     )
@@ -363,7 +319,7 @@ def download_from_s3_with_cache(
         }
     )
     cache_dir = (
-        os.environ.get("IMAGINAIRE_CACHE_DIR", os.path.expanduser("~/.cache/imaginaire"))
+        os.environ.get("IMAGINAIRE_CACHE_DIR", os.path.expanduser("~/.cache/cosmos_framework"))
         if cache_dir is None
         else cache_dir
     )
diff --git a/cosmos_framework/utils/one_logger/one_logger_override_utils.py b/cosmos_framework/utils/one_logger/one_logger_override_utils.py
index dd979d7..54e01d3 100644
--- a/cosmos_framework/utils/one_logger/one_logger_override_utils.py
+++ b/cosmos_framework/utils/one_logger/one_logger_override_utils.py
@@ -12,7 +12,7 @@
 
 
 def override_one_logger_callback(config) -> None:
-    """Add OneLoggerCallback to imaginaire config"""
+    """Add OneLoggerCallback to cosmos_framework config"""
 
     # Enable OneLogger by environment variable.
     enable_onelogger = os.environ.get("ENABLE_ONELOGGER", "FALSE").lower() == "true"
diff --git a/cosmos_framework/utils/one_logger/one_logger_utils.py b/cosmos_framework/utils/one_logger/one_logger_utils.py
index 0a0457b..bf3cf7e 100644
--- a/cosmos_framework/utils/one_logger/one_logger_utils.py
+++ b/cosmos_framework/utils/one_logger/one_logger_utils.py
@@ -317,8 +317,10 @@ def _set_one_logger(self):
                 self.one_logger = OneLogger(config=config)
             except BaseException:
                 logger.info(
-                    "WARNING: the `one_logger` package is required to enable e2e metrics tracking, "
-                    "but it is not installed."
+                    "WARNING: one_logger package is required to enable e2e metrics "
+                    "tracking. please go to "
+                    "https://invalid_url"
+                    " for details to install it"
                 )
         else:
             self.one_logger = None
@@ -1142,7 +1144,7 @@ def on_save_checkpoint_end(self, set_barrier: bool = False, **metrics_input_kwar
 
         self._store_set(f"productive_time:{global_step}", productive_time)
 
-
+        # NOTE: If on_save_checkpoint_success has already been called, track productive metrics here.
         if self._store_has_key(f"on_save_checkpoint_success:{global_step}"):
             successful_save_checkpoint_sync_finish_time = productive_time.pop(
                 "successful_save_checkpoint_sync_finish_time"
@@ -1210,7 +1212,7 @@ def on_save_checkpoint_success(self, set_barrier: bool = False, **metrics_input_
         # Fetch productivity metrics cached on_save_checkpoint_start
         productive_metrics = self.one_logger.store_pop(f"productive_metrics:{global_step}")
 
-
+        # NOTE: Only track *_sync_* metrics after on_save_checkpoint_end is called.
         # Check if on_save_checkpoint_end is called.
         if self._store_has_key(f"on_save_checkpoint_end:{global_step}"):
             productive_time = self._store_get(f"productive_time:{global_step}")
diff --git a/cosmos_framework/utils/serialization.py b/cosmos_framework/utils/serialization.py
index a962983..d009dba 100644
--- a/cosmos_framework/utils/serialization.py
+++ b/cosmos_framework/utils/serialization.py
@@ -6,12 +6,11 @@
 import importlib
 import json
 import os
-import tomllib
 from collections.abc import Callable as Callable2
 from collections.abc import Mapping, Sequence
 from dataclasses import fields, is_dataclass
 from types import UnionType
-from typing import Any, List, Literal, Optional, TypeVar, Union, get_args, get_origin
+from typing import Any, List, Optional, TypeVar, Union, get_args, get_origin
 
 import attrs
 import torch
@@ -39,19 +38,6 @@ def from_yaml(path: str | None = None, clazz: type | None = None, file_like_or_s
         raise ValueError("expected file_like_or_str or path to not be None")
 
 
-def from_toml(path: str | None = None, clazz: type | None = None, file_like_or_str=None) -> T:
-    if path:
-        assert os.path.exists(path), f"{path} does not exist"
-        with open(path, "rb") as in_f:
-            return from_dict(tomllib.load(in_f), clazz=clazz)
-    elif file_like_or_str:
-        if isinstance(file_like_or_str, (bytes, bytearray)):
-            return from_dict(tomllib.loads(file_like_or_str.decode("utf-8")), clazz=clazz)
-        return from_dict(tomllib.loads(file_like_or_str), clazz=clazz)
-    else:
-        raise ValueError("expected file_like_or_str or path to not be None")
-
-
 def _yaml_safe(obj: Any) -> Any:
     # primitives
     if obj is None or isinstance(obj, (bool, int, float, str)):
@@ -167,7 +153,6 @@ def is_optional(x: type) -> bool:
 
 
 def _to_dict_value(x: T, field_type: type, metadata: dict, field_name: str = ""):
-
     t = type(x)
 
     # attrs specific
@@ -196,7 +181,6 @@ def _to_dict_value(x: T, field_type: type, metadata: dict, field_name: str = "")
     # general python types + dataclasses + attrs
     # * meta types
     elif field_type == type or field_type == abc.ABCMeta:
-
         return to_qualitified_name(x)
     elif get_origin(field_type) is type:
         return to_qualitified_name(x)
@@ -267,7 +251,7 @@ def to_dict(x: T, field_name: str = "", hydra_compat: bool = True) -> dict:
         if hydra_compat:
             result["_target_"] = to_qualitified_name(x.__class__)
         for f in fields(x):
-
+            # NOTE: defaults are unnecessary to encode
             if hydra_compat and f.name == "defaults":
                 continue
             result[f.name] = _to_dict_value(
@@ -286,7 +270,7 @@ def to_dict(x: T, field_name: str = "", hydra_compat: bool = True) -> dict:
         if hydra_compat:
             result["_target_"] = to_qualitified_name(x.__class__)
         for f in attrs.fields(x.__class__):
-
+            # NOTE: defaults are unnecessary to encode
             if hydra_compat and f.name == "defaults":
                 continue
             result[f.name] = _to_dict_value(
@@ -306,7 +290,6 @@ def _from_dict_value(
     force_construct_target: bool | None = None,
 ):
 
-
     is_dc_type = is_dataclass(field_type)
     is_attrs_type = is_attrs(field_type)
     origin = get_origin(field_type) or field_type
@@ -337,7 +320,7 @@ def _from_dict_value(
             assert not isinstance(x, str)
             return from_dict(x, field_type, field_name=field_name)
     elif field_type in (DictConfig, LazyDict) or origin in (dict,):
-
+        # NOTE: _recursive_ is the name of the flag for this behaviour
         construct_target = x.get("_recursive_", field_type == DictConfig)
         if force_construct_target is not None:
             construct_target = force_construct_target
@@ -403,13 +386,6 @@ def _from_dict_value(
         return x
     elif field_type is type(None) or field_type == Any:  # no typing
         return x
-    elif origin is Literal:
-        allowed = get_args(field_type)
-        if x not in allowed:
-            raise TypeError(
-                f"value {x!r} not in {field_type} (allowed={allowed}, field={field_name})"
-            )
-        return x
     else:
         raise TypeError(
             f"unexpected type: {field_type} (origin={origin}, concrete_type={concrete_type}, args={args}, x={x})"
diff --git a/cosmos_framework/utils/training_telemetry/callback.py b/cosmos_framework/utils/training_telemetry/callback.py
index 341792f..2951f0b 100644
--- a/cosmos_framework/utils/training_telemetry/callback.py
+++ b/cosmos_framework/utils/training_telemetry/callback.py
@@ -197,8 +197,8 @@ def on_training_step_end(
                 average_forward_time=avg_forward_time,
                 average_backward_time=avg_backward_time,
                 average_dataloader_time=avg_dataloader_time,
-                tflops=0.0,
-                tokens_per_second=0.0,
+                tflops=0.0,  # FIXME: is this available?
+                tokens_per_second=0.0,  # FIXME: is this available?
                 loss=loss.item(),
                 batch_size=batch_size,
             )
diff --git a/cosmos_framework/utils/training_telemetry/utils.py b/cosmos_framework/utils/training_telemetry/utils.py
index 374565e..00e5c91 100644
--- a/cosmos_framework/utils/training_telemetry/utils.py
+++ b/cosmos_framework/utils/training_telemetry/utils.py
@@ -26,7 +26,10 @@ def import_training_telemetry() -> Optional[ModuleType]:
         __training_telemetry_module = importlib.import_module("training_telemetry")
         return __training_telemetry_module
     except ImportError as e:
-        logger.error(f"Telemetry is enabled but the `training_telemetry` package is not installed: {e}")
+        logger.error(f"Heimdall telemetry is enabled but package is not installed: {e}")
+        logger.info(
+            "Please install the package using `pip install aidot-training-telemetry --index-url=https://invalid_url`"
+        )
         return None
 
 
@@ -36,7 +39,9 @@ def set_telemetry_provider(local_path: str) -> Optional[Any]:
     """
     global __enable_telemetry
     if not __enable_telemetry:
-        logger.info("Training telemetry is disabled. Set ENABLE_TELEMETRY=true to enable it.")
+        logger.info(
+            "Heimdall telemetry is disabled, if using Heimdall,consider setting ENABLE_TELEMETRY=true to enable it"
+        )
         return None
 
     global __provider
@@ -46,8 +51,7 @@ def set_telemetry_provider(local_path: str) -> Optional[Any]:
     training_telemetry = import_training_telemetry()
     if training_telemetry is None:
         logger.error(
-            "Training telemetry is enabled but the `training_telemetry` package is not installed. "
-            "Set ENABLE_TELEMETRY=false to disable, or install the package."
+            "Heimdall telemetry is enabled but package is not installed, consider setting ENABLE_TELEMETRY=false to disable it, or install the package using `pip install aidot-training-telemetry --index-url=https://invalid_url`"
         )
         __enable_telemetry = False
         return None
diff --git a/cosmos_framework/utils/vfm/data_utils.py b/cosmos_framework/utils/vfm/data_utils.py
index d9d97be..4ffb1e2 100644
--- a/cosmos_framework/utils/vfm/data_utils.py
+++ b/cosmos_framework/utils/vfm/data_utils.py
@@ -53,13 +53,13 @@ def slice_data_batch(
     data_batch: dict[str, Any],
     start: int,
     limit: int,
-    multi_item_fields: Iterable[str] = ("image", "video", "image_size"),
+    multi_item_fields: Iterable[str] = ("image", "images", "video", "videos", "image_size"),
 ) -> dict[str, Any]:
     """Slice a data batch based on the start and limit indices.
 
     For most fields, the slice ``[start:limit]`` is applied directly along the
-    sample dimension. For fields listed in ``multi_item_fields`` (e.g. ``image``
-    and ``video``), each sample may contribute multiple visual items that are
+    sample dimension. For fields listed in ``multi_item_fields`` (e.g. ``image``,
+    ``images``, ``video``, and ``videos``), each sample may contribute multiple visual items that are
     concatenated in flat order. In that case, when
     ``num_vision_items_per_sample`` is present in ``data_batch``, the slice is
     expanded to cover all visual items belonging to the requested samples.
diff --git a/cosmos_framework/utils/vfm/flash_attn.py b/cosmos_framework/utils/vfm/flash_attn.py
index ce671f9..078dab9 100644
--- a/cosmos_framework/utils/vfm/flash_attn.py
+++ b/cosmos_framework/utils/vfm/flash_attn.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """Flash attention initialization for the vfm/ unified VLM training path.
 
 This module replaces `cosmos_rl.policy.kernel.modeling_utils.init_flash_attn_meta`
diff --git a/cosmos_framework/utils/vfm/hf_attention_cosmos.py b/cosmos_framework/utils/vfm/hf_attention_cosmos.py
index 3037610..a25c43e 100644
--- a/cosmos_framework/utils/vfm/hf_attention_cosmos.py
+++ b/cosmos_framework/utils/vfm/hf_attention_cosmos.py
@@ -1,19 +1,19 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
 
-"""HF ``ALL_ATTENTION_FUNCTIONS`` adapter delegating to ``imaginaire.attention``.
+"""HF ``ALL_ATTENTION_FUNCTIONS`` adapter delegating to ``cosmos_framework.model.attention``.
 
 Registered as the ``"cosmos"`` entry in HF's attention dispatch.
-``imaginaire.attention`` owns backend selection (cuDNN / NATTEN / flash2 /
+``cosmos_framework.model.attention`` owns backend selection (cuDNN / NATTEN / flash2 /
 flash3); to fall back to HF's own flash_attention_2 set
 ``policy.attn_implementation=flash_attention_2``.
 
 Layout: HF passes Q/K/V as BHSD ``[B, num_heads, N, head_dim]`` and expects
-BSHD output. ``imaginaire.attention`` is BSHD throughout, so we transpose on
+BSHD output. ``cosmos_framework.model.attention`` is BSHD throughout, so we transpose on
 entry; output layout already matches HF's expected return.
 
 Strict guards (raise rather than silently break loss parity):
-- ``dropout > 0`` — ``imaginaire.attention`` has no dropout parameter.
+- ``dropout > 0`` — ``cosmos_framework.model.attention`` has no dropout parameter.
   Qwen3-VL has ``attention_dropout=0`` so this never triggers in practice.
 - ``attention_mask is not None`` — adapter expects causal mask via
   ``is_causal=True`` (and no padding, i.e. Qwen3-VL VLM training with
@@ -70,7 +70,7 @@ def hf_attention_cosmos(
     v = value.transpose(1, 2)
 
     # Cast fp32 -> bf16 if needed.
-    # imaginaire's flash2/flash3/cuDNN backends only accept fp16/bf16; NATTEN
+    # cosmos_framework's flash2/flash3/cuDNN backends only accept fp16/bf16; NATTEN
     # also accepts fp32 but routing fp32 attention loses Tensor Core
     # acceleration (10-20x slower). HF's flash_attention_2 internally casts
     # fp32->bf16 and we replicate that so this adapter is a drop-in replacement
diff --git a/cosmos_framework/utils/vfm/model_loader.py b/cosmos_framework/utils/vfm/model_loader.py
index eae2487..c180ca3 100644
--- a/cosmos_framework/utils/vfm/model_loader.py
+++ b/cosmos_framework/utils/vfm/model_loader.py
@@ -280,7 +280,7 @@ def load_model_from_checkpoint(
               * **safetensors**: a directory containing one or more
                 ``*.safetensors`` shards in the native Cosmos3 VFM state-dict
                 layout.  Loaded via
-                :func:`projects.cosmos3.vfm.models.utils.safetensors_loader.load_vfm_model`.
+                :func:`cosmos_framework.model.vfm.utils.safetensors_loader.load_vfm_model`.
                 No ``/model`` suffix is appended.
         credential_path: Path to credentials file (if required for remote storage). Optional.
         enable_gcs_patch_in_boto3: Whether to enable the boto3 patch for GCS S3-compatibility.
diff --git a/cosmos_framework/utils/vfm/optimizer.py b/cosmos_framework/utils/vfm/optimizer.py
index 2fd7695..b9947a6 100644
--- a/cosmos_framework/utils/vfm/optimizer.py
+++ b/cosmos_framework/utils/vfm/optimizer.py
@@ -44,7 +44,7 @@ def _optimizer_cls(
     - ``"adam"`` / ``"adamw"``: forwarded to ``torch.optim.Adam`` /
       ``torch.optim.AdamW``.  ``fused`` (if present in ``optimizer_kwargs``)
       flows through and selects the fused CUDA kernel.
-    - ``"fusedadam"``: NVIDIA's :class:`projects.cosmos3.vfm.utils.fused_adam.FusedAdam`.
+    - ``"fusedadam"``: NVIDIA's :class:`cosmos_framework.utils.vfm.fused_adam.FusedAdam`.
       It is fused by construction and rejects a ``fused`` kwarg, so any
       ``fused`` entry is popped before instantiation.  We also force
       ``capturable=True`` and ``master_weights=True`` because those are the
@@ -341,14 +341,6 @@ def state_dict(self) -> dict[str, Any]:
         )
 
     def load_state_dict(self, state_dict: dict[str, Any]) -> None:
-        # Backward compat with old VLM checkpoints that prefix every key with
-        # "optimizer_0/" (the legacy list-of-optimizers layout; cosmos3 only
-        # ever ran with N=1). Strip the prefix transparently so those
-        # checkpoints continue to resume.
-        legacy_prefix = "optimizer_0/"
-        if any(k.startswith(legacy_prefix) for k in state_dict):
-            state_dict = {k.removeprefix(legacy_prefix): v for k, v in state_dict.items()}
-
         set_optimizer_state_dict(
             model=self.model,
             optimizers=self.optimizers,
diff --git a/cosmos_framework/utils/vfm/parallelism.py b/cosmos_framework/utils/vfm/parallelism.py
index e6fe795..58e88f1 100644
--- a/cosmos_framework/utils/vfm/parallelism.py
+++ b/cosmos_framework/utils/vfm/parallelism.py
@@ -1,5 +1,6 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
+
 """Unified ParallelDims for Cosmos3 VFM and VLM (multi-mesh, overlay design).
 
 Topology
@@ -27,8 +28,8 @@
 - VFM inference     — ``dp_shard`` + cfgp/cp overlays; replicate forced to 1.
 
 FSDP wrapping for VLM ``HFModel`` instances lives in
-``projects.cosmos3.vfm.models.parallelize_vlm``; MoT wrapping lives in
-``projects.cosmos3.vfm.models.mot.parallelize_unified_mot``.  Both consume
+``cosmos_framework.model.vfm.parallelize_vlm``; MoT wrapping lives in
+``cosmos_framework.model.vfm.mot.parallelize_unified_mot``.  Both consume
 ``ParallelDims`` from this module.
 """
 
diff --git a/cosmos_framework/utils/vfm/video_preprocess.py b/cosmos_framework/utils/vfm/video_preprocess.py
new file mode 100644
index 0000000..1b633e2
--- /dev/null
+++ b/cosmos_framework/utils/vfm/video_preprocess.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: OpenMDW-1.1
+
+import numpy as np
+import torch
+from PIL import Image
+
+
+def tensor_to_pil_images(video_tensor: torch.Tensor) -> list[Image.Image]:
+    """Convert a video tensor of shape (C, T, H, W) or (T, C, H, W) into a list of PIL images.
+
+    Args:
+        video_tensor: Video tensor with shape (C, T, H, W) or (T, C, H, W). Float
+            inputs whose maximum is <= 1.0 are scaled by 255; other floats are
+            taken to be in [0, 255] already.
+
+    Returns:
+        One PIL image per frame.
+    """
+    # (C, T, H, W) -> (T, C, H, W)
+    if video_tensor.shape[0] == 3 and video_tensor.shape[1] > 3:
+        video_tensor = video_tensor.permute(1, 0, 2, 3)
+
+    # (T, C, H, W) -> (T, H, W, C); detach so tensors that require grad convert too.
+    video_np = video_tensor.detach().permute(0, 2, 3, 1).cpu().numpy()
+
+    # PIL expects uint8 in [0, 255]; clip first so out-of-range floats saturate, not wrap.
+    if np.issubdtype(video_np.dtype, np.floating):
+        scale = 255 if video_np.size and video_np.max() <= 1.0 else 1
+        video_np = np.clip(video_np * scale, 0, 255).astype(np.uint8)
+
+    return [Image.fromarray(frame) for frame in video_np]
diff --git a/cosmos_framework/utils/vfm/vlm/__init__.py b/cosmos_framework/utils/vfm/vlm/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/utils/vfm/vlm/__init__.py
+++ b/cosmos_framework/utils/vfm/vlm/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/utils/vfm/vlm/flop_calculator.py b/cosmos_framework/utils/vfm/vlm/flop_calculator.py
index cf112c8..90e8aef 100644
--- a/cosmos_framework/utils/vfm/vlm/flop_calculator.py
+++ b/cosmos_framework/utils/vfm/vlm/flop_calculator.py
@@ -25,7 +25,6 @@ class FlopCalculator:
     # estimator to underestimate per-sample work and the dynamic batcher to
     # pack batches too large. Keep this False until the slope and intercept
     # are refit against is_causal=True benchmark data.
-
     # benchmark runs and flip _IS_CAUSAL_FOR_CALIBRATION to True so this
     # calculator inherits the algorithmically correct FLOP count by default.
     _IS_CAUSAL_FOR_CALIBRATION: bool = False
diff --git a/cosmos_framework/utils/vfm/vlm/pretrained_models_downloader.py b/cosmos_framework/utils/vfm/vlm/pretrained_models_downloader.py
index a3921ae..54c18cb 100644
--- a/cosmos_framework/utils/vfm/vlm/pretrained_models_downloader.py
+++ b/cosmos_framework/utils/vfm/vlm/pretrained_models_downloader.py
@@ -193,16 +193,11 @@ def maybe_download_hf_model_from_s3(
     s3_prefix: str = "cosmos_reason2/hf_models",
     require_s3_exists: bool = False,
 ) -> str:
-    # Short-circuit when model_name_or_path is already a local directory — no
-    # S3 or HF Hub fetch is needed. Prevents opening credentials/*.secret
-    # in OSS/local-checkpoint smoke runs that already have the model on disk.
-    if os.path.isdir(model_name_or_path):
-        return model_name_or_path
     exclude_list = [".safetensors"] if not include_model_weights else []
     s3_prefix = os.path.join(s3_prefix, model_name_or_path)
     # download the model from s3 to local cache
     if cache_dir is None:
-        cache_dir = os.path.expanduser(os.getenv("IMAGINAIRE_CACHE_DIR", "~/.cache/imaginaire"))
+        cache_dir = os.path.expanduser(os.getenv("IMAGINAIRE_CACHE_DIR", "~/.cache/cosmos_framework"))
 
     cache_dir = os.path.join(cache_dir, s3_prefix)
 
diff --git a/cosmos_framework/utils/vlm/__init__.py b/cosmos_framework/utils/vlm/__init__.py
index 503ec1b..28a81be 100644
--- a/cosmos_framework/utils/vlm/__init__.py
+++ b/cosmos_framework/utils/vlm/__init__.py
@@ -1,3 +1,2 @@
 # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: OpenMDW-1.1
-
diff --git a/cosmos_framework/utils/vlm/compute_flops_qwen3vl.py b/cosmos_framework/utils/vlm/compute_flops_qwen3vl.py
index e64bb74..afe6051 100644
--- a/cosmos_framework/utils/vlm/compute_flops_qwen3vl.py
+++ b/cosmos_framework/utils/vlm/compute_flops_qwen3vl.py
@@ -8,7 +8,7 @@
 given the model configuration and input specifications (total tokens, visual tokens, etc.).
 
 Usage:
-    from cosmos_framework.utils.vlm.compute_flops_qwen3vl import compute_qwen3vl_flops
+    from cosmos_framework.utils.vlm.compute_flops_qwen3vl import compute_qwen3vl_flops
 
     flops = compute_qwen3vl_flops(
         num_text_layers=32,
@@ -480,7 +480,7 @@ def compute_qwen3vl_flops(
         flops_breakdown["vision_encoder"] = 0
 
     # Embedding layer FLOPs
-
+    # NOTE: Only text tokens need embeddings. Visual tokens are already embedded by vision encoder.
     text_tokens = total_tokens - visual_tokens
     if include_embeddings:
         # Embedding lookup: typically counted as 0 or hidden_size operations per token
diff --git a/cosmos_framework/utils/vlm/dcp_checkpointer.py b/cosmos_framework/utils/vlm/dcp_checkpointer.py
index 3ec7eef..6984cae 100644
--- a/cosmos_framework/utils/vlm/dcp_checkpointer.py
+++ b/cosmos_framework/utils/vlm/dcp_checkpointer.py
@@ -286,7 +286,6 @@ def load(
 
         iteration = 0
 
-
         if checkpoint_path is not None:
             self._check_checkpoint_exists(checkpoint_path)
             all_state_dicts = {}
@@ -385,7 +384,6 @@ def _async_with_pinned_memory(self, checkpoint_file: str, state_dict: dict[str,
             self.staging = True
             self.staging_ckpt_file = checkpoint_file
 
-
         self.maybe_wait_for_staging()
 
     def maybe_wait_for_staging(self) -> None:
diff --git a/cosmos_framework/utils/vlm/distributed.py b/cosmos_framework/utils/vlm/distributed.py
index a4cda9c..6b2dfff 100644
--- a/cosmos_framework/utils/vlm/distributed.py
+++ b/cosmos_framework/utils/vlm/distributed.py
@@ -141,7 +141,7 @@ def destroy_distributed():
 #             grads[i] = g.to_local()
 
 #     # create bucket for all grads, we can allreduce them in one go
-#
+#     # NOTE: Why don't we set DTensor as the bucket view?
 #     # This is becuase we can't be sure that the training framework
 #     # never release grad, or clean grad by set None.
 #     # Create temporary bucket is a more reliable solution.
@@ -215,7 +215,7 @@ def gradient_norm_clipping(
     # If total_norm is a DTensor, the placements must be `torch.distributed._tensor.ops.math_ops._NormPartial`.
     # We can simply reduce the DTensor to get the total norm in this tensor's process group
     # and then convert it to a local tensor.
-
+    # NOTE: It has two purposes:
     #       1. to make sure the total norm is computed correctly when PP is used (see below)
     #       2. to return a reduced total_norm tensor whose .item() would return the correct value
     if isinstance(total_norm, DTensor):
diff --git a/cosmos_framework/utils/vlm/optimizer.py b/cosmos_framework/utils/vlm/optimizer.py
index ebafe7f..fb74a7f 100644
--- a/cosmos_framework/utils/vlm/optimizer.py
+++ b/cosmos_framework/utils/vlm/optimizer.py
@@ -42,7 +42,6 @@ class OptimizerConfig:
 
 def _optimizer_cls(params: list[nn.Parameter], optimizer_kwargs: dict[str, Any], name: str):
     if name.lower() == "adam":
-
         optimizer = torch.optim.Adam(params, **optimizer_kwargs)
     elif name.lower() == "adamw":
         optimizer = torch.optim.AdamW(params, **optimizer_kwargs)
diff --git a/cosmos_framework/utils/vlm/pretrained_models_downloader.py b/cosmos_framework/utils/vlm/pretrained_models_downloader.py
index dbc1667..5fff313 100644
--- a/cosmos_framework/utils/vlm/pretrained_models_downloader.py
+++ b/cosmos_framework/utils/vlm/pretrained_models_downloader.py
@@ -160,7 +160,7 @@ def maybe_download_hf_model_from_s3(
     s3_prefix = os.path.join(s3_prefix, model_name_or_path)
     # download the model from s3 to local cache
     if cache_dir is None:
-        cache_dir = os.path.expanduser(os.getenv("IMAGINAIRE_CACHE_DIR", "~/.cache/imaginaire"))
+        cache_dir = os.path.expanduser(os.getenv("IMAGINAIRE_CACHE_DIR", "~/.cache/cosmos_framework"))
 
     cache_dir = os.path.join(cache_dir, s3_prefix)