From f34031ae881c96113fcf61a18bce0ae9dfb166f8 Mon Sep 17 00:00:00 2001
From: Hao Liang <haolia@nvidia.com>
Date: Tue, 16 Jun 2026 20:40:31 -0700
Subject: [PATCH 1/3] DROID action-policy recipe + multi-node launcher +
 episode-shuffle (rebased on main)

Rebased onto current main. main #34 upstreamed the DROID dataset (joint_pos,
use_state, keep-ranges filter, action_space) so droid_lerobot_dataset.py now
carries only the get_shuffle_blocks helper grafted onto main's version; #29's
recipe change (dropped /cluster override) is incorporated.

Remaining contribution: action_policy_droid_nano recipe (mode=policy,
lr=2e-4 @ 8192 global, max_num_tokens_after_packing=-1, scrubbed comments),
the episode-shuffle stream (action_sft_dataset.py), the multi-node-capable
SFT launcher (NNODES/NODE_RANK/MASTER_ADDR passthrough + EXTRA_TAIL_OVERRIDES),
and the post-train doc.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Hao Liang <haolia@nvidia.com>
---
 .../action_policy_droid_nano.py               |  60 ++++++----
 .../vfm/action/datasets/action_sft_dataset.py |  67 ++++++++++-
 .../action/datasets/droid_lerobot_dataset.py  |  16 +++
 docs/action_policy_droid_posttrain.md         | 111 ++++++++++--------
 examples/_sft_launcher_common.sh              |  14 ++-
 examples/launch_sft_action_policy_droid.sh    |  17 ++-
 6 files changed, 205 insertions(+), 80 deletions(-)

diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
index 4bc0c29..8880295 100644
--- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
+++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
@@ -5,9 +5,8 @@
 
 Mirrors the vision SFT stack (PackingDataLoader + RankPartitionedDataLoader),
 but feeds the DROID action dataset (``joint_pos`` 8D + ``use_state``, raw/
-un-normalized — same as the internal ``droid_lerobot_8b_policy`` run) through
-``ActionTransformPipeline``, and trains the generation + action heads from the
-public ``nvidia/Cosmos3-Nano`` base.
+un-normalized) through ``ActionTransformPipeline``, and trains the generation +
+action heads from the public ``nvidia/Cosmos3-Nano`` base.
 
 Usage (1 node, 8 GPU)::
 
@@ -41,13 +40,10 @@
             {"override /model": "mot_fsdp"},
             {"override /data_train": None},
             {"override /data_val": None},
-            # Match internal droid_lerobot_8b_policy: apex FusedAdam with fp32
-            # master_weights + eps 1e-8. adamw + fused + eps 1e-6 (bf16, no fp32
-            # master) under-steps the small 5x-lr action heads and leaves the action
-            # loss on a noisy high plateau; an exact-match forward/optimizer test
-            # confirmed the convergence gap was the optimizer, not the model.
+            # FusedAdam with fp32 master_weights + eps 1e-8 (adamw with bf16 params + eps
+            # 1e-6 left the action loss on a noisy high plateau).
             {"override /optimizer": "fusedadamw"},
-            {"override /scheduler": "lambdalinear"},  # matches internal droid_lerobot_8b (was lambdacosine)
+            {"override /scheduler": "lambdalinear"},  # linear LR decay
             {"override /checkpoint": "s3"},
             {
                 "override /callbacks": [
@@ -76,7 +72,7 @@
             betas=[0.9, 0.99],
             eps=1.0e-08,
             fused=True,  # popped by build_optimizer for FusedAdam (fused by construction)
-            # Generation + action heads (mirrors internal droid_lerobot_8b_policy).
+            # Train the generation + action heads.
             keys_to_select=[
                 "moe_gen",
                 "time_embedder",
@@ -86,7 +82,7 @@
                 "llm2action",
                 "action_modality_embed",
             ],
-            lr=2.0e-04,  # matches internal droid_lerobot_8b_policy submit (--lr 2e-4)
+            lr=2.0e-04,  # for the 8192 global batch
             lr_multipliers={
                 "action2llm": 5.0,
                 "llm2action": 5.0,
@@ -96,7 +92,7 @@
             weight_decay=0.05,
         ),
         scheduler=dict(
-            lr_scheduler_type="LambdaLinear",  # matches internal droid_lerobot_8b (was LambdaCosine)
+            lr_scheduler_type="LambdaLinear",
             cycle_lengths=[100],  # smoke: 100 iters (real run sets via TOML)
             f_max=[0.4],
             f_min=[0.0],
@@ -125,7 +121,7 @@
                 device_monitor=dict(
                     every_n=200, log_memory_detail=True, save_s3=False, step_size=1, upload_every_n_mul=5
                 ),
-                grad_clip=dict(clip_norm=1.0, force_finite=True),  # matches internal make_8b
+                grad_clip=dict(clip_norm=1.0, force_finite=True),
                 heart_beat=dict(every_n=200, save_s3=False, step_size=1, update_interval_in_minute=20),
                 iter_speed=dict(every_n=1, hit_thres=50, save_s3=False, save_s3_every_log_n=500),
                 low_precision=dict(update_iter=1),
@@ -140,10 +136,9 @@
             dcp_async_mode_enabled=False,
             enable_gcs_patch_in_boto3=True,
             keys_not_to_resume=[],
-            # Skip net_ema. (→ EMA warm-start copies net→net_ema, see dcp.py) AND the
-            # action heads, so they init fresh from the base — matches internal
-            # make_8b _DEFAULT_KEYS_TO_SKIP (Cosmos3-Nano's action heads are not
-            # DROID-policy-trained).
+            # Skip the "net_ema." keys (EMA warm-starts from net, see dcp.py) and the action
+            # heads, so the heads are freshly initialized instead of loaded (the base
+            # checkpoint has no DROID-trained action heads).
             keys_to_skip_loading=[
                 "net_ema.",
                 "action2llm",
@@ -171,7 +166,7 @@
         dataloader_train=L(PackingDataLoader)(
             audio_sample_rate=48000,
             dataset_name="action_droid",
-            max_samples_per_batch=128,  # count-based batch (matches internal res480 8B)
+            max_samples_per_batch=128,  # per rank -> 8192 global batch at 64 ranks (8 nodes x 8 GPUs: shard 8 x replicate 8)
             max_sequence_length=None,  # None disables token packing (TOML can't express null)
             patch_spatial=2,
             sound_latent_fps=0,
@@ -185,6 +180,13 @@
                 pin_memory=True,
                 prefetch_factor=4,
                 sampler=None,
+                # Shuffling is handled by the dataset (iterable_shuffle=True below):
+                # ActionIterableShuffleDataset streams rank x worker-sharded, episode-order-
+                # shuffled, sequential-within-episode. The map-style dataset has no internal
+                # shuffle, so a SequentialSampler would feed every rank the SAME consecutive
+                # overlapping windows -> global batch ~1 episode -> unstable grad-norm; a plain
+                # RandomSampler decorrelates but does random-access I/O -> slow + OOM. The
+                # iterable gives decorrelation with sequential reads.
                 datasets=dict(
                     droid=dict(
                         ratio=1,
@@ -193,15 +195,21 @@
                             fps=15.0,
                             chunk_length=32,
                             action_space="joint_pos",
+                            # Policy-only task mode. "joint" would randomly pick
+                            # forward_dynamics/inverse_dynamics/policy per sample (multi-task),
+                            # which dilutes each per-task loss by ~1/3.
+                            mode="policy",
                             use_state=True,
+                            iterable_shuffle=True,  # rank x worker episode-shuffle stream
+                            episode_shuffle_seed=42,
                             use_image_augmentation=True,  # SR boost (random crop+rescale + color jitter)
                             # Keep-ranges window filter (drops idle/non-task frames). Off by default;
-                            # the launcher sets use_filter_dict=True + filter_dict_path for internal parity.
+                            # set use_filter_dict=True + filter_dict_path to enable.
                             use_filter_dict=False,
                             filter_dict_path=None,
                             action_normalization=None,
                             viewpoint="concat_view",  # wrist 480p (top) + L/R shoulder 320x180 (bottom)
-                            resolution="480",  # 640x360 data @ 480p (matches internal res480 run)
+                            resolution="480",  # 640x360 data @ 480p
                             max_action_dim="${model.config.max_action_dim}",
                             cfg_dropout_rate=0.1,
                             tokenizer_config="${model.config.vlm_config.tokenizer}",
@@ -217,12 +225,18 @@
 )
 
 
-# chunk_length=32 → 33 observation frames; pin the VAE encode duration to match
-# (internal used [17] for chunk_length=16). Set post-construction so it lands on
-# the deep-copied NANO_MODEL_CONFIG.tokenizer.
+# chunk_length=32 -> 33 observation frames; pin the VAE encode duration to match.
+# Set post-construction so it lands on the deep-copied NANO_MODEL_CONFIG.tokenizer.
 action_policy_droid_nano["model"]["config"]["tokenizer"]["encode_exact_durations"] = [33]
 
 
+# Uncap the packed-sequence length. The NANO default (45056) caps the packed sequence,
+# truncating long DROID windows to ~1/4 of their natural length; -1 (uncapped) processes
+# the full vision sequence per step. Does not change the per-token loss; widens the
+# effective vision context per step.
+action_policy_droid_nano["model"]["config"]["max_num_tokens_after_packing"] = -1
+
+
 for _item in [action_policy_droid_nano]:
     _name = [k for k, v in globals().items() if v is _item][0]
     cs.store(group="experiment", package="_global_", name=_name, node=_item)
diff --git a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
index 5d5b74e..1790de5 100644
--- a/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
+++ b/cosmos_framework/data/vfm/action/datasets/action_sft_dataset.py
@@ -16,7 +16,7 @@
 
 from typing import Any
 
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, IterableDataset, get_worker_info
 
 from cosmos_framework.data.vfm.action.datasets.droid_lerobot_dataset import DROIDLeRobotDataset
 from cosmos_framework.data.vfm.action.transforms import ActionTransformPipeline
@@ -37,6 +37,55 @@ def __len__(self) -> int:
     def __getitem__(self, idx: int) -> dict[str, Any]:
         return self._transform(self._dataset[idx], self._resolution)
 
+    def get_shuffle_blocks(self):
+        """Return the inner dataset's ``(start, length)`` flat-index shuffle blocks unchanged."""
+        return self._dataset.get_shuffle_blocks()
+
+
+
+class ActionIterableShuffleDataset(IterableDataset):
+    """Streaming view of a map-style ``ActionSFTDataset``.
+
+    Each ``(rank, worker)`` takes a DISJOINT strided slice of the shuffled episode
+    blocks (sharded over ``shard_world_size * num_workers``) and streams the windows
+    WITHIN each block sequentially: batches mix episodes across workers and ranks
+    while reads stay sequential. The order is re-drawn each epoch from ``seed + epoch``
+    and iteration never ends (the trainer stops at ``max_iter``). Iterating raises
+    ``ValueError`` for a shard that owns no block, because it could never yield.
+
+    ``shard_world_size`` / ``shard_rank`` are set by ``RankPartitionedDataLoader``.
+    """
+
+    def __init__(self, dataset: "ActionSFTDataset", seed: int = 42):
+        super().__init__()
+        self._dataset = dataset
+        self._seed = int(seed)
+        self.shard_world_size = 1
+        self.shard_rank = 0
+
+    def __len__(self) -> int:  # informational only; iteration is infinite
+        return len(self._dataset)
+
+    def __iter__(self):
+        import torch
+
+        blocks = self._dataset.get_shuffle_blocks()
+        wi = get_worker_info()
+        wid, nw = (wi.id, wi.num_workers) if wi is not None else (0, 1)
+        global_shard = int(self.shard_rank) * nw + wid
+        total_shards = max(1, int(self.shard_world_size) * nw)
+        if len(blocks) <= global_shard:  # empty strided slice: the loop below would spin without yielding
+            raise ValueError(f"shard {global_shard}/{total_shards} has no episodes ({len(blocks)} shuffle blocks)")
+        epoch = 0
+        while True:
+            # Same seed on every (rank, worker) -> same permutation -> the strided slices are disjoint.
+            g = torch.Generator().manual_seed(self._seed + epoch)
+            order = torch.randperm(len(blocks), generator=g).tolist()
+            for b in order[global_shard::total_shards]:
+                start, length = blocks[b]
+                for idx in range(start, start + length):
+                    yield self._dataset[idx]
+            epoch += 1
+
 
 def get_action_droid_sft_dataset(
     *,
@@ -44,6 +93,7 @@ def get_action_droid_sft_dataset(
     fps: float = 15.0,
     chunk_length: int = 32,
     action_space: str = "joint_pos",
+    mode: str = "policy",
     use_state: bool = True,
     action_normalization: str | None = None,
     viewpoint: str = "concat_view",
@@ -58,16 +108,18 @@ def get_action_droid_sft_dataset(
     append_duration_fps_timestamps: bool = True,
     append_resolution_info: bool = True,
     append_idle_frames: bool = False,
-) -> ActionSFTDataset:
-    """Build the DROID action SFT dataset (joint_pos 8D by default), matching the
-    internal ``droid_lerobot_8b_policy`` data: ``action_space='joint_pos'`` +
-    ``use_state`` (8D, raw/un-normalized), concat_view, chunk_length 32."""
+    iterable_shuffle: bool = False,
+    episode_shuffle_seed: int = 42,
+) -> Dataset:
+    """Build the DROID action SFT dataset: ``action_space='joint_pos'`` (8D) +
+    ``use_state`` (raw/un-normalized), concat_view, chunk_length 32."""
     dataset = DROIDLeRobotDataset(
         root=root,
         fps=fps,
         chunk_length=chunk_length,
         viewpoint=viewpoint,
         action_space=action_space,
+        mode=mode,
         use_state=use_state,
         action_normalization=action_normalization,
         use_image_augmentation=use_image_augmentation,
@@ -83,4 +135,7 @@ def get_action_droid_sft_dataset(
         append_resolution_info=append_resolution_info,
         append_idle_frames=append_idle_frames,
     )
-    return ActionSFTDataset(dataset, transform, resolution)
+    sft = ActionSFTDataset(dataset, transform, resolution)
+    if iterable_shuffle:
+        return ActionIterableShuffleDataset(sft, seed=episode_shuffle_seed)
+    return sft
diff --git a/cosmos_framework/data/vfm/action/datasets/droid_lerobot_dataset.py b/cosmos_framework/data/vfm/action/datasets/droid_lerobot_dataset.py
index 631f1e9..3bd1859 100644
--- a/cosmos_framework/data/vfm/action/datasets/droid_lerobot_dataset.py
+++ b/cosmos_framework/data/vfm/action/datasets/droid_lerobot_dataset.py
@@ -351,3 +351,19 @@ def __len__(self) -> int:
         if self._use_filter_dict:
             return int(self._seg_cum[-1]) if self._seg_cum.size else 0
         return int(self._valid_cum[-1]) if self._valid_cum.size else 0
+
+    def get_shuffle_blocks(self) -> list[tuple[int, int]]:
+        """Per-episode (or per kept-segment, when ``use_filter_dict``) flat-index blocks
+        ``(start, length)``, derived from consecutive cumulative counts; entries that add
+        no windows are skipped. ``ActionIterableShuffleDataset`` shuffles the ORDER of
+        these blocks and shards them disjointly across ranks, while keeping windows
+        *within* a block sequential -> decorrelated batches without random-access I/O."""
+        cum = self._seg_cum if self._use_filter_dict else self._valid_cum
+        blocks: list[tuple[int, int]] = []
+        prev = 0
+        for c in np.asarray(cum).tolist():
+            c = int(c)
+            if c > prev:
+                blocks.append((prev, c - prev))
+            prev = c
+        return blocks
diff --git a/docs/action_policy_droid_posttrain.md b/docs/action_policy_droid_posttrain.md
index ac5c3fb..dbd46da 100644
--- a/docs/action_policy_droid_posttrain.md
+++ b/docs/action_policy_droid_posttrain.md
@@ -1,23 +1,32 @@
-<!-- SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -->
-<!-- SPDX-License-Identifier: OpenMDW-1.1 -->
+# Cosmos3-Nano-Policy-DROID Post-Training
 
-# DROID Action-Policy Post-Training — `Cosmos3-Nano-Policy-DROID`
+[Cosmos3-Nano-Policy-DROID](https://huggingface.co/nvidia/Cosmos3-Nano-Policy-DROID) is an action policy fine-tuned from [`Cosmos3-Nano`](https://huggingface.co/nvidia/Cosmos3-Nano) (the 8B MoT) on the **DROID LeRobot** dataset, using absolute joint-position actions plus proprioceptive state at 480p. This example reproduces that post-training. The registered `action_policy_droid_nano` experiment, the DROID action dataset class (`joint_pos` 8-D + `use_state`), and the EMA warm-start fix all ship in this package; you supply two external inputs — a prepared DROID LeRobot dataset and a DCP base checkpoint converted from `nvidia/Cosmos3-Nano` (see [Inputs You Provide](#inputs-you-provide)). Validated end-to-end on H200: 1 node / 8 GPU and 2 nodes / 16 ranks (HSDP).
 
-> **STATUS: recipe ships in this package.** The registered experiment, the DROID action
-> dataset class (`joint_pos` 8D + `use_state`), and the EMA warm-start fix land here.
-> To run it you supply two external inputs — a prepared **DROID LeRobot v3.0** dataset and
-> a **DCP base checkpoint** converted from `nvidia/Cosmos3-Nano` (see
-> [Inputs you provide](#inputs-you-provide)). Validated end-to-end on H200: 1 node / 8 GPU
-> and 2 nodes / 16 ranks (HSDP).
+<!--TOC-->
 
-Fine-tune `Cosmos3-Nano` (the 8B MoT) into an action policy on the **DROID LeRobot** dataset,
-reproducing `Cosmos3-Nano-Policy-DROID`. The policy is initialized from **`nvidia/Cosmos3-Nano`**
-(public Hugging Face repo) and trained with absolute joint-position actions + proprioceptive
-state at 480p.
+______________________________________________________________________
+
+**Table of Contents**
+
+- [Inputs You Provide](#inputs-you-provide)
+- [Dataset](#dataset)
+- [Recipe](#recipe)
+- [Full Reproduction](#full-reproduction)
+- [Checkpoints](#checkpoints)
 
 ______________________________________________________________________
 
-## Inputs you provide
+<!--TOC-->
+
+Prerequisites:
+
+- [Setup](../README.md#setup) — clone the repo, install the training extras (`uv sync --all-extras --group=cu130-train`), and activate the environment.
+- [Environment Variables](./environment_variables.md)
+- [FAQ](./faq.md) — troubleshooting (OOM during SFT, defaults), common pitfalls.
+
+The runnable artifacts (TOML recipe, paired launch shell) live in [`examples/`](../examples/README.md); all commands below run from the repo root with the environment activated.
+
+## Inputs You Provide
 
 This package ships the training stack — the registered `action_policy_droid_nano` experiment,
 the DROID action dataset class with the recipe knobs (`action_space=joint_pos`, `use_state`,
@@ -28,45 +37,63 @@ be provided per environment:
    filtering is run out-of-band (not yet in this repo). Point `DROID_ROOT` at the resulting
    `…/droid_lerobot/success` directory (must contain `meta/info.json`).
 2. **DCP base checkpoint** — convert `nvidia/Cosmos3-Nano` to DCP and point
-   `BASE_CHECKPOINT_PATH` at it (see [Full reproduction](#full-reproduction)). Action heads are
+   `BASE_CHECKPOINT_PATH` at it (see [Full Reproduction](#full-reproduction)). Action heads are
    not loaded from it (they init fresh).
 
-## Dataset — DROID LeRobot
+## Dataset
 
-To be released.
+The **DROID LeRobot** dataset. To be released.
 
 ## Recipe
 
-| knob              | value                                                               |
-| ----------------- | ------------------------------------------------------------------- |
-| init              | `nvidia/Cosmos3-Nano` (public Hugging Face repo)                    |
-| action space      | `joint_pos` (absolute joint position, 8-D incl. gripper)            |
-| state             | `use_state=true` (proprioception; valid only with `joint_pos`)      |
-| resolution        | `480`                                                               |
-| viewpoint / video | `concat_view` / `video_mode=null`                                   |
-| chunk length      | `32` (tokenizer `encode_exact_durations=[33]`)                      |
-| lr                | `2e-4`                                                              |
-| samples/rank      | `32` (H200-safe; 64 OOMs at 480p). global batch = `32 × world_size` |
-| eval              | disabled for the reproduction run                                   |
-
-## Full reproduction
+| knob              | value                                                                                                 |
+| ----------------- | ----------------------------------------------------------------------------------------------------- |
+| init              | `nvidia/Cosmos3-Nano` (public Hugging Face repo)                                                      |
+| action space      | `joint_pos` (absolute joint position, 8-D incl. gripper)                                              |
+| state             | `use_state=true` (proprioception; valid only with `joint_pos`)                                        |
+| task mode         | `policy` (single-task; the `joint` multi-task default is avoided)                                     |
+| resolution        | `480`                                                                                                 |
+| viewpoint / video | `concat_view` / `video_mode=null`                                                                     |
+| chunk length      | `32` (tokenizer `encode_exact_durations=[33]`)                                                        |
+| sequence packing  | `max_num_tokens_after_packing=-1` (full vision sequence per step)                                     |
+| shuffle           | episode-shuffle stream (decorrelates the per-step global batch)                                       |
+| window filter     | keep-ranges (`KarlP/droid`) — trains the curated ≈74% window set                                      |
+| lr                | `2e-4`                                                                                                |
+| global batch      | `8192` (e.g. 128 samples/rank × 64 ranks; lower per-rank + raise `grad_accum_iter` to fit GPU memory) |
+| eval              | disabled for the reproduction run                                                                     |
+
+> The dataset streams an **episode-shuffle** order (decorrelates the per-step global batch — a
+> plain sequential read feeds every rank the same overlapping windows → unstable grad-norm). The
+> **keep-ranges window filter** drops idle/non-task frames (trains the curated ≈74% window set);
+> it is off in the registered experiment; the reproduction turns it on via `EXTRA_TAIL_OVERRIDES` — see [Full Reproduction](#full-reproduction).
+
+## Full Reproduction
 
 The OSS flow mirrors the other recipes (see [docs/training.md](./training.md)):
 
 ```shell
-# Step 1: prepare DROID LeRobot v3.0 success split -> $DATASET_PATH (see "Inputs you provide")
+# Step 1: prepare DROID LeRobot v3.0 success split -> $DATASET_PATH (see "Inputs You Provide")
 
 # Step 2: convert the base checkpoint -> $BASE_CHECKPOINT_PATH
 python -m cosmos_framework.scripts.convert_model_to_dcp \
-  --checkpoint-path Cosmos3-Nano \
-  -o $BASE_CHECKPOINT_PATH 
+  -o $BASE_CHECKPOINT_PATH \
+  --checkpoint-path Cosmos3-Nano
+
+# Step 3: download the keep-ranges filter (drops idle/non-task frames -> curated ~74% window set, matching the released model).
+export FILTER_DIR=/path/to/filter_dir
+hf download KarlP/droid keep_ranges_1_0_1.json --local-dir $FILTER_DIR
 
-# Step 3: launch. The TOML selects the experiment + scalars; the dataset/action
+# Step 4: launch. The TOML selects the experiment + scalars; the dataset/action
 # knobs come from the registered experiment.
 export DATASET_PATH=/path/to/dataset/success
 export BASE_CHECKPOINT_PATH=/path/to/base_checkpoint
 export WAN_VAE_PATH=/path/to/Wan2.2_VAE.pth
 export NPROC_PER_NODE=8
+# Enable the keep-ranges filter via EXTRA_TAIL_OVERRIDES (space-separated Hydra
+# overrides; an exported string survives `bash <wrapper>`).
+export EXTRA_TAIL_OVERRIDES="\
+dataloader_train.dataloader.datasets.droid.dataset.use_filter_dict=True \
+dataloader_train.dataloader.datasets.droid.dataset.filter_dict_path=$FILTER_DIR/keep_ranges_1_0_1.json"
 bash examples/launch_sft_action_policy_droid.sh
 ```
 
@@ -74,14 +101,11 @@ The recipe TOML (`examples/toml/sft_config/action_policy_droid_repro.toml`) sets
 knobs (`max_iter`, `save_iter`, `grad_clip`, parallelism, wandb); the dataset/action knobs
 (`joint_pos`, `use_state`, `concat_view`, 480p, chunk 32, count-based batch) live in the
 registered `action_policy_droid_nano` experiment per the schema's design. For multi-node HSDP,
-set `model.parallelism.data_parallel_replicate_degree = <num_nodes>` (intra-node shard stays 8).
+set `model.config.parallelism.data_parallel_replicate_degree = <num_nodes>` (intra-node shard stays 8).
 
-## Smoke reproduction
-
-Config/import/data sanity without burning a full run: small node count + a handful of iters via
-`--config-overrides "trainer.max_iter=10" "checkpoint.save_iter=10"` (and a small
-`data_parallel_shard_degree`). Use this to validate the recipe composes and the dataset opens
-before any large allocation.
+The **keep-ranges filter** maps each DROID trajectory key to a list of `[start, end]` frame
+ranges; only windows whose start falls inside a kept range are trained on (episodes absent from
+the dict are dropped). To train on the full window set instead, leave `EXTRA_TAIL_OVERRIDES` unset.
 
 ## Checkpoints
 
@@ -89,8 +113,3 @@ before any large allocation.
   `<bucket>/<project>/<group>/<job.name>/checkpoints/iter_<N>/`.
 - The run is **resumable** from the latest checkpoint (re-launch with the same `job.name`).
 - Export to HF safetensors via `cosmos_framework.scripts.export_model` (see [docs/training.md](./training.md)).
-
-## Non-goals
-
-- **Closed-loop / action evaluation is out of scope** for this reproduction pass (training
-  reproduction only), unless explicitly expanded.
diff --git a/examples/_sft_launcher_common.sh b/examples/_sft_launcher_common.sh
index 684acf8..5920ae8 100644
--- a/examples/_sft_launcher_common.sh
+++ b/examples/_sft_launcher_common.sh
@@ -20,6 +20,10 @@
 #                        (e.g. data_setting.max_tokens=16000 for VLM smokes).
 #   MASTER_PORT          torchrun --master_port; default 50012.
 #   NPROC_PER_NODE       torchrun --nproc_per_node; default 8.
+#   NNODES               torchrun --nnodes; multi-node only (unset = single-node).
+#   NODE_RANK            torchrun --node_rank; this worker's 0-based index.
+#   MASTER_ADDR          torchrun --master_addr; rank-0 host (multi-node only — it
+#                        has no torchrun env fallback, so it must be passed here).
 #   LOG_FILENAME         override $LOG_DIR/${LOG_FILENAME}
 #                        (default <toml-stem>_sft.log).
 #
@@ -83,8 +87,16 @@ if (( ${#TAIL_OVERRIDES[@]} > 0 )); then
     TRAILING_ARGS=(-- "${TAIL_OVERRIDES[@]}")
 fi
 
+# torchrun topology. Single-node by default; a SLURM/Lepton wrapper sets NNODES /
+# NODE_RANK / MASTER_ADDR for multi-node. Each is appended only when set, so with all
+# three unset the invocation is identical to the single-node case.
+TORCHRUN_ARGS=(--nproc_per_node="${NPROC_PER_NODE:-8}" --master_port="${MASTER_PORT:-50012}")
+[[ -n "${NNODES:-}" ]]      && TORCHRUN_ARGS+=(--nnodes="$NNODES")
+[[ -n "${NODE_RANK:-}" ]]   && TORCHRUN_ARGS+=(--node_rank="$NODE_RANK")
+[[ -n "${MASTER_ADDR:-}" ]] && TORCHRUN_ARGS+=(--master_addr="$MASTER_ADDR")
+
 IMAGINAIRE_OUTPUT_ROOT="$IMAGINAIRE_OUTPUT_ROOT" PYTHONPATH=. \
-    torchrun --nproc_per_node="${NPROC_PER_NODE:-8}" --master_port="${MASTER_PORT:-50012}" -m cosmos_framework.scripts.train \
+    torchrun "${TORCHRUN_ARGS[@]}" -m cosmos_framework.scripts.train \
     --sft-toml="$TOML_FILE" \
     "${TRAILING_ARGS[@]}" \
     2>&1 | tee "$LOG_FILE"
diff --git a/examples/launch_sft_action_policy_droid.sh b/examples/launch_sft_action_policy_droid.sh
index 6ab0bc9..a6b80ee 100755
--- a/examples/launch_sft_action_policy_droid.sh
+++ b/examples/launch_sft_action_policy_droid.sh
@@ -8,7 +8,7 @@
 # examples/toml/sft_config/action_policy_droid_repro.toml (selects the
 # registered `action_policy_droid_nano` experiment; res480, joint_pos 8D +
 # use_state, trains the generation + action heads). See
-# docs/action_policy_droid_posttraining.md.
+# docs/action_policy_droid_posttrain.md.
 #
 # Env vars (override for your filesystem):
 #   DATASET_PATH          DROID LeRobot v3.0 success split (…/droid_lerobot/success)
@@ -16,10 +16,11 @@
 #   WAN_VAE_PATH          Wan2.2 VAE .pth (Wan-AI/Wan2.2-TI2V-5B)
 #   WANDB_API_KEY         for online logging (TOML wandb_mode="online")
 #   NPROC_PER_NODE        torchrun --nproc_per_node (default 8)
+#   EXTRA_TAIL_OVERRIDES  space-separated Hydra overrides (e.g. the keep-ranges filter)
 #
 # Single-node smoke (config/data sanity, a few iters):
-#   TAIL_OVERRIDES=(trainer.max_iter=10 checkpoint.save_iter=10 \
-#                   dataloader_train.max_samples_per_batch=32)
+#   export EXTRA_TAIL_OVERRIDES="trainer.max_iter=10 checkpoint.save_iter=10 \
+#                                dataloader_train.max_samples_per_batch=32"
 #   bash examples/launch_sft_action_policy_droid.sh
 #
 # Multi-node: launch on every worker; the trainer reads torchrun's
@@ -34,6 +35,14 @@ TOML_FILE="examples/toml/sft_config/action_policy_droid_repro.toml"
 # The experiment reads ${oc.env:DROID_ROOT}; bridge the launcher's DATASET_PATH to it.
 export DROID_ROOT="${DROID_ROOT:-$DATASET_PATH}"
 
-EXTRA_DATASET_CHECK='[[ -f "$DROID_ROOT/meta/info.json" ]] || { echo "ERROR: missing $DROID_ROOT/meta/info.json (prepare DROID LeRobot v3.0 — see docs/action_policy_droid_posttraining.md)" >&2; exit 1; }'
+EXTRA_DATASET_CHECK='[[ -f "$DROID_ROOT/meta/info.json" ]] || { echo "ERROR: missing $DROID_ROOT/meta/info.json (prepare DROID LeRobot v3.0 — see docs/action_policy_droid_posttrain.md)" >&2; exit 1; }'
+
+# Extra Hydra overrides from the environment: a space-separated string word-split into
+# the TAIL_OVERRIDES array. An exported string survives `bash <wrapper>` (a child
+# process), unlike a TAIL_OVERRIDES array set in your shell. Use it e.g. to enable the
+# keep-ranges window filter (see docs/action_policy_droid_posttrain.md).
+TAIL_OVERRIDES=(
+    ${EXTRA_TAIL_OVERRIDES:-}
+)
 
 source "$(dirname "${BASH_SOURCE[0]}")/_sft_launcher_common.sh"

From 3d8c662a73314232312a2f28b08f70510756f54c Mon Sep 17 00:00:00 2001
From: Hao Liang <haolia@nvidia.com>
Date: Tue, 16 Jun 2026 22:12:16 -0700
Subject: [PATCH 2/3] action base dataset: skip normalization when
 action_normalization is None
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

joint_pos uses raw (un-normalized) joint actions, so DROIDLeRobotDataset sets
action_normalization=None — but _build_result called normalize_action()
unconditionally, which raises 'Unknown normalization method: None'. Guard it so
None means raw actions (caught by a 2-node sanity run on the rebased branch).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Hao Liang <haolia@nvidia.com>
---
 cosmos_framework/data/vfm/action/datasets/base_dataset.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cosmos_framework/data/vfm/action/datasets/base_dataset.py b/cosmos_framework/data/vfm/action/datasets/base_dataset.py
index 564d48e..56e9599 100644
--- a/cosmos_framework/data/vfm/action/datasets/base_dataset.py
+++ b/cosmos_framework/data/vfm/action/datasets/base_dataset.py
@@ -186,7 +186,11 @@ def _build_result(
         **extras: Any,
     ) -> dict[str, Any]:
         idle_frames = self._compute_idle_frames(action)
-        normalized_action = normalize_action(action, self.action_normalization, self._load_norm_stats())
+        # action_normalization=None -> use raw actions (no normalization), e.g. joint_pos.
+        if self.action_normalization is None:
+            normalized_action = action
+        else:
+            normalized_action = normalize_action(action, self.action_normalization, self._load_norm_stats())
         formatted_video = (video * 255.0).clamp(0.0, 255.0).to(torch.uint8).permute(1, 0, 2, 3)
         return {
             "ai_caption": ai_caption,

From c0fde7934bb33a558558b25d5ad48bd28cd80c8f Mon Sep 17 00:00:00 2001
From: Hao Liang <haolia@nvidia.com>
Date: Wed, 17 Jun 2026 06:28:33 -0700
Subject: [PATCH 3/3] action recipe: default loss_scale=10 (vision FM weight)
 to match the reference

The bare recipe trained with the NANO default loss_scale=1.0, weighting the vision
flow-matching loss 10x lower than the Cosmos3-Nano-Policy-DROID reference (which uses
10.0). Set it post-construction so the recipe reproduces without launcher overrides.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Signed-off-by: Hao Liang <haolia@nvidia.com>
---
 .../action/posttrain_config/action_policy_droid_nano.py     | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
index 8880295..c1972d2 100644
--- a/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
+++ b/cosmos_framework/configs/base/experiment/action/posttrain_config/action_policy_droid_nano.py
@@ -237,6 +237,12 @@
 action_policy_droid_nano["model"]["config"]["max_num_tokens_after_packing"] = -1
 
 
+# Weight the vision flow-matching loss 10x in the total loss (the NANO default is 1.0).
+# loss_scale multiplies only the vision term, balancing it against the action loss
+# (action_loss_weight=10) so both heads train at comparable gradient magnitude.
+action_policy_droid_nano["model"]["config"]["rectified_flow_training_config"]["loss_scale"] = 10.0
+
+
 for _item in [action_policy_droid_nano]:
     _name = [k for k, v in globals().items() if v is _item][0]
     cs.store(group="experiment", package="_global_", name=_name, node=_item)