From 985d80332599ba0f230c150aa067275fe0c4d01b Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Sun, 7 Jun 2026 22:39:37 -0700
Subject: [PATCH 01/20] Add design spec: video input for reasoner model-mode
 inference

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../2026-06-07-video-reasoner-input-design.md | 257 ++++++++++++++++++
 1 file changed, 257 insertions(+)
 create mode 100644 docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md

diff --git a/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md b/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md
new file mode 100644
index 0000000..60b82da
--- /dev/null
+++ b/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md
@@ -0,0 +1,257 @@
+# Video input for the `reasoner` model-mode of inference — design
+
+**Date:** 2026-06-07
+**Branch:** `maoshengl/video_reasoner_inference`
+**Status:** approved design, ready for implementation plan
+
+## Goal
+
+Let `model_mode=reasoner` in the Cosmos inference engine
+(`python -m cosmos_framework.scripts.inference`) accept a **local mp4 video**
+as conditioning input, producing text that reasons over the clip — for both
+`Cosmos3-Nano` and `Cosmos3-Super`. Today the reasoner accepts only a text
+prompt or a single still image.
+
+## Background: why this is a gap
+
+The reasoner text-generation path runs entirely inside the Cosmos engine:
+
+```
+inference.py:_get_reasoner_sample_data        # loads ONE PIL image via Image.open
+  -> OmniMoTModel.generate_reasoner_text      # builds {"type":"image",...} chat block
+    -> net.generate_reasoner_text             # pass-through
+      -> unified_mot._impl_generate_reasoner_text   # pixel_values + image_grid_thw only
+        -> prepare_multimodal_reasoner_inputs       # image recipe only
+```
+
+Two hard blocks:
+
+1. `_get_reasoner_sample_data` (`cosmos_framework/inference/inference.py`) calls
+   `Image.open(vision_path)` unconditionally — PIL cannot decode mp4.
+2. `_impl_generate_reasoner_text` and `prepare_multimodal_reasoner_inputs`
+   **document video as unsupported** ("for I2V conditioning, frames must be passed
+   as images") and have no `pixel_values_videos` / `video_grid_thw` params.
+
+Separately, `cosmos_framework/scripts/vlm/eval_videophy2.py` *does* consume
+video, but through a **different, standalone path**: a raw HuggingFace
+`Qwen3VLForConditionalGeneration` + `processor.apply_chat_template([{"type":
+"video",...}])` + `model.generate()`. It never touches the Cosmos engine, so it
+does not satisfy the goal of supporting `model_mode=reasoner` in
+`scripts.inference`.
+
+**Key enabling fact:** the vendored Qwen3-VL model under
+`cosmos_framework/model/vfm/vlm/qwen3_vl/` already implements video end to end —
+`get_video_features`, `get_rope_index(video_grid_thw=...)`,
+`get_placeholder_mask(video_features=...)`, a `video_token_id`, and a full
+`video_processing_qwen3_vl.py`. Only the Cosmos reasoner **wrapper layers** are
+hardcoded to images. So the change is additive plumbing, not new model logic.
+
+## Approach (chosen)
+
+**B1 — add a parallel video lane through the existing reasoner stack.**
+
+Add optional video parameters alongside the existing image parameters through
+the wrapper layers, leaving the image and text-only paths bit-identical. A given
+prompt carries **either** an image, **or** a video, **or** neither — never both.
+No mixed image+video support (not needed).
+
+Approaches considered and rejected:
+
+- **B2 — unify image+video into one "media item" abstraction.** Cleaner
+  long-term and enables mixed media in one prompt, but larger blast radius, more
+  validation/tests, and supports a capability not requested (YAGNI).
+- **B3 — expose the HF `Qwen3VLForConditionalGeneration` route instead.** Bypasses
+  the Cosmos engine entirely (no `model_mode=reasoner`, no parallelism /
+  guardrails / output plumbing) — does not meet the goal.
+
+## Data flow
+
+```
+inputs/reasoner/reasoner_video.json
+  { model_mode: "reasoner", prompt, vision_path: "clip.mp4", video_*: ... }
+        |
+        v  args.py: vision_path resolves; extension -> ConditionVisionMode.VIDEO (already detected)
+_get_reasoner_sample_data()
+        |  detect .mp4 -> {prompt, "reasoner_videos": [path], "<video sampling kwargs>"}
+        v                (vs "reasoner_images" for the image branch)
+_generate_reasoner_batch()
+        |  route videos -> model.generate_reasoner_text(videos=[...], video_* kwargs)
+        v
+OmniMoTModel.generate_reasoner_text(videos=..., video_* kwargs)
+        |  build {"type":"video","video":path, <sampling kwargs>} chat block
+        |  apply_chat_template -> pixel_values_videos, video_grid_thw
+        v
+net.generate_reasoner_text(pixel_values_videos=..., video_grid_thw=...)   [pass-through]
+        v
+unified_mot._impl_generate_reasoner_text(... video tensors ...)
+        v
+prepare_multimodal_reasoner_inputs(...)   NEW video branch:
+        get_video_features -> get_placeholder_mask(video) -> get_rope_index(video_grid_thw)
+        v
+reasoner_forward -> AR decode -> text   (unchanged)
+```
+
+## Component changes
+
+All new params are optional and default to `None`/absent, so existing callers
+and the image/text-only paths are unchanged.
+
+### 1. `qwen3_vl/utils.py` — `prepare_multimodal_reasoner_inputs` (the one real seam)
+
+Add optional `pixel_values_videos` / `video_grid_thw` params. When they are set
+(and the image params are not), run the video recipe using helpers that already
+exist:
+
+- `get_video_features(causal_lm, pixel_values_videos, video_grid_thw)` instead of
+  `get_image_features`
+- `get_placeholder_mask(..., video_features=video_embeds)` -> use the returned
+  `_video_mask`
+- `get_rope_index(..., video_grid_thw=video_grid_thw)` instead of the image grid
+
+The `masked_scatter`, `visual_pos_masks`, deepstack alignment, and return shape
+all stay identical — only which features and which grid feed in change. The
+image branch is untouched. Update the docstring that currently says videos are
+not supported.
+
+### 2. `unified_mot.py` — `_impl_generate_reasoner_text`
+
+Add `pixel_values_videos` / `video_grid_thw` params. Extend the pairing guard
+(currently `(pixel_values is None) != (image_grid_thw is None)`) to also validate
+the video pair and to reject image+video supplied together. Branch: if video
+tensors present -> call `prepare_multimodal_reasoner_inputs` with them; else
+existing behavior. Update the "Videos are not supported" docstring.
+
+### 3. `unified_mot.py` + `cosmos3_vfm_network.py` — the `generate_reasoner_text` pass-throughs
+
+Add the two video params and forward verbatim. Pure plumbing.
+
+### 4. `omni_mot_model.py` — `OmniMoTModel.generate_reasoner_text`
+
+Add `videos: list[Any] | None = None` (parallel to `images`) plus the optional
+video sampling kwargs (see schema below). Validate not-both (image and video).
+When `videos` is set, build the last user message with a
+`{"type": "video", "video": videos[idx], <sampling kwargs>}` block instead of the
+image block, then read `pixel_values_videos` / `video_grid_thw` out of the
+`apply_chat_template` output and pass them down. Same per-prompt `B=1` loop, same
+CP/CFGP output broadcast.
+
+### 5. `inference.py` — `_get_reasoner_sample_data` + `_generate_reasoner_batch`
+
+- Builder: branch on `Path(vision_path).suffix`. Image extension keeps
+  `Image.open` + `reasoner_images`. Video extension passes the **path string**
+  under `reasoner_videos` (the processor decodes it — see "Frame sampling"
+  below), and carries the resolved `video_*` sampling kwargs.
+- Batch: read whichever key is present, apply the homogeneity check (no mixing
+  within a batch), and call `generate_reasoner_text(videos=...)` with the
+  sampling kwargs when videos are present.
+
+### 6. `args.py` — schema (`ReasonerDataArgs` / `ReasonerDataOverrides`) + reasoner `sample_args.json`
+
+Add the input-video sampling knobs. They are named with a `video_` prefix to
+avoid colliding with the existing **output**-oriented `fps` / `num_frames`
+fields (which mean output rate/length and are otherwise unused by the reasoner).
+
+| New reasoner sample-arg | Maps to processor kwarg | Default         |
+| ----------------------- | ----------------------- | --------------- |
+| `video_fps`             | `fps`                   | `None` (-> 2)   |
+| `video_num_frames`      | `num_frames`            | `None`          |
+| `video_min_frames`      | `min_frames`            | `None` (-> 4)   |
+| `video_max_frames`      | `max_frames`            | `None` (-> 768) |
+| `video_min_pixels`      | `min_pixels`            | `None`          |
+| `video_max_pixels`      | `max_pixels`            | `None`          |
+
+`None` means "use the processor default," so the no-override behavior is
+identical to relying purely on processor defaults. Only non-`None` values are
+forwarded into the video block / processor kwargs.
+
+## Frame sampling
+
+The Qwen3-VL processor decodes the mp4 and samples frames itself; we pass the
+**path string** straight into the `{"type":"video",...}` block (matching
+`eval_videophy2.py`) rather than pre-decoding frames ourselves. The optional
+`video_*` knobs above tune that sampling.
+
+## Validation & error handling (fail fast, clear messages)
+
+- **Image + video together** — rejected at `_impl_generate_reasoner_text` and at
+  `OmniMoTModel.generate_reasoner_text`. The reasoner conditions on one medium at
+  a time.
+- **Video pairing** — `pixel_values_videos` and `video_grid_thw` must both be
+  present or both absent (mirrors the existing image-pair guard).
+- **`video_fps` + `video_num_frames` together** — rejected in the schema,
+  mirroring the processor's own mutual-exclusion rule.
+- **Batch homogeneity** — extend the current "no mixing image-conditioned and
+  text-only" check in `_generate_reasoner_batch` to three kinds: a batch is
+  all-text, all-image, or all-video. Mixed -> `ValueError` telling the user to
+  split inputs.
+- **No vision tower** — already handled: `_impl` raises if `causal_lm.visual` is
+  missing.
+- **Placeholder-token mismatch** — already handled: `get_placeholder_mask`
+  raises if the video token count != produced features.
+- **Extension routing** — relies on the existing `VIDEO_EXTENSIONS` /
+  `IMAGE_EXTENSIONS` sets in `args.py`; an unrecognized extension already raises
+  `Invalid vision extension`.
+
+## Non-goals / notes
+
+- **Mixed image+video in one prompt** — out of scope.
+- **Input-video content-safety guardrail** — none today; not added. The reasoner
+  emits only text, never video, so the text guardrail on prompt and output is
+  unchanged and sufficient.
+- **Video decode backend** — the processor needs a video backend
+  (decord / torchvision) to read the mp4; if missing, the failure surfaces inside
+  `apply_chat_template`. We do not add our own decode path. This is an
+  environment dependency to document, not code we write.
+- **Unused output vision fields** — `fps` / `num_frames` / resolution remain
+  unused by the reasoner (already defaulted in `args.py`).
+
+## Verification (manual only)
+
+No automated test for now. The implementation ships the artifacts to verify by
+hand.
+
+Example input `inputs/reasoner/reasoner_video.json`:
+
+```json
+{
+    "model_mode": "reasoner",
+    "prompt": "Describe what happens in this video in one sentence.",
+    "vision_path": "/abs/path/to/clip.mp4",
+    "video_fps": 2,
+    "video_max_pixels": 200704
+}
+```
+
+(`video_*` fields optional — omit to use processor defaults.)
+
+Run (Nano; Super identical but `--checkpoint-path Cosmos3-Super`):
+
+```bash
+torchrun --nproc-per-node=8 -m cosmos_framework.scripts.inference \
+    --parallelism-preset=throughput --dp-shard-size=8 --dp-replicate-size=1 \
+    --cp-size=1 --cfgp-size=1 \
+    -i "inputs/reasoner/reasoner_video.json" \
+    -o outputs/reasoner_video --checkpoint-path Cosmos3-Nano --seed=0
+```
+
+Expected: `outputs/reasoner_video/reasoner_video/reasoner_text.txt` contains
+non-empty, on-topic text describing the clip; no crash; image and text-only
+reasoner inputs still work unchanged.
+
+A parity check against the HF `eval_videophy2.py` path is a possible future
+hardening step, out of scope here.
+
+## Files touched
+
+| File | Change |
+| ---- | ------ |
+| `cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py` | `prepare_multimodal_reasoner_inputs`: add video branch |
+| `cosmos_framework/model/vfm/mot/unified_mot.py` | `_impl_generate_reasoner_text` + wrapper `generate_reasoner_text`: add/forward video params |
+| `cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py` | `generate_reasoner_text`: forward video params |
+| `cosmos_framework/model/vfm/omni_mot_model.py` | `generate_reasoner_text`: `videos` param, video chat block, sampling kwargs |
+| `cosmos_framework/inference/inference.py` | `_get_reasoner_sample_data` + `_generate_reasoner_batch`: route mp4 |
+| `cosmos_framework/inference/args.py` | add `video_*` sampling fields + mutual-exclusion validation |
+| `cosmos_framework/inference/defaults/reasoner/sample_args.json` | add `video_*` defaults (`null`) |
+| `inputs/reasoner/reasoner_video.json` | new example input |
+| `docs/inference.md` | document video input + `video_*` fields for `reasoner` mode |
+

From 2d2490f59ec3e9c33fc5c8a0edf3e140213c195b Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 00:32:25 -0700
Subject: [PATCH 02/20] Add implementation plan: video input for reasoner
 model-mode

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../plans/2026-06-08-video-reasoner-input.md  | 916 ++++++++++++++++++
 1 file changed, 916 insertions(+)
 create mode 100644 docs/superpowers/plans/2026-06-08-video-reasoner-input.md

diff --git a/docs/superpowers/plans/2026-06-08-video-reasoner-input.md b/docs/superpowers/plans/2026-06-08-video-reasoner-input.md
new file mode 100644
index 0000000..a0edd25
--- /dev/null
+++ b/docs/superpowers/plans/2026-06-08-video-reasoner-input.md
@@ -0,0 +1,916 @@
+# Video input for `reasoner` model-mode — Implementation Plan
+
+> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
+
+**Goal:** Let `model_mode=reasoner` in `python -m cosmos_framework.scripts.inference` accept a local mp4 video as conditioning input (Cosmos3-Nano and Cosmos3-Super), producing text that reasons over the clip.
+
+**Architecture:** Additive "video lane" alongside the existing image lane through the reasoner wrapper stack. The vendored Qwen3-VL model + `video_processing_qwen3_vl.py` already implement video end to end (`get_video_features`, `get_rope_index(video_grid_thw=…)`, `get_placeholder_mask(video_features=…)`, `video_token_id`); only the Cosmos wrapper layers are hardcoded to images. We thread optional `pixel_values_videos` / `video_grid_thw` (and a high-level `videos` list + `video_*` sampling knobs) through five layers. A prompt carries either an image, a video, or neither — never both.
+
+**Tech Stack:** Python, PyTorch, pydantic, HuggingFace transformers (`Qwen3VLProcessor`), torchrun. Repo lives at `cosmos-framework/`; spec at `docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md`.
+
+**Verification policy (read before starting):** Per the spec, the **end-to-end video path is verified manually on GPU** (Task 9) — there is no automated GPU test, because it requires real checkpoints + multi-GPU. The two pure-Python/logic changes (args schema in Task 1, builder routing in Task 7) DO get real `pytest` unit tests (CPU-only, no checkpoints). Model-layer tasks (3–6) are verified by import/lint checks plus the Task 9 manual run.
+
+**How to run tests/commands:** Python/pytest must run inside the i4 container (`bob_echo_dev`). Use the `cosmos3-run-env` skill to author the wrapper shell and the `slurm-node` skill to execute. Where a step says `pytest …`, it means "run that inside the container."
+
+---
+
+## File Structure
+
+| File | Responsibility | Change |
+| ---- | -------------- | ------ |
+| `cosmos_framework/inference/args.py` | Sample-arg schema | Add `video_*` reasoner fields + mutual-exclusion validation |
+| `cosmos_framework/inference/args_test.py` | Schema unit tests | Add tests for new fields/validation |
+| `cosmos_framework/inference/defaults/reasoner/sample_args.json` | Reasoner defaults | Add `video_*` keys (null) |
+| `cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py` | Multimodal prefill | `prepare_multimodal_reasoner_inputs`: add video branch |
+| `cosmos_framework/model/vfm/mot/unified_mot.py` | Reasoner decode | `_impl_generate_reasoner_text` + 3 wrapper `generate_reasoner_text`: add/forward video params + guards |
+| `cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py` | Network pass-through | `generate_reasoner_text`: forward video params |
+| `cosmos_framework/model/vfm/omni_mot_model.py` | High-level entry | `generate_reasoner_text`: `videos` param, video chat block, sampling kwargs |
+| `cosmos_framework/inference/inference.py` | Inference engine | `_get_reasoner_sample_data` route mp4; `_generate_reasoner_batch` homogeneity + video forward |
+| `cosmos_framework/inference/inference_test.py` | Builder unit test | Add routing test (CPU) |
+| `inputs/reasoner/reasoner_video.json` | Example input | New file |
+| `docs/inference.md` | User docs | Document video input + `video_*` fields |
+
+Implementation order: Task 1 (schema) → Task 2 (defaults) → Tasks 3–6 (model layers, bottom-up) → Task 7 (inference wiring) → Task 8 (docs/example) → Task 9 (manual GPU verification).
+
+---
+
+## Task 1: Add `video_*` reasoner sample-arg fields + validation
+
+**Files:**
+- Modify: `cosmos_framework/inference/args.py` (class `ReasonerDataArgs` ~600-611, class `ReasonerDataOverrides` ~614-638)
+- Test: `cosmos_framework/inference/args_test.py`
+
+The new fields control how the Qwen3-VL processor samples frames from the mp4. They are `video_`-prefixed to avoid colliding with the existing output-oriented `fps`/`num_frames` fields. `video_fps` and `video_num_frames` are mutually exclusive (the processor itself raises if both are set).
+
+- [ ] **Step 1: Write the failing tests**
+
+Add to `cosmos_framework/inference/args_test.py` (match the existing test style/imports in that file; these construct a reasoner override and resolve it). If the file already has a helper to build an `OmniSampleOverrides`/model config, reuse it; otherwise mirror the nearest existing reasoner test.
+
+```python
+def test_reasoner_video_fields_default_none():
+    ov = ReasonerDataOverrides()
+    assert ov.video_fps is None
+    assert ov.video_num_frames is None
+    assert ov.video_min_frames is None
+    assert ov.video_max_frames is None
+    assert ov.video_min_pixels is None
+    assert ov.video_max_pixels is None
+
+
+def test_reasoner_video_fps_and_num_frames_mutually_exclusive():
+    import pytest
+    ov = ReasonerDataOverrides(video_fps=2, video_num_frames=16)
+    # _validate_video_sampling is called from _build_reasoner_data; call it directly
+    with pytest.raises(ValueError, match="video_fps.*video_num_frames|mutually exclusive"):
+        ov._validate_video_sampling()
+
+
+def test_reasoner_video_fps_alone_ok():
+    ov = ReasonerDataOverrides(video_fps=2)
+    ov._validate_video_sampling()  # no raise
+```
+
+Add the import for `ReasonerDataOverrides` to the test file's import block if not present:
+
+```python
+from cosmos_framework.inference.args import ReasonerDataOverrides
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run (inside container): `pytest cosmos_framework/inference/args_test.py -k reasoner_video -v`
+Expected: FAIL — `ReasonerDataOverrides` has no `video_fps` (AttributeError / unexpected-keyword), and no `_validate_video_sampling`.
+
+- [ ] **Step 3: Add the fields to `ReasonerDataArgs`**
+
+In `args.py`, append to class `ReasonerDataArgs` (after `presence_penalty: float | None = None`, ~line 611):
+
+```python
+    video_fps: float | None = None
+    video_num_frames: pydantic.PositiveInt | None = None
+    video_min_frames: pydantic.PositiveInt | None = None
+    video_max_frames: pydantic.PositiveInt | None = None
+    video_min_pixels: pydantic.PositiveInt | None = None
+    video_max_pixels: pydantic.PositiveInt | None = None
+```
+
+- [ ] **Step 4: Add the fields + validation to `ReasonerDataOverrides`**
+
+In `args.py`, append to class `ReasonerDataOverrides` (after `presence_penalty`, ~line 631, before `_build_reasoner_data`):
+
+```python
+    video_fps: float | None = None
+    """Frames per second to sample from a video vision_path. Mutually exclusive with video_num_frames. None -> processor default."""
+    video_num_frames: pydantic.PositiveInt | None = None
+    """Fixed number of frames to sample from a video vision_path. Mutually exclusive with video_fps. None -> processor default."""
+    video_min_frames: pydantic.PositiveInt | None = None
+    """Lower bound on sampled frame count. None -> processor default."""
+    video_max_frames: pydantic.PositiveInt | None = None
+    """Upper bound on sampled frame count. None -> processor default."""
+    video_min_pixels: pydantic.PositiveInt | None = None
+    """Lower bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
+    video_max_pixels: pydantic.PositiveInt | None = None
+    """Upper bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
+
+    def _validate_video_sampling(self) -> None:
+        if self.video_fps is not None and self.video_num_frames is not None:
+            raise ValueError(
+                "video_fps and video_num_frames are mutually exclusive — set at most one."
+            )
+```
+
+Then call it from `_build_reasoner_data` so resolution-time validation fires. Replace the whole `_build_reasoner_data` method (~lines 633-638) with:
+
+```python
+    def _build_reasoner_data(self, model_config: "OmniMoTModelConfig", sample_meta: SampleMeta):
+        if not sample_meta.model_mode.is_reasoner:
+            return
+        self = cast("SampleDataOverrides", self)
+        if not self.prompt.strip():
+            raise ValueError("Reasoner inference requires a non-empty 'prompt'.")
+        self._validate_video_sampling()
+```
+
+- [ ] **Step 5: Run tests to verify they pass**
+
+Run: `pytest cosmos_framework/inference/args_test.py -k reasoner_video -v`
+Expected: PASS (3 tests).
+
+- [ ] **Step 6: Lint + commit**
+
+```bash
+ruff check cosmos_framework/inference/args.py cosmos_framework/inference/args_test.py
+git add cosmos_framework/inference/args.py cosmos_framework/inference/args_test.py
+git commit -m "feat(reasoner): add video_* sampling fields + mutual-exclusion validation"
+```
+
+---
+
+## Task 2: Add `video_*` defaults to the reasoner defaults file
+
+**Files:**
+- Modify: `cosmos_framework/inference/defaults/reasoner/sample_args.json`
+
+`None` defaults already live in the schema; adding explicit `null` keys here documents the knobs and keeps the defaults file self-describing.
+
+- [ ] **Step 1: Edit the JSON**
+
+Replace the file contents with:
+
+```json
+{
+    "model_mode": "reasoner",
+    "max_new_tokens": 64,
+    "do_sample": false,
+    "temperature": 1.0,
+    "top_k": null,
+    "top_p": null,
+    "repetition_penalty": 1.0,
+    "presence_penalty": 0.0,
+    "video_fps": null,
+    "video_num_frames": null,
+    "video_min_frames": null,
+    "video_max_frames": null,
+    "video_min_pixels": null,
+    "video_max_pixels": null
+}
+```
+
+- [ ] **Step 2: Verify it loads**
+
+Run (inside container):
+`python -c "from cosmos_framework.inference.args import _load_modality_defaults; print(_load_modality_defaults('reasoner'))"`
+Expected: prints the dict including the `video_*` keys; no exception.
+
+- [ ] **Step 3: Commit**
+
+```bash
+git add cosmos_framework/inference/defaults/reasoner/sample_args.json
+git commit -m "feat(reasoner): add video_* defaults (null) to reasoner sample_args"
+```
+
+---
+
+## Task 3: Add a video branch to `prepare_multimodal_reasoner_inputs`
+
+**Files:**
+- Modify: `cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py:497-604`
+
+This is the one real seam. The image recipe (lines 577-604) is: `get_image_features` → `get_placeholder_mask(image_features=…)` → `masked_scatter(image_mask)` → `get_rope_index(image_grid_thw=…)`. The video recipe is identical but uses the video helpers — and `get_video_features` is literally "same implementation as for images" (`qwen3_vl.py:1243`), so we reuse the existing free `get_image_features` helper with the video tensors. `get_placeholder_mask` and `get_rope_index` already accept video arguments.
+
+- [ ] **Step 1: Add optional video params to the signature**
+
+Change the signature (lines 497-509) to add two params after `image_grid_thw`:
+
+```python
+def prepare_multimodal_reasoner_inputs(
+    causal_lm: Any,
+    input_ids: torch.Tensor,  # [B,T_prompt]
+    pixel_values: torch.Tensor | None = None,  # [N_patches,C,H,W]
+    image_grid_thw: torch.Tensor | None = None,  # [num_images,3]
+    pixel_values_videos: torch.Tensor | None = None,  # [N_patches,C,H,W]
+    video_grid_thw: torch.Tensor | None = None,  # [num_videos,3]
+    attention_mask: Optional[torch.Tensor] = None,
+) -> tuple[
+    torch.Tensor,  # inputs_embeds [B,T_prompt,hidden_size]
+    torch.Tensor,  # visual_pos_masks [B,T_prompt] bool
+    list[torch.Tensor],  # deepstack_visual_embeds (per deepstack layer)
+    torch.Tensor,  # position_ids
+    torch.Tensor,  # mrope_position_deltas
+]:
+```
+
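+(Note: `pixel_values`/`image_grid_thw` are now defaulted to `None`; existing callers that pass them positionally or by keyword are unaffected. The two new params are inserted before `attention_mask`, so `attention_mask` must be passed by keyword — the `_impl_generate_reasoner_text` call site already does this.)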
+
+- [ ] **Step 2: Replace the body (lines 577-604) with image/video branching**
+
+```python
+    is_video = pixel_values_videos is not None
+    inputs_embeds = causal_lm.model.embed_tokens(input_ids).clone()  # [B,T_prompt,hidden_size]
+
+    if is_video:
+        pixel_values_videos = pixel_values_videos.to(device=inputs_embeds.device)
+        video_grid_thw = video_grid_thw.to(device=inputs_embeds.device)
+        # get_video_features == get_image_features (same visual tower); reuse the free helper.
+        video_embeds, deepstack_visual_embeds = get_image_features(causal_lm, pixel_values_videos, video_grid_thw)
+        video_embeds = torch.cat(video_embeds, dim=0).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
+        _image_mask, video_mask = get_placeholder_mask(
+            causal_lm,
+            input_ids,
+            inputs_embeds=inputs_embeds,
+            video_features=video_embeds,
+        )
+        inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)  # [B,T_prompt,hidden_size]
+        visual_pos_masks = video_mask[..., 0]  # [B,T_prompt]
+    else:
+        pixel_values = pixel_values.to(device=inputs_embeds.device)
+        image_grid_thw = image_grid_thw.to(device=inputs_embeds.device)
+        image_embeds, deepstack_visual_embeds = get_image_features(causal_lm, pixel_values, image_grid_thw)
+        image_embeds = torch.cat(image_embeds, dim=0).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
+        image_mask, _video_mask = get_placeholder_mask(
+            causal_lm,
+            input_ids,
+            inputs_embeds=inputs_embeds,
+            image_features=image_embeds,
+        )
+        inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # [B,T_prompt,hidden_size]
+        visual_pos_masks = image_mask[..., 0]  # [B,T_prompt]
+
+    deepstack_visual_embeds = [
+        embed.to(device=inputs_embeds.device, dtype=inputs_embeds.dtype) for embed in deepstack_visual_embeds
+    ]
+
+    position_ids, mrope_position_deltas = get_rope_index(
+        causal_lm,
+        input_ids=input_ids,
+        image_grid_thw=None if is_video else image_grid_thw,
+        video_grid_thw=video_grid_thw if is_video else None,
+        attention_mask=attention_mask,
+    )
+
+    return inputs_embeds, visual_pos_masks, deepstack_visual_embeds, position_ids, mrope_position_deltas
+```
+
+- [ ] **Step 3: Update the docstring**
+
+In the docstring (lines 528-532), replace the sentence "Videos and dual image+video paths are not supported here; only `image_grid_thw` is consumed…" with:
+
+```
+    Either the image pair (``pixel_values`` + ``image_grid_thw``) or the
+    video pair (``pixel_values_videos`` + ``video_grid_thw``) is consumed —
+    not both. The video recipe mirrors the image recipe but routes through
+    the video placeholder mask and ``video_grid_thw`` rope index.
+```
+
+- [ ] **Step 4: Import/lint check (no GPU test — verified end-to-end in Task 9)**
+
+Run (inside container):
+`python -c "import cosmos_framework.model.vfm.vlm.qwen3_vl.utils"`
+Expected: no ImportError / SyntaxError.
+`ruff check cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py`
+
+- [ ] **Step 5: Commit**
+
+```bash
+git add cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py
+git commit -m "feat(reasoner): video branch in prepare_multimodal_reasoner_inputs"
+```
+
+---
+
+## Task 4: Thread video params through `_impl_generate_reasoner_text`
+
+**Files:**
+- Modify: `cosmos_framework/model/vfm/mot/unified_mot.py:1490-1675`
+
+- [ ] **Step 1: Add params to the signature**
+
+In `_impl_generate_reasoner_text` (lines 1490-1508), add two params after `image_grid_thw` (line 1496):
+
+```python
+    pixel_values_videos: torch.Tensor | None = None,
+    video_grid_thw: torch.Tensor | None = None,
+```
+
+- [ ] **Step 2: Extend the validation guard**
+
+Replace the guard at lines 1644-1645:
+
+```python
+    if (pixel_values is None) != (image_grid_thw is None):
+        raise ValueError("pixel_values and image_grid_thw must be provided together.")
+```
+
+with:
+
+```python
+    if (pixel_values is None) != (image_grid_thw is None):
+        raise ValueError("pixel_values and image_grid_thw must be provided together.")
+    if (pixel_values_videos is None) != (video_grid_thw is None):
+        raise ValueError("pixel_values_videos and video_grid_thw must be provided together.")
+    if pixel_values is not None and pixel_values_videos is not None:
+        raise ValueError("Reasoner conditions on one medium at a time: pass image OR video, not both.")
+```
+
+- [ ] **Step 3: Route to the prefill helper for both media**
+
+Replace the prefill branch at lines 1650-1667:
+
+```python
+    if pixel_values is None:
+        hidden = model.reasoner_forward(input_ids, cache=cache)  # [B,T_prompt,hidden_size]
+    else:
+        if not hasattr(causal_lm, "visual"):
+            raise ValueError("Combined checkpoint does not include a visual module on the reasoner language model.")
+        (
+            inputs_embeds,
+            visual_pos_masks,
+            deepstack_visual_embeds,
+            position_ids,
+            mrope_position_deltas,
+        ) = prepare_multimodal_reasoner_inputs(
+            causal_lm,
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+            attention_mask=attention_mask,
+        )
+```
+
+with:
+
+```python
+    if pixel_values is None and pixel_values_videos is None:
+        hidden = model.reasoner_forward(input_ids, cache=cache)  # [B,T_prompt,hidden_size]
+    else:
+        if not hasattr(causal_lm, "visual"):
+            raise ValueError("Combined checkpoint does not include a visual module on the reasoner language model.")
+        (
+            inputs_embeds,
+            visual_pos_masks,
+            deepstack_visual_embeds,
+            position_ids,
+            mrope_position_deltas,
+        ) = prepare_multimodal_reasoner_inputs(
+            causal_lm,
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            image_grid_thw=image_grid_thw,
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
+            attention_mask=attention_mask,
+        )
+```
+
+- [ ] **Step 4: Update the docstring**
+
+In the `pixel_values` docstring (lines 1553-1556), replace "Videos are *not* supported here — this function has no `pixel_values_videos` / `video_grid_thw` parameters; for I2V conditioning, frames must be passed as images." with:
+
+```
+            For video conditioning, pass ``pixel_values_videos`` +
+            ``video_grid_thw`` instead (mutually exclusive with the image
+            pair).
+```
+
+- [ ] **Step 5: Import/lint check**
+
+Run (inside container):
+`python -c "import cosmos_framework.model.vfm.mot.unified_mot"`
+`ruff check cosmos_framework/model/vfm/mot/unified_mot.py`
+Expected: no errors.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add cosmos_framework/model/vfm/mot/unified_mot.py
+git commit -m "feat(reasoner): accept video tensors in _impl_generate_reasoner_text"
+```
+
+---
+
+## Task 5: Forward video params through the wrapper `generate_reasoner_text` pass-throughs
+
+**Files:**
+- Modify: `cosmos_framework/model/vfm/mot/unified_mot.py` — three wrappers at lines 1932 (`Qwen3VLTextForCausalLM`), 2060 (`Qwen3VLMoeTextForCausalLM`), 2184 (`Nemotron3DenseVLTextForCausalLM`)
+- Modify: `cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py:272-341`
+
+All four are pure pass-throughs: the three unified_mot wrappers call `_impl_generate_reasoner_text`, and the network method calls `self.language_model.generate_reasoner_text`. Each needs the two new params added to its signature and forwarded.
+
+- [ ] **Step 1: Update the three unified_mot wrappers**
+
+For EACH of the three `generate_reasoner_text` methods (lines 1932, 2060, 2184): add after `image_grid_thw: torch.Tensor | None = None,` in the signature:
+
+```python
+        pixel_values_videos: torch.Tensor | None = None,
+        video_grid_thw: torch.Tensor | None = None,
+```
+
+and add to the `_impl_generate_reasoner_text(...)` call (after `image_grid_thw=image_grid_thw,`):
+
+```python
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
+```
+
+(The three methods are textually identical in this region; apply the same two-line additions to each.)
+
+- [ ] **Step 2: Update the network pass-through**
+
+In `cosmos3_vfm_network.py`, add to the `generate_reasoner_text` signature (after `image_grid_thw: torch.Tensor | None = None,`, ~line 278):
+
+```python
+        pixel_values_videos: torch.Tensor | None = None,
+        video_grid_thw: torch.Tensor | None = None,
+```
+
+and to the forwarded call (after `image_grid_thw=image_grid_thw,`, ~line 329):
+
+```python
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
+```
+
+- [ ] **Step 3: Import/lint check**
+
+Run (inside container):
+`python -c "import cosmos_framework.model.vfm.mot.unified_mot, cosmos_framework.model.vfm.mot.cosmos3_vfm_network"`
+`ruff check cosmos_framework/model/vfm/mot/unified_mot.py cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py`
+Expected: no errors.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add cosmos_framework/model/vfm/mot/unified_mot.py cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py
+git commit -m "feat(reasoner): forward video tensors through generate_reasoner_text pass-throughs"
+```
+
+---
+
+## Task 6: Add `videos` + sampling kwargs to `OmniMoTModel.generate_reasoner_text`
+
+**Files:**
+- Modify: `cosmos_framework/model/vfm/omni_mot_model.py:3760-4008`
+
+This builds a `{"type":"video", ...}` chat block (parallel to the existing image block at lines 3959-4008), extracts `pixel_values_videos` / `video_grid_thw` from `apply_chat_template`, and passes them down.
+
+- [ ] **Step 1: Add params to the signature**
+
+In `generate_reasoner_text` (lines 3760-3774), add after `images: list[Any] | None = None,` (line 3765):
+
+```python
+        videos: list[Any] | None = None,
+        video_sampling_kwargs: dict[str, Any] | None = None,
+```
+
+- [ ] **Step 2: Validate not-both and set the multimodal flag**
+
+Replace the validation block at lines 3907-3922 (`use_multimodal = images is not None` … through the `apply_chat_template` RuntimeError) with:
+
+```python
+        if images is not None and videos is not None:
+            raise ValueError("generate_reasoner_text conditions on one medium at a time: pass `images` OR `videos`, not both.")
+        use_image = images is not None
+        use_video = videos is not None
+        use_multimodal = use_image or use_video
+        media = images if use_image else videos
+        if use_multimodal:
+            assert media is not None  # narrowed by `use_multimodal`
+            if len(media) != len(inputs):
+                raise ValueError(
+                    f"generate_reasoner_text: media length ({len(media)}) "
+                    f"must equal `inputs` length ({len(inputs)}) for the "
+                    "vision-conditioned flow."
+                )
+            if not callable(getattr(self.vlm_processor, "apply_chat_template", None)):
+                raise RuntimeError(
+                    "generate_reasoner_text(images=/videos=...) requires a multimodal "
+                    "VLM processor (e.g. Qwen3VLProcessor) but the live processor "
+                    f"{type(self.vlm_processor).__name__!r} does not implement "
+                    "apply_chat_template — the live VLM is configured as text-only."
+                )
+        video_kwargs = {k: v for k, v in (video_sampling_kwargs or {}).items() if v is not None}
+```
+
+- [ ] **Step 3: Build the image-or-video chat block and extract tensors**
+
+Replace the multimodal block construction at lines 3959-4008 (`if use_multimodal:` … through the `out_ids = self.net.generate_reasoner_text(...)` image call) with:
+
+```python
+            if use_multimodal:
+                assert media is not None  # narrowed by `use_multimodal`
+                # Replace the LAST user message's content with a Qwen3-VL
+                # multimodal block. Earlier messages (system, prior turns)
+                # are kept verbatim.
+                last_user = messages[-1]
+                last_text = last_user["content"] if isinstance(last_user.get("content"), str) else ""
+                if use_video:
+                    media_item: dict[str, Any] = {"type": "video", "video": media[idx]}
+                else:
+                    media_item = {"type": "image", "image": media[idx]}
+                multimodal_messages = list(messages[:-1])
+                multimodal_messages.append(
+                    {
+                        "role": "user",
+                        "content": [media_item, {"type": "text", "text": last_text}],
+                    }
+                )
+                # NOTE: `video_kwargs` (fps/num_frames/min_frames/max_frames/
+                # min_pixels/max_pixels) are forwarded to the processor here.
+                # The exact kwarg surface depends on the installed transformers
+                # Qwen3VLProcessor; if a key is rejected, route via the
+                # processor's video-loading kwargs. Verified manually in the
+                # plan's Task 9.
+                processor_inputs = self.vlm_processor.apply_chat_template(
+                    multimodal_messages,
+                    tokenize=True,
+                    add_generation_prompt=True,
+                    return_tensors="pt",
+                    **(video_kwargs if use_video else {}),
+                )
+                inner_input_ids = processor_inputs["input_ids"].to(device).unsqueeze(0)
+                inner_attention_mask = processor_inputs["attention_mask"].to(device).unsqueeze(0)
+                if use_video:
+                    inner_pixel_values_videos = processor_inputs["pixel_values_videos"].to(device)
+                    inner_video_grid_thw = processor_inputs["video_grid_thw"].to(device)
+                    out_ids = self.net.generate_reasoner_text(
+                        input_ids=inner_input_ids,
+                        max_new_tokens=max_new_tokens,
+                        pixel_values_videos=inner_pixel_values_videos,
+                        video_grid_thw=inner_video_grid_thw,
+                        attention_mask=inner_attention_mask,
+                        eos_token_id=eos_id,
+                        pad_token_id=pad_id,
+                        do_sample=do_sample,
+                        temperature=temperature if temperature is not None else 1.0,
+                        top_k=top_k,
+                        top_p=top_p,
+                        repetition_penalty=repetition_penalty,
+                        presence_penalty=presence_penalty,
+                        seed=seed,
+                        return_only_new_tokens=True,
+                    )
+                else:
+                    inner_pixel_values = processor_inputs["pixel_values"].to(device)  # [N_patches,C,H,W]
+                    inner_image_grid_thw = processor_inputs["image_grid_thw"].to(device)  # [num_images,3]
+                    out_ids = self.net.generate_reasoner_text(
+                        input_ids=inner_input_ids,
+                        max_new_tokens=max_new_tokens,
+                        pixel_values=inner_pixel_values,
+                        image_grid_thw=inner_image_grid_thw,
+                        attention_mask=inner_attention_mask,
+                        eos_token_id=eos_id,
+                        pad_token_id=pad_id,
+                        do_sample=do_sample,
+                        temperature=temperature if temperature is not None else 1.0,
+                        top_k=top_k,
+                        top_p=top_p,
+                        repetition_penalty=repetition_penalty,
+                        presence_penalty=presence_penalty,
+                        seed=seed,
+                        return_only_new_tokens=True,
+                    )
+```
+
+(The text-only `else:` branch at lines 4009+ is unchanged.)
+
+- [ ] **Step 4: Update the docstring**
+
+In the `images:` Args entry (~lines 3828-3837), add a sibling paragraph:
+
+```
+            videos: Optional per-prompt conditioning videos (mutually
+                exclusive with ``images``). Each entry is forwarded into a
+                ``{"type": "video", "video": ...}`` chat block; the
+                processor decodes/samples frames and produces
+                ``pixel_values_videos`` / ``video_grid_thw``.
+            video_sampling_kwargs: Optional dict of non-None frame-sampling
+                controls (fps, num_frames, min_frames, max_frames,
+                min_pixels, max_pixels) forwarded to the processor.
+```
+
+- [ ] **Step 5: Import/lint check**
+
+Run (inside container):
+`python -c "import cosmos_framework.model.vfm.omni_mot_model"`
+`ruff check cosmos_framework/model/vfm/omni_mot_model.py`
+Expected: no errors.
+
+- [ ] **Step 6: Commit**
+
+```bash
+git add cosmos_framework/model/vfm/omni_mot_model.py
+git commit -m "feat(reasoner): videos param + video chat block in OmniMoTModel.generate_reasoner_text"
+```
+
+---
+
+## Task 7: Wire mp4 routing into the inference engine
+
+**Files:**
+- Modify: `cosmos_framework/inference/inference.py` — `_get_reasoner_sample_data:466-474`, `_generate_reasoner_batch:1644-1696`
+- Test: `cosmos_framework/inference/inference_test.py`
+
+The builder detects an mp4 `vision_path` by extension and returns it under a `reasoner_videos` key (path string, not decoded) plus the resolved `video_*` sampling kwargs; the batch method routes videos to `generate_reasoner_text(videos=…)`.
+
+- [ ] **Step 1: Write the failing routing test**
+
+Add to `cosmos_framework/inference/inference_test.py` (use `types.SimpleNamespace` to avoid constructing a full model/args; the builder only reads `vision_path`, `prompt`, and `video_*` off `sample_args`, and `input_caption_key` off `model`):
+
+```python
+import types
+from cosmos_framework.inference.inference import _get_reasoner_sample_data
+
+
+def _fake_sa(vision_path, **video_kw):
+    base = dict(
+        prompt="describe",
+        vision_path=vision_path,
+        video_fps=None, video_num_frames=None, video_min_frames=None,
+        video_max_frames=None, video_min_pixels=None, video_max_pixels=None,
+    )
+    base.update(video_kw)
+    return types.SimpleNamespace(**base)
+
+
+_fake_model = types.SimpleNamespace(input_caption_key="caption")
+
+
+def test_reasoner_sample_data_text_only():
+    out = _get_reasoner_sample_data(_fake_sa(None), _fake_model)
+    assert out["caption"] == ["describe"]
+    assert out["reasoner_images"] == [None]
+    assert "reasoner_videos" not in out
+
+
+def test_reasoner_sample_data_video_routes_to_videos(tmp_path):
+    clip = tmp_path / "clip.mp4"
+    clip.write_bytes(b"\x00")  # not decoded by the builder
+    out = _get_reasoner_sample_data(_fake_sa(str(clip), video_fps=2), _fake_model)
+    assert out["caption"] == ["describe"]
+    assert out["reasoner_videos"] == [str(clip)]
+    assert out["reasoner_images"] == [None]
+    assert out["video_sampling_kwargs"] == {"fps": 2}
+```
+
+- [ ] **Step 2: Run tests to verify they fail**
+
+Run (inside container): `pytest cosmos_framework/inference/inference_test.py -k reasoner_sample_data -v`
+Expected: FAIL — current builder always calls `Image.open` and has no `reasoner_videos`/`video_sampling_kwargs` keys.
+
+- [ ] **Step 3: Add the `VIDEO_EXTENSIONS` import**
+
+`VIDEO_EXTENSIONS` is exported from `cosmos_framework.inference.common.args` (the same module `args.py` imports it from). `inference.py` already imports `Path`, `Any`, `cast`, and `Image`, so this is the only new import. Add near the top of `inference.py`:
+
+```python
+from cosmos_framework.inference.common.args import VIDEO_EXTENSIONS
+```
+
+(If `inference.py` already imports other names from `cosmos_framework.inference.common.args`, append `VIDEO_EXTENSIONS` to that existing import instead of adding a new line.)
+
+- [ ] **Step 4: Rewrite `_get_reasoner_sample_data`**
+
+Replace lines 466-474:
+
+```python
+def _get_reasoner_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel) -> dict[str, Any]:
+    """Sample batch for reasoner text generation: prompt + optional conditioning image or video."""
+    image: Image.Image | None = None
+    video: str | None = None
+    if sample_args.vision_path is not None:
+        if Path(sample_args.vision_path).suffix.lower() in VIDEO_EXTENSIONS:
+            video = str(sample_args.vision_path)
+        else:
+            image = Image.open(sample_args.vision_path).convert("RGB")
+    out: dict[str, Any] = {
+        model.input_caption_key: [sample_args.prompt],
+        "reasoner_images": [image],
+    }
+    if video is not None:
+        out["reasoner_videos"] = [video]
+        out["video_sampling_kwargs"] = {
+            k: v
+            for k, v in {
+                "fps": sample_args.video_fps,
+                "num_frames": sample_args.video_num_frames,
+                "min_frames": sample_args.video_min_frames,
+                "max_frames": sample_args.video_max_frames,
+                "min_pixels": sample_args.video_min_pixels,
+                "max_pixels": sample_args.video_max_pixels,
+            }.items()
+            if v is not None
+        }
+    return out
+```
+
+- [ ] **Step 5: Run the routing tests**
+
+Run: `pytest cosmos_framework/inference/inference_test.py -k reasoner_sample_data -v`
+Expected: PASS (2 tests).
+
+- [ ] **Step 6: Update `_generate_reasoner_batch` to route videos**
+
+In `_generate_reasoner_batch` (lines 1656-1696), after `raw_images: list[...] = data_batch["reasoner_images"]` (line 1657), add video extraction and a three-way homogeneity check (image vs. video, image vs. text-only, video vs. text-only), then pass `images` and `videos` through to a single `generate_reasoner_text` call. Replace lines 1656-1696 (`prompts = ...` through the `generate_reasoner_text(...)` call) with:
+
+```python
+        prompts: list[str] = data_batch[self.model.input_caption_key]
+        raw_images: list[Image.Image | None] = data_batch["reasoner_images"]
+        raw_videos: list[str | None] | None = data_batch.get("reasoner_videos")
+        video_sampling_kwargs: dict[str, Any] = data_batch.get("video_sampling_kwargs", {})
+
+        n_img = sum(img is not None for img in raw_images)
+        n_vid = sum(v is not None for v in (raw_videos or []))
+        if n_img and n_vid:
+            raise ValueError(
+                "Reasoner batch mixes image- and video-conditioned samples. Split into separate batches."
+            )
+        if 0 < n_img < len(raw_images):
+            raise ValueError(
+                "Reasoner batch mixes image-conditioned and text-only samples "
+                f"({n_img}/{len(raw_images)} have an image vision_path). Split into separate batches."
+            )
+        if raw_videos is not None and 0 < n_vid < len(raw_videos):
+            raise ValueError(
+                "Reasoner batch mixes video-conditioned and text-only samples "
+                f"({n_vid}/{len(raw_videos)} have a video vision_path). Split into separate batches."
+            )
+        images: list[Image.Image] | None = cast(list[Image.Image], raw_images) if n_img == len(raw_images) else None
+        videos: list[str] | None = (
+            cast(list[str], raw_videos) if raw_videos is not None and n_vid == len(raw_videos) else None
+        )
+
+        try:
+            with sync_distributed_errors():
+                for sa, prompt in zip(sample_args_list, prompts):
+                    if self.should_process_sample(sa) and not warmup:
+                        log.debug(f"{sa.__class__.__name__}({sa})")
+                        assert sa.output_dir is not None
+                        sa.output_dir.mkdir(parents=True, exist_ok=True)
+                        (sa.output_dir / "sample_args.json").write_text(sa.model_dump_json())
+                        self._run_text_guardrail(str(sa.output_dir), prompt)
+        except Exception as e:
+            return [
+                self._handle_sample_exception(sa, e)
+                for sa in sample_args_list
+                if self.should_process_sample(sa) and not warmup
+            ]
+
+        with self._get_timer(f"{self.model.__class__.__name__}.generate_reasoner_text"):
+            texts = self.model.generate_reasoner_text(
+                prompts,
+                max_new_tokens=sample_args_list[0].max_new_tokens,
+                images=images,
+                videos=videos,
+                video_sampling_kwargs=video_sampling_kwargs or None,
+                do_sample=sample_args_list[0].do_sample,
+                temperature=sample_args_list[0].temperature,
+                top_k=sample_args_list[0].top_k,
+                top_p=sample_args_list[0].top_p,
+                repetition_penalty=sample_args_list[0].repetition_penalty,
+                presence_penalty=sample_args_list[0].presence_penalty,
+                seed=sample_args_list[0].seed,
+            )
+```
+
+(Confirm `Any` and `cast` are already imported in `inference.py`; both are used elsewhere in the file, so no new import is needed.)
+
+- [ ] **Step 7: Import/lint check + run builder tests again**
+
+Run (inside container):
+`python -c "import cosmos_framework.inference.inference"`
+`ruff check cosmos_framework/inference/inference.py cosmos_framework/inference/inference_test.py`
+`pytest cosmos_framework/inference/inference_test.py -k reasoner_sample_data -v`
+Expected: no errors; tests PASS.
+
+- [ ] **Step 8: Commit**
+
+```bash
+git add cosmos_framework/inference/inference.py cosmos_framework/inference/inference_test.py
+git commit -m "feat(reasoner): route mp4 vision_path to video conditioning in inference engine"
+```
+
+---
+
+## Task 8: Example input + user docs
+
+**Files:**
+- Create: `inputs/reasoner/reasoner_video.json`
+- Modify: `docs/inference.md`
+
+- [ ] **Step 1: Create the example input**
+
+`inputs/reasoner/reasoner_video.json`:
+
+```json
+{
+    "model_mode": "reasoner",
+    "prompt": "Describe what happens in this video in one sentence.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/2b17a2413bd86b2cf9b03823637108851e4ddf2d/inputs/vision/robot_153.jpg"
+}
+```
+
+NOTE: the `vision_path` above is a placeholder that points at a `.jpg`, so as written this example is routed down the image path, not the video path. Replace it with a real `.mp4` URL or local path before running. If a canonical sample mp4 exists under the cosmos-dependencies repo, use that; otherwise leave a local-path example and document it. (Confirm a sample clip during Task 9; update this file to point at it.)
+
+- [ ] **Step 2: Document in `docs/inference.md`**
+
+In the Modes table (around lines 138-146), the reasoner mode is currently documented as text/image only. Add a row or note documenting video input for `reasoner`. Then find the reasoner documentation block and add:
+
+```markdown
+For `model_mode=reasoner`, `vision_path` may point to an **image** (`.jpg`/`.png`/…) or a **video** (`.mp4`/…). A video is decoded by the Qwen3-VL processor and sampled into frames. Optional frame-sampling controls (all default to the processor's defaults):
+
+- `video_fps`: frames sampled per second (mutually exclusive with `video_num_frames`).
+- `video_num_frames`: fixed number of frames to sample.
+- `video_min_frames` / `video_max_frames`: bounds on the sampled frame count.
+- `video_min_pixels` / `video_max_pixels`: per-frame pixel budget (drives resolution).
+
+Example: [`inputs/reasoner/reasoner_video.json`](../inputs/reasoner/reasoner_video.json).
+```
+
+- [ ] **Step 3: Verify the example JSON parses**
+
+Run: `python -c "import json; json.load(open('inputs/reasoner/reasoner_video.json'))"`
+Expected: no exception.
+
+- [ ] **Step 4: Commit**
+
+```bash
+git add inputs/reasoner/reasoner_video.json docs/inference.md
+git commit -m "docs(reasoner): document video input + add reasoner_video example"
+```
+
+---
+
+## Task 9: Manual end-to-end GPU verification
+
+**Files:** none (verification only). Use the `cosmos3-run-env` skill to author the wrapper and `slurm-node` to run on a GPU node in the i4 container.
+
+This is the real correctness gate (per the spec: manual verification only). Do NOT mark the feature complete until this passes.
+
+- [ ] **Step 1: Obtain a short sample mp4**
+
+Place a short clip at a known path, e.g. `tmp_inputs/clip.mp4` (a few seconds is enough). Update `inputs/reasoner/reasoner_video.json`'s `vision_path` to that absolute path (or a real mp4 URL).
+
+- [ ] **Step 2: Run reasoner video inference on Cosmos3-Nano**
+
+```bash
+torchrun --nproc-per-node=8 -m cosmos_framework.scripts.inference \
+    --parallelism-preset=throughput --dp-shard-size=8 --dp-replicate-size=1 \
+    --cp-size=1 --cfgp-size=1 \
+    -i "inputs/reasoner/reasoner_video.json" \
+    -o outputs/reasoner_video --checkpoint-path Cosmos3-Nano --seed=0
+```
+
+Expected: completes without error; `outputs/reasoner_video/reasoner_video/reasoner_text.txt` exists and contains non-empty, on-topic text describing the clip.
+
+- [ ] **Step 3: Repeat for Cosmos3-Super**
+
+Same command with `--checkpoint-path Cosmos3-Super`. Expected: same success criteria.
+
+- [ ] **Step 4: Regression — confirm image and text-only reasoner still work**
+
+```bash
+torchrun --nproc-per-node=8 -m cosmos_framework.scripts.inference \
+    --parallelism-preset=throughput --dp-shard-size=8 --dp-replicate-size=1 \
+    --cp-size=1 --cfgp-size=1 \
+    -i "inputs/reasoner/reasoner.json" -i "inputs/reasoner/reasoner_image.json" \
+    -o outputs/reasoner_regress --checkpoint-path Cosmos3-Nano --seed=0
+```
+
+Expected: both produce `reasoner_text.txt` with non-empty text, unchanged from pre-change behavior.
+
+- [ ] **Step 5: Sampling-knob smoke check**
+
+Add `"video_fps": 1` (then, in a separate run, `"video_num_frames": 8`) to the input JSON and re-run Step 2. Expected: still succeeds. Then confirm that setting `video_fps` and `video_num_frames` together is rejected with the mutual-exclusion error (validates Task 1 end-to-end). If the processor rejects a kwarg name, adjust the forwarding in `omni_mot_model.py` (Task 6 Step 3: route via the processor's video-loading kwargs) and re-run.
+
+- [ ] **Step 6: Record results**
+
+Note in the PR description: which checkpoints were run, the generated text samples, and confirmation that image/text-only paths are unaffected.
+
+---
+
+## Self-review notes
+
+- **Spec coverage:** schema fields + mutual exclusion (Task 1, spec §args), defaults (Task 2), `prepare_multimodal_reasoner_inputs` video branch (Task 3, spec §component 1), `_impl` + guards (Task 4, spec §component 2 + §validation), pass-throughs (Task 5, spec §component 3), `OmniMoTModel` video block (Task 6, spec §component 4), inference routing + batch homogeneity (Task 7, spec §component 5 + §validation), example + docs (Task 8, spec §files-touched), manual verification (Task 9, spec §verification). All spec sections mapped.
+- **Naming consistency:** `pixel_values_videos` / `video_grid_thw` (model layers), `reasoner_videos` / `video_sampling_kwargs` (data_batch keys), `videos` / `video_sampling_kwargs` (`OmniMoTModel.generate_reasoner_text` params), `video_*` (sample-arg fields) — used consistently across tasks.
+- **Known flag:** the exact `apply_chat_template` video-sampling kwarg surface (Task 6 Step 3) is transformers-version-dependent and is still to be confirmed in Task 9 Step 5; the fallback is documented inline.
+- **Import paths (resolved):** `VIDEO_EXTENSIONS` is exported from `cosmos_framework.inference.common.args`; `inference.py` already imports `Path`/`Any`/`cast`/`Image`. No other new imports required.
+```
\ No newline at end of file

From 6ece15f006cdd58ae999bcf1ec5045c076d854e5 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 00:43:13 -0700
Subject: [PATCH 03/20] feat(reasoner): add video_* sampling fields +
 mutual-exclusion validation

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 cosmos_framework/inference/args.py      | 25 +++++++++++++++++++++++++
 cosmos_framework/inference/args_test.py | 22 ++++++++++++++++++++++
 2 files changed, 47 insertions(+)

diff --git a/cosmos_framework/inference/args.py b/cosmos_framework/inference/args.py
index 3c234f7..d289697 100644
--- a/cosmos_framework/inference/args.py
+++ b/cosmos_framework/inference/args.py
@@ -609,6 +609,12 @@ class ReasonerDataArgs(ArgsBase):
     top_p: _ReasonerTopP | None = None
     repetition_penalty: _ReasonerRepetitionPenalty | None = None
     presence_penalty: float | None = None
+    video_fps: float | None = None
+    video_num_frames: pydantic.PositiveInt | None = None
+    video_min_frames: pydantic.PositiveInt | None = None
+    video_max_frames: pydantic.PositiveInt | None = None
+    video_min_pixels: pydantic.PositiveInt | None = None
+    video_max_pixels: pydantic.PositiveInt | None = None
 
 
 class ReasonerDataOverrides(OverridesBase):
@@ -629,6 +635,24 @@ class ReasonerDataOverrides(OverridesBase):
     """CTRL/HF-style multiplicative repetition penalty (>0). ``1.0`` is identity."""
     presence_penalty: float | None = None
     """Additive presence penalty (any sign). ``0.0`` is identity."""
+    video_fps: float | None = None
+    """Frames per second to sample from a video vision_path. Mutually exclusive with video_num_frames. None -> processor default."""
+    video_num_frames: pydantic.PositiveInt | None = None
+    """Fixed number of frames to sample from a video vision_path. Mutually exclusive with video_fps. None -> processor default."""
+    video_min_frames: pydantic.PositiveInt | None = None
+    """Lower bound on sampled frame count. None -> processor default."""
+    video_max_frames: pydantic.PositiveInt | None = None
+    """Upper bound on sampled frame count. None -> processor default."""
+    video_min_pixels: pydantic.PositiveInt | None = None
+    """Lower bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
+    video_max_pixels: pydantic.PositiveInt | None = None
+    """Upper bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
+
+    def _validate_video_sampling(self) -> None:
+        if self.video_fps is not None and self.video_num_frames is not None:
+            raise ValueError(
+                "video_fps and video_num_frames are mutually exclusive — set at most one."
+            )
 
     def _build_reasoner_data(self, model_config: "OmniMoTModelConfig", sample_meta: SampleMeta):
         if not sample_meta.model_mode.is_reasoner:
@@ -636,6 +660,7 @@ def _build_reasoner_data(self, model_config: "OmniMoTModelConfig", sample_meta:
         self = cast("SampleDataOverrides", self)
         if not self.prompt.strip():
             raise ValueError("Reasoner inference requires a non-empty 'prompt'.")
+        self._validate_video_sampling()
 
 
 class _TransferDataBase:
diff --git a/cosmos_framework/inference/args_test.py b/cosmos_framework/inference/args_test.py
index 3bf3703..75bfe66 100644
--- a/cosmos_framework/inference/args_test.py
+++ b/cosmos_framework/inference/args_test.py
@@ -15,6 +15,7 @@
     ModelMode,
     OmniSampleOverrides,
     OmniSetupOverrides,
+    ReasonerDataOverrides,
 )
 from cosmos_framework.inference.common.config import structure_config
 
@@ -156,3 +157,24 @@ def test_sample_args(tmp_path: Path):
     assert text2image_args.num_steps == 50
     assert text2image_args.guidance == 4.0
     assert text2image_args.shift == 3.0
+
+
+def test_reasoner_video_fields_default_none():
+    ov = ReasonerDataOverrides()
+    assert ov.video_fps is None
+    assert ov.video_num_frames is None
+    assert ov.video_min_frames is None
+    assert ov.video_max_frames is None
+    assert ov.video_min_pixels is None
+    assert ov.video_max_pixels is None
+
+
+def test_reasoner_video_fps_and_num_frames_mutually_exclusive():
+    ov = ReasonerDataOverrides(video_fps=2, video_num_frames=16)
+    with pytest.raises(ValueError, match="video_fps.*video_num_frames|mutually exclusive"):
+        ov._validate_video_sampling()
+
+
+def test_reasoner_video_fps_alone_ok():
+    ov = ReasonerDataOverrides(video_fps=2)
+    ov._validate_video_sampling()  # no raise

From 78a25a246eba06f8757b029f78a82120255b7da8 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 00:49:11 -0700
Subject: [PATCH 04/20] refactor(reasoner): video_fps PositiveFloat +
 construction-time mutual-exclusion validator

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cosmos_framework/inference/args.py      | 13 ++++++-------
 cosmos_framework/inference/args_test.py |  6 ++----
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/cosmos_framework/inference/args.py b/cosmos_framework/inference/args.py
index d289697..29b9fe5 100644
--- a/cosmos_framework/inference/args.py
+++ b/cosmos_framework/inference/args.py
@@ -609,7 +609,7 @@ class ReasonerDataArgs(ArgsBase):
     top_p: _ReasonerTopP | None = None
     repetition_penalty: _ReasonerRepetitionPenalty | None = None
     presence_penalty: float | None = None
-    video_fps: float | None = None
+    video_fps: pydantic.PositiveFloat | None = None
     video_num_frames: pydantic.PositiveInt | None = None
     video_min_frames: pydantic.PositiveInt | None = None
     video_max_frames: pydantic.PositiveInt | None = None
@@ -635,7 +635,7 @@ class ReasonerDataOverrides(OverridesBase):
     """CTRL/HF-style multiplicative repetition penalty (>0). ``1.0`` is identity."""
     presence_penalty: float | None = None
     """Additive presence penalty (any sign). ``0.0`` is identity."""
-    video_fps: float | None = None
+    video_fps: pydantic.PositiveFloat | None = None
     """Frames per second to sample from a video vision_path. Mutually exclusive with video_num_frames. None -> processor default."""
     video_num_frames: pydantic.PositiveInt | None = None
     """Fixed number of frames to sample from a video vision_path. Mutually exclusive with video_fps. None -> processor default."""
@@ -648,11 +648,11 @@ class ReasonerDataOverrides(OverridesBase):
     video_max_pixels: pydantic.PositiveInt | None = None
     """Upper bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
 
-    def _validate_video_sampling(self) -> None:
+    @pydantic.model_validator(mode="after")
+    def _validate_video_sampling(self) -> Self:
         if self.video_fps is not None and self.video_num_frames is not None:
-            raise ValueError(
-                "video_fps and video_num_frames are mutually exclusive — set at most one."
-            )
+            raise ValueError("video_fps and video_num_frames are mutually exclusive; set at most one.")
+        return self
 
     def _build_reasoner_data(self, model_config: "OmniMoTModelConfig", sample_meta: SampleMeta):
         if not sample_meta.model_mode.is_reasoner:
@@ -660,7 +660,6 @@ def _build_reasoner_data(self, model_config: "OmniMoTModelConfig", sample_meta:
         self = cast("SampleDataOverrides", self)
         if not self.prompt.strip():
             raise ValueError("Reasoner inference requires a non-empty 'prompt'.")
-        self._validate_video_sampling()
 
 
 class _TransferDataBase:
diff --git a/cosmos_framework/inference/args_test.py b/cosmos_framework/inference/args_test.py
index 75bfe66..982d36e 100644
--- a/cosmos_framework/inference/args_test.py
+++ b/cosmos_framework/inference/args_test.py
@@ -170,11 +170,9 @@ def test_reasoner_video_fields_default_none():
 
 
 def test_reasoner_video_fps_and_num_frames_mutually_exclusive():
-    ov = ReasonerDataOverrides(video_fps=2, video_num_frames=16)
     with pytest.raises(ValueError, match="video_fps.*video_num_frames|mutually exclusive"):
-        ov._validate_video_sampling()
+        ReasonerDataOverrides(video_fps=2, video_num_frames=16)
 
 
 def test_reasoner_video_fps_alone_ok():
-    ov = ReasonerDataOverrides(video_fps=2)
-    ov._validate_video_sampling()  # no raise
+    ReasonerDataOverrides(video_fps=2)  # no raise

From 6859a709564d1a2078608f7f2c0cc77f4ac8a07a Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 00:53:26 -0700
Subject: [PATCH 05/20] feat(reasoner): add video_* defaults (null) to reasoner
 sample_args

---
 .../inference/defaults/reasoner/sample_args.json          | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cosmos_framework/inference/defaults/reasoner/sample_args.json b/cosmos_framework/inference/defaults/reasoner/sample_args.json
index cc53991..a3d7569 100644
--- a/cosmos_framework/inference/defaults/reasoner/sample_args.json
+++ b/cosmos_framework/inference/defaults/reasoner/sample_args.json
@@ -6,5 +6,11 @@
     "top_k": null,
     "top_p": null,
     "repetition_penalty": 1.0,
-    "presence_penalty": 0.0
+    "presence_penalty": 0.0,
+    "video_fps": null,
+    "video_num_frames": null,
+    "video_min_frames": null,
+    "video_max_frames": null,
+    "video_min_pixels": null,
+    "video_max_pixels": null
 }

From 5626909d7c6465ae28ffd0915506f6228779834e Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 00:55:51 -0700
Subject: [PATCH 06/20] feat(reasoner): video branch in
 prepare_multimodal_reasoner_inputs

---
 .../model/vfm/vlm/qwen3_vl/utils.py           | 60 ++++++++++++-------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py b/cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py
index 5a8ecff..4768b08 100644
--- a/cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py
+++ b/cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py
@@ -497,8 +497,10 @@ def get_placeholder_mask(
 def prepare_multimodal_reasoner_inputs(
     causal_lm: Any,
     input_ids: torch.Tensor,  # [B,T_prompt]
-    pixel_values: torch.Tensor,  # [N_patches,C,H,W]
-    image_grid_thw: torch.Tensor,  # [num_images,3]
+    pixel_values: torch.Tensor | None = None,  # [N_patches,C,H,W]
+    image_grid_thw: torch.Tensor | None = None,  # [num_images,3]
+    pixel_values_videos: torch.Tensor | None = None,  # [N_patches,C,H,W]
+    video_grid_thw: torch.Tensor | None = None,  # [num_videos,3]
     attention_mask: Optional[torch.Tensor] = None,
 ) -> tuple[
     torch.Tensor,  # inputs_embeds [B,T_prompt,hidden_size]
@@ -525,11 +527,11 @@ def prepare_multimodal_reasoner_inputs(
     ``*TextModel.reasoner_forward`` instead of HF's full
     ``self.language_model(...)`` forward, so HF's
     ``past_key_values`` / ``cache_position`` lifecycle is replaced by
-    the AR loop's :class:`ReasonerKVCache` lifecycle.  Videos and
-    dual image+video paths are not supported here; only
-    ``image_grid_thw`` is consumed — matching the public
-    ``generate_reasoner_text`` API, which has no
-    ``pixel_values_videos`` / ``video_grid_thw`` parameters.
+    the AR loop's :class:`ReasonerKVCache` lifecycle.  Either the
+    image pair (``pixel_values`` + ``image_grid_thw``) or the
+    video pair (``pixel_values_videos`` + ``video_grid_thw``) is consumed —
+    not both; when ``pixel_values_videos`` is given the image pair is ignored.
+    The video recipe mirrors the image one, using the video placeholder mask and ``video_grid_thw`` rope index.
 
     Validation: ``get_placeholder_mask`` raises ``ValueError`` if the
     number of image placeholder tokens in ``input_ids`` does not match
@@ -574,21 +576,36 @@ def prepare_multimodal_reasoner_inputs(
             mrope_position_deltas: Per-sample rope delta used by the
                 caller to extend positions during decode.
     """
+    is_video = pixel_values_videos is not None
     inputs_embeds = causal_lm.model.embed_tokens(input_ids).clone()  # [B,T_prompt,hidden_size]
-    pixel_values = pixel_values.to(device=inputs_embeds.device)
-    image_grid_thw = image_grid_thw.to(device=inputs_embeds.device)
-
-    image_embeds, deepstack_visual_embeds = get_image_features(causal_lm, pixel_values, image_grid_thw)
-    image_embeds = torch.cat(image_embeds, dim=0).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
 
-    image_mask, _video_mask = get_placeholder_mask(
-        causal_lm,
-        input_ids,
-        inputs_embeds=inputs_embeds,
-        image_features=image_embeds,
-    )
-    inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # [B,T_prompt,hidden_size]
-    visual_pos_masks = image_mask[..., 0]  # [B,T_prompt]
+    if is_video:
+        pixel_values_videos = pixel_values_videos.to(device=inputs_embeds.device)
+        video_grid_thw = video_grid_thw.to(device=inputs_embeds.device)
+        # get_video_features == get_image_features (same visual tower); reuse the free helper.
+        video_embeds, deepstack_visual_embeds = get_image_features(causal_lm, pixel_values_videos, video_grid_thw)
+        video_embeds = torch.cat(video_embeds, dim=0).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
+        _image_mask, video_mask = get_placeholder_mask(
+            causal_lm,
+            input_ids,
+            inputs_embeds=inputs_embeds,
+            video_features=video_embeds,
+        )
+        inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)  # [B,T_prompt,hidden_size]
+        visual_pos_masks = video_mask[..., 0]  # [B,T_prompt]
+    else:
+        pixel_values = pixel_values.to(device=inputs_embeds.device)
+        image_grid_thw = image_grid_thw.to(device=inputs_embeds.device)
+        image_embeds, deepstack_visual_embeds = get_image_features(causal_lm, pixel_values, image_grid_thw)
+        image_embeds = torch.cat(image_embeds, dim=0).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
+        image_mask, _video_mask = get_placeholder_mask(
+            causal_lm,
+            input_ids,
+            inputs_embeds=inputs_embeds,
+            image_features=image_embeds,
+        )
+        inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # [B,T_prompt,hidden_size]
+        visual_pos_masks = image_mask[..., 0]  # [B,T_prompt]
 
     deepstack_visual_embeds = [
         embed.to(device=inputs_embeds.device, dtype=inputs_embeds.dtype) for embed in deepstack_visual_embeds
@@ -597,7 +614,8 @@ def prepare_multimodal_reasoner_inputs(
     position_ids, mrope_position_deltas = get_rope_index(
         causal_lm,
         input_ids=input_ids,
-        image_grid_thw=image_grid_thw,
+        image_grid_thw=None if is_video else image_grid_thw,
+        video_grid_thw=video_grid_thw if is_video else None,
         attention_mask=attention_mask,
     )
 

From 86a7e98e36a639709fbdcf6ddc16c36f15f4c919 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 00:59:58 -0700
Subject: [PATCH 07/20] feat(reasoner): accept video tensors in
 _impl_generate_reasoner_text

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 cosmos_framework/model/vfm/mot/unified_mot.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/cosmos_framework/model/vfm/mot/unified_mot.py b/cosmos_framework/model/vfm/mot/unified_mot.py
index 4908e7a..ccaad14 100644
--- a/cosmos_framework/model/vfm/mot/unified_mot.py
+++ b/cosmos_framework/model/vfm/mot/unified_mot.py
@@ -1494,6 +1494,8 @@ def _impl_generate_reasoner_text(
     *,
     pixel_values: torch.Tensor | None = None,
     image_grid_thw: torch.Tensor | None = None,
+    pixel_values_videos: torch.Tensor | None = None,
+    video_grid_thw: torch.Tensor | None = None,
     attention_mask: torch.Tensor | None = None,
     eos_token_id: int | list[int] | None = None,
     pad_token_id: int | None = None,
@@ -1550,10 +1552,9 @@ def _impl_generate_reasoner_text(
             ``Qwen3VLProcessor`` emits — pass it through unchanged.
             Moved to the prompt's device internally.  ``None`` (default)
             means text-only prompt; in that case the multimodal prefill
-            path is skipped entirely.  Videos are *not* supported here —
-            this function has no ``pixel_values_videos`` / ``video_grid_thw``
-            parameters; for I2V conditioning, frames must be passed as
-            images.
+            path is skipped entirely unless the video pair is supplied.  For
+            video conditioning, pass ``pixel_values_videos`` + ``video_grid_thw``
+            instead (mutually exclusive with the image pair).
         image_grid_thw: Optional ``[num_images, 3]`` long tensor giving
             ``(t, h, w)`` — the temporal / height / width feature-grid
             size per image as produced by ``Qwen3VLProcessor`` (``t`` is
@@ -1643,11 +1644,15 @@ def _impl_generate_reasoner_text(
 
     if (pixel_values is None) != (image_grid_thw is None):
         raise ValueError("pixel_values and image_grid_thw must be provided together.")
+    if (pixel_values_videos is None) != (video_grid_thw is None):
+        raise ValueError("pixel_values_videos and video_grid_thw must be provided together.")
+    if pixel_values is not None and pixel_values_videos is not None:
+        raise ValueError("Reasoner conditions on one medium at a time: pass image OR video, not both.")
 
     _prefill_start = time.time()
 
     mrope_position_deltas: torch.Tensor | None = None
-    if pixel_values is None:
+    if pixel_values is None and pixel_values_videos is None:
         hidden = model.reasoner_forward(input_ids, cache=cache)  # [B,T_prompt,hidden_size]
     else:
         if not hasattr(causal_lm, "visual"):
@@ -1663,6 +1668,8 @@ def _impl_generate_reasoner_text(
             input_ids=input_ids,
             pixel_values=pixel_values,
             image_grid_thw=image_grid_thw,
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
             attention_mask=attention_mask,
         )
         hidden = model.reasoner_forward(

From 3f41015a6d96ef42e9ee0d22e4a3d87111807dfb Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 01:07:25 -0700
Subject: [PATCH 08/20] feat(reasoner): forward video tensors through
 generate_reasoner_text pass-throughs

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../model/vfm/mot/cosmos3_vfm_network.py        | 12 +++++++++---
 cosmos_framework/model/vfm/mot/unified_mot.py   | 17 +++++++++++++++++
 2 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py b/cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py
index 03f0c3f..c643629 100644
--- a/cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py
+++ b/cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py
@@ -276,6 +276,8 @@ def generate_reasoner_text(
         *,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_grid_thw: torch.Tensor | None = None,
         attention_mask: torch.Tensor | None = None,
         eos_token_id: int | list[int] | None = None,
         pad_token_id: int | None = None,
@@ -296,9 +298,11 @@ def generate_reasoner_text(
         prompts through this single entry point: pass
         ``pixel_values`` + ``image_grid_thw`` (and optionally
         ``attention_mask``) for image-conditioned prefill via the Qwen3-VL
-        visual encoder, or omit them for text-only prefill.  Uses the
-        und-pathway weights (those WITHOUT the ``_moe_gen`` suffix) plus
-        ``embed_tokens`` / ``norm`` / ``lm_head``; the generation pathway
+        visual encoder, or omit them for text-only prefill.  Video
+        conditioning is also supported via ``pixel_values_videos`` +
+        ``video_grid_thw``; the image and video pairs are mutually exclusive.
+        Uses the und-pathway weights (those WITHOUT the ``_moe_gen`` suffix)
+        plus ``embed_tokens`` / ``norm`` / ``lm_head``; the generation pathway
         and all VFM-level multimodal embedders / heads (``vae2llm``,
         ``llm2vae``, ``sound2llm``, etc.) are bypassed.
 
@@ -327,6 +331,8 @@ def generate_reasoner_text(
             max_new_tokens=max_new_tokens,
             pixel_values=pixel_values,
             image_grid_thw=image_grid_thw,
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
             attention_mask=attention_mask,
             eos_token_id=eos_token_id,
             pad_token_id=pad_token_id,
diff --git a/cosmos_framework/model/vfm/mot/unified_mot.py b/cosmos_framework/model/vfm/mot/unified_mot.py
index ccaad14..abe8639 100644
--- a/cosmos_framework/model/vfm/mot/unified_mot.py
+++ b/cosmos_framework/model/vfm/mot/unified_mot.py
@@ -1943,6 +1943,8 @@ def generate_reasoner_text(
         *,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_grid_thw: torch.Tensor | None = None,
         attention_mask: torch.Tensor | None = None,
         eos_token_id: int | list[int] | None = None,
         pad_token_id: int | None = None,
@@ -1963,6 +1965,8 @@ def generate_reasoner_text(
         the Qwen3-VL visual encoder; omit them for text-only prefill.  The
         two arguments are mutually required: passing exactly one raises
         ``ValueError`` inside :func:`_impl_generate_reasoner_text`.
+        Video conditioning is also supported via ``pixel_values_videos`` +
+        ``video_grid_thw``; the image and video pairs are mutually exclusive.
 
         Uses the und-pathway weights (those WITHOUT the ``_moe_gen`` suffix)
         plus the model-level ``embed_tokens`` / ``norm`` / ``lm_head``, and —
@@ -1977,6 +1981,8 @@ def generate_reasoner_text(
             max_new_tokens=max_new_tokens,
             pixel_values=pixel_values,
             image_grid_thw=image_grid_thw,
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
             attention_mask=attention_mask,
             eos_token_id=eos_token_id,
             pad_token_id=pad_token_id,
@@ -2071,6 +2077,8 @@ def generate_reasoner_text(
         *,
         pixel_values: torch.Tensor | None = None,
         image_grid_thw: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_grid_thw: torch.Tensor | None = None,
         attention_mask: torch.Tensor | None = None,
         eos_token_id: int | list[int] | None = None,
         pad_token_id: int | None = None,
@@ -2091,6 +2099,8 @@ def generate_reasoner_text(
         the Qwen3-VL visual encoder; omit them for text-only prefill.  The
         two arguments are mutually required: passing exactly one raises
         ``ValueError`` inside :func:`_impl_generate_reasoner_text`.
+        Video conditioning is also supported via ``pixel_values_videos`` +
+        ``video_grid_thw``; the image and video pairs are mutually exclusive.
 
         Uses the und-pathway weights (those WITHOUT the ``_moe_gen`` suffix)
         plus the model-level ``embed_tokens`` / ``norm`` / ``lm_head``, and —
@@ -2106,6 +2116,8 @@ def generate_reasoner_text(
             max_new_tokens=max_new_tokens,
             pixel_values=pixel_values,
             image_grid_thw=image_grid_thw,
+            pixel_values_videos=pixel_values_videos,
+            video_grid_thw=video_grid_thw,
             attention_mask=attention_mask,
             eos_token_id=eos_token_id,
             pad_token_id=pad_token_id,
@@ -2193,6 +2205,11 @@ def generate_reasoner_text(
         input_ids: torch.Tensor,
         max_new_tokens: int,
         *,
+        pixel_values: torch.Tensor | None = None,
+        image_grid_thw: torch.Tensor | None = None,
+        pixel_values_videos: torch.Tensor | None = None,
+        video_grid_thw: torch.Tensor | None = None,
+        attention_mask: torch.Tensor | None = None,
         eos_token_id: int | list[int] | None = None,
         pad_token_id: int | None = None,
         do_sample: bool = False,

From c7d98743d535622486bc3c5a9a753c1f27788361 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 01:09:57 -0700
Subject: [PATCH 09/20] fix(reasoner): revert out-of-scope param additions to
 Nemotron generate_reasoner_text stub

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 cosmos_framework/model/vfm/mot/unified_mot.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cosmos_framework/model/vfm/mot/unified_mot.py b/cosmos_framework/model/vfm/mot/unified_mot.py
index abe8639..673d0a0 100644
--- a/cosmos_framework/model/vfm/mot/unified_mot.py
+++ b/cosmos_framework/model/vfm/mot/unified_mot.py
@@ -2205,11 +2205,6 @@ def generate_reasoner_text(
         input_ids: torch.Tensor,
         max_new_tokens: int,
         *,
-        pixel_values: torch.Tensor | None = None,
-        image_grid_thw: torch.Tensor | None = None,
-        pixel_values_videos: torch.Tensor | None = None,
-        video_grid_thw: torch.Tensor | None = None,
-        attention_mask: torch.Tensor | None = None,
         eos_token_id: int | list[int] | None = None,
         pad_token_id: int | None = None,
         do_sample: bool = False,

From ff1d67df13743bd353771aad357cb94128cd29c9 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 01:15:27 -0700
Subject: [PATCH 10/20] feat(reasoner): videos param + video chat block in
 OmniMoTModel.generate_reasoner_text

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 cosmos_framework/model/vfm/omni_mot_model.py | 111 ++++++++++++-------
 1 file changed, 72 insertions(+), 39 deletions(-)

diff --git a/cosmos_framework/model/vfm/omni_mot_model.py b/cosmos_framework/model/vfm/omni_mot_model.py
index 9f71a00..8d5e98f 100644
--- a/cosmos_framework/model/vfm/omni_mot_model.py
+++ b/cosmos_framework/model/vfm/omni_mot_model.py
@@ -3763,6 +3763,8 @@ def generate_reasoner_text(
         max_new_tokens: int,
         *,
         images: list[Any] | None = None,
+        videos: list[Any] | None = None,
+        video_sampling_kwargs: dict[str, Any] | None = None,
         prompt_builder: Callable[[str], list[dict[str, Any]]] | None = None,
         do_sample: bool = False,
         temperature: float | None = 1.0,
@@ -3835,6 +3837,14 @@ def generate_reasoner_text(
                 ``processor.apply_chat_template``, so any input it
                 accepts works (file path ``str``, ``PIL.Image.Image``,
                 ``np.ndarray``, or a CHW / HWC tensor).
+            videos: Optional per-prompt conditioning videos (mutually
+                exclusive with ``images``). Each entry is forwarded into a
+                ``{"type": "video", "video": ...}`` chat block; the
+                processor decodes/samples frames and produces
+                ``pixel_values_videos`` / ``video_grid_thw``.
+            video_sampling_kwargs: Optional frame-sampling controls (fps,
+                num_frames, min_frames, max_frames, min_pixels, max_pixels);
+                ``None`` values are dropped, the rest go to the processor for ``videos`` only.
             prompt_builder: Optional callback that maps a raw prompt
                 string to a chat-style messages list (e.g.
                 :func:`projects.cosmos3.vfm.upsampler.prompts.build_messages`
@@ -3904,22 +3914,28 @@ def generate_reasoner_text(
         # image-list contract here so the failure happens before any
         # decoding work — far easier to debug than a downstream
         # ``apply_chat_template`` error.
-        use_multimodal = images is not None
+        if images is not None and videos is not None:
+            raise ValueError("generate_reasoner_text conditions on one medium at a time: pass `images` OR `videos`, not both.")
+        use_image = images is not None
+        use_video = videos is not None
+        use_multimodal = use_image or use_video
+        media = images if use_image else videos
         if use_multimodal:
-            assert images is not None  # narrowed by `use_multimodal`
-            if len(images) != len(inputs):
+            assert media is not None  # narrowed by `use_multimodal`
+            if len(media) != len(inputs):
                 raise ValueError(
-                    f"generate_reasoner_text: `images` length ({len(images)}) "
+                    f"generate_reasoner_text: media length ({len(media)}) "
                     f"must equal `inputs` length ({len(inputs)}) for the "
-                    "image-conditioned flow."
+                    "vision-conditioned flow."
                 )
             if not callable(getattr(self.vlm_processor, "apply_chat_template", None)):
                 raise RuntimeError(
-                    "generate_reasoner_text(images=...) requires a multimodal "
+                    "generate_reasoner_text(images=/videos=...) requires a multimodal "
                     "VLM processor (e.g. Qwen3VLProcessor) but the live processor "
                     f"{type(self.vlm_processor).__name__!r} does not implement "
                     "apply_chat_template — the live VLM is configured as text-only."
                 )
+        video_kwargs = {k: v for k, v in (video_sampling_kwargs or {}).items() if v is not None}
 
         # Resolve EOS / pad ids internally so callers don't have to know
         # about VLM-specific id wiring.  EOS comes from the cached VLM
@@ -3957,55 +3973,72 @@ def generate_reasoner_text(
                 messages = [{"role": "user", "content": prompt}]
 
             if use_multimodal:
-                assert images is not None  # narrowed by `use_multimodal`
-                # Replace the LAST user message's content with a Qwen3-VL
-                # multimodal block (image + text).  Earlier messages
-                # (system, prior turns) are kept verbatim so any chat
-                # scaffolding the callback added still governs the
-                # assistant response.
+                assert media is not None  # narrowed by `use_multimodal`
                 last_user = messages[-1]
                 last_text = last_user["content"] if isinstance(last_user.get("content"), str) else ""
+                if use_video:
+                    media_item: dict[str, Any] = {"type": "video", "video": media[idx]}
+                else:
+                    media_item = {"type": "image", "image": media[idx]}
                 multimodal_messages = list(messages[:-1])
                 multimodal_messages.append(
                     {
                         "role": "user",
-                        "content": [
-                            {"type": "image", "image": images[idx]},
-                            {"type": "text", "text": last_text},
-                        ],
+                        "content": [media_item, {"type": "text", "text": last_text}],
                     }
                 )
+                # video_kwargs (fps/num_frames/min_frames/max_frames/min_pixels/max_pixels)
+                # are forwarded to the processor on the video path only (images get none). The accepted
+                # kwarg names depend on the installed transformers Qwen3VLProcessor; verified on GPU.
                 processor_inputs = self.vlm_processor.apply_chat_template(
                     multimodal_messages,
                     tokenize=True,
                     add_generation_prompt=True,
                     return_tensors="pt",
+                    **(video_kwargs if use_video else {}),
                 )
-                # ``Qwen3VLProcessor.apply_chat_template`` strips the
-                # leading batch dim from ``input_ids`` / ``attention_mask``
-                # (see its inline comment); restore it so the inner
-                # token-level call sees ``[B=1, T_prompt]``.
                 inner_input_ids = processor_inputs["input_ids"].to(device).unsqueeze(0)
                 inner_attention_mask = processor_inputs["attention_mask"].to(device).unsqueeze(0)
-                inner_pixel_values = processor_inputs["pixel_values"].to(device)  # [N_patches,C,H,W]
-                inner_image_grid_thw = processor_inputs["image_grid_thw"].to(device)  # [num_images,3]
-                out_ids = self.net.generate_reasoner_text(
-                    input_ids=inner_input_ids,
-                    max_new_tokens=max_new_tokens,
-                    pixel_values=inner_pixel_values,
-                    image_grid_thw=inner_image_grid_thw,
-                    attention_mask=inner_attention_mask,
-                    eos_token_id=eos_id,
-                    pad_token_id=pad_id,
-                    do_sample=do_sample,
-                    temperature=temperature if temperature is not None else 1.0,
-                    top_k=top_k,
-                    top_p=top_p,
-                    repetition_penalty=repetition_penalty,
-                    presence_penalty=presence_penalty,
-                    seed=seed,
-                    return_only_new_tokens=True,
-                )
+                if use_video:
+                    inner_pixel_values_videos = processor_inputs["pixel_values_videos"].to(device)
+                    inner_video_grid_thw = processor_inputs["video_grid_thw"].to(device)
+                    out_ids = self.net.generate_reasoner_text(
+                        input_ids=inner_input_ids,
+                        max_new_tokens=max_new_tokens,
+                        pixel_values_videos=inner_pixel_values_videos,
+                        video_grid_thw=inner_video_grid_thw,
+                        attention_mask=inner_attention_mask,
+                        eos_token_id=eos_id,
+                        pad_token_id=pad_id,
+                        do_sample=do_sample,
+                        temperature=temperature if temperature is not None else 1.0,
+                        top_k=top_k,
+                        top_p=top_p,
+                        repetition_penalty=repetition_penalty,
+                        presence_penalty=presence_penalty,
+                        seed=seed,
+                        return_only_new_tokens=True,
+                    )
+                else:
+                    inner_pixel_values = processor_inputs["pixel_values"].to(device)  # [N_patches,C,H,W]
+                    inner_image_grid_thw = processor_inputs["image_grid_thw"].to(device)  # [num_images,3]
+                    out_ids = self.net.generate_reasoner_text(
+                        input_ids=inner_input_ids,
+                        max_new_tokens=max_new_tokens,
+                        pixel_values=inner_pixel_values,
+                        image_grid_thw=inner_image_grid_thw,
+                        attention_mask=inner_attention_mask,
+                        eos_token_id=eos_id,
+                        pad_token_id=pad_id,
+                        do_sample=do_sample,
+                        temperature=temperature if temperature is not None else 1.0,
+                        top_k=top_k,
+                        top_p=top_p,
+                        repetition_penalty=repetition_penalty,
+                        presence_penalty=presence_penalty,
+                        seed=seed,
+                        return_only_new_tokens=True,
+                    )
             else:
                 # Text-only path.  Pull the system prompt (if any) and
                 # the last user message text out of the messages list,

From dbd7e86fffa690c916da53d616b29eb414295fc6 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 01:22:43 -0700
Subject: [PATCH 11/20] docs(reasoner): update generate_reasoner_text docstring
 for video path

---
 cosmos_framework/model/vfm/omni_mot_model.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/cosmos_framework/model/vfm/omni_mot_model.py b/cosmos_framework/model/vfm/omni_mot_model.py
index 8d5e98f..d40e072 100644
--- a/cosmos_framework/model/vfm/omni_mot_model.py
+++ b/cosmos_framework/model/vfm/omni_mot_model.py
@@ -3781,8 +3781,10 @@ def generate_reasoner_text(
         (or wraps the prompt as a single user message when no callback is
         given), (b) tokenizes it — text-only via :meth:`tokenize_text`, or
         multimodal via ``self.vlm_processor.apply_chat_template`` when
-        ``images`` is supplied (which lowers the chat into ``input_ids``,
-        ``attention_mask``, ``pixel_values``, and ``image_grid_thw``), (c)
+        ``images`` or ``videos`` is supplied (the image path lowers the chat
+        into ``input_ids``, ``attention_mask``, ``pixel_values``, and
+        ``image_grid_thw``; the video path yields ``pixel_values_videos`` and
+        ``video_grid_thw`` instead), (c)
         runs the reasoner-only AR decode loop through
         ``self.net.generate_reasoner_text`` (the lower-level token-driven
         pass-through that delegates to ``unified_mot._impl_generate_reasoner_text``),
@@ -3905,10 +3907,14 @@ def generate_reasoner_text(
 
         Raises:
             ValueError: If ``images`` length does not match ``inputs``
-                length.
-            RuntimeError: If ``images`` is supplied but the live VLM
-                processor does not implement ``apply_chat_template``
-                (i.e., the VLM is configured as text-only).
+                length, or if ``videos`` length does not match ``inputs``
+                length.  Also raised if both ``images`` and ``videos`` are
+                supplied simultaneously (only one medium is allowed per
+                call).
+            RuntimeError: If ``images`` or ``videos`` is supplied but the
+                live VLM processor does not implement
+                ``apply_chat_template`` (i.e., the VLM is configured as
+                text-only).
         """
         # Decide whether the multimodal flow is in play, and validate the
         # image-list contract here so the failure happens before any

From c31261cd2a6ec5691199426c5d2c2288f30b601e Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 01:28:07 -0700
Subject: [PATCH 12/20] feat(reasoner): route mp4 vision_path to video
 conditioning in inference engine

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 cosmos_framework/inference/inference.py      | 62 ++++++++++++++++----
 cosmos_framework/inference/inference_test.py | 46 +++++++++++++++
 2 files changed, 96 insertions(+), 12 deletions(-)

diff --git a/cosmos_framework/inference/inference.py b/cosmos_framework/inference/inference.py
index 83ff665..f659919 100644
--- a/cosmos_framework/inference/inference.py
+++ b/cosmos_framework/inference/inference.py
@@ -19,6 +19,8 @@
 from torch.utils.data import Dataset
 from typing_extensions import Self
 
+from cosmos_framework.configs.base.defaults.compile import CompileConfig
+from cosmos_framework.configs.base.defaults.parallelism import ParallelismConfig
 from cosmos_framework.inference.args import (
     ModelMode,
     NegativeMetadataMode,
@@ -26,6 +28,7 @@
     OmniSetupArgs,
 )
 from cosmos_framework.inference.common.args import (
+    VIDEO_EXTENSIONS,
     CheckpointType,
     ConfigFileType,
     ParallelismArgs,
@@ -46,13 +49,11 @@
     pil_to_conditioning_frames,
     resize_pil_image,
 )
-from cosmos_framework.utils import log
-from cosmos_framework.tools.visualize.video import save_img_or_video
-from cosmos_framework.configs.base.defaults.compile import CompileConfig
-from cosmos_framework.configs.base.defaults.parallelism import ParallelismConfig
 from cosmos_framework.model.vfm.omni_mot_model import OmniMoTModel
-from cosmos_framework.model.vfm.vlm.qwen3_vl.utils import _SYSTEM_PROMPT_IMAGE_EDITING
 from cosmos_framework.model.vfm.upsampler.prompts import is_upsampled_prompt
+from cosmos_framework.model.vfm.vlm.qwen3_vl.utils import _SYSTEM_PROMPT_IMAGE_EDITING
+from cosmos_framework.tools.visualize.video import save_img_or_video
+from cosmos_framework.utils import log
 
 if TYPE_CHECKING:
     from cosmos_framework.configs.base.defaults.model_config import OmniMoTModelConfig
@@ -464,14 +465,33 @@ def _get_prompt_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel, *,
 
 
 def _get_reasoner_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel) -> dict[str, Any]:
-    """Sample batch for reasoner text generation: prompt + optional conditioning image."""
+    """Sample batch for reasoner text generation: prompt + optional conditioning image or video."""
     image: Image.Image | None = None
+    video: str | None = None
     if sample_args.vision_path is not None:
-        image = Image.open(sample_args.vision_path).convert("RGB")
-    return {
+        if Path(sample_args.vision_path).suffix.lower() in VIDEO_EXTENSIONS:
+            video = str(sample_args.vision_path)
+        else:
+            image = Image.open(sample_args.vision_path).convert("RGB")
+    out: dict[str, Any] = {
         model.input_caption_key: [sample_args.prompt],
         "reasoner_images": [image],
     }
+    if video is not None:
+        out["reasoner_videos"] = [video]
+        out["video_sampling_kwargs"] = {
+            k: v
+            for k, v in {
+                "fps": sample_args.video_fps,
+                "num_frames": sample_args.video_num_frames,
+                "min_frames": sample_args.video_min_frames,
+                "max_frames": sample_args.video_max_frames,
+                "min_pixels": sample_args.video_min_pixels,
+                "max_pixels": sample_args.video_max_pixels,
+            }.items()
+            if v is not None
+        }
+    return out
 
 
 def _get_image_edit_sample_data(
@@ -1655,13 +1675,29 @@ def _generate_reasoner_batch(
 
         prompts: list[str] = data_batch[self.model.input_caption_key]
         raw_images: list[Image.Image | None] = data_batch["reasoner_images"]
-        n_set = sum(img is not None for img in raw_images)
-        if 0 < n_set < len(raw_images):
+        raw_videos: list[str | None] | None = data_batch.get("reasoner_videos")
+        video_sampling_kwargs: dict[str, Any] = data_batch.get("video_sampling_kwargs", {})
+
+        n_img = sum(img is not None for img in raw_images)
+        n_vid = sum(v is not None for v in (raw_videos or []))
+        if n_img and n_vid:
+            raise ValueError(
+                "Reasoner batch mixes image- and video-conditioned samples. Split into separate batches."
+            )
+        if 0 < n_img < len(raw_images):
             raise ValueError(
                 "Reasoner batch mixes image-conditioned and text-only samples "
-                f"({n_set}/{len(raw_images)} have vision_path). Split into separate batches."
+                f"({n_img}/{len(raw_images)} have an image vision_path). Split into separate batches."
             )
-        images: list[Image.Image] | None = cast(list[Image.Image], raw_images) if n_set == len(raw_images) else None
+        if raw_videos is not None and 0 < n_vid < len(raw_videos):
+            raise ValueError(
+                "Reasoner batch mixes video-conditioned and text-only samples "
+                f"({n_vid}/{len(raw_videos)} have a video vision_path). Split into separate batches."
+            )
+        images: list[Image.Image] | None = cast(list[Image.Image], raw_images) if n_img == len(raw_images) else None
+        videos: list[str] | None = (
+            cast(list[str], raw_videos) if raw_videos is not None and n_vid == len(raw_videos) else None
+        )
 
         try:
             with sync_distributed_errors():
@@ -1686,6 +1722,8 @@ def _generate_reasoner_batch(
                 prompts,
                 max_new_tokens=sample_args_list[0].max_new_tokens,
                 images=images,
+                videos=videos,
+                video_sampling_kwargs=video_sampling_kwargs or None,
                 do_sample=sample_args_list[0].do_sample,
                 temperature=sample_args_list[0].temperature,
                 top_k=sample_args_list[0].top_k,
diff --git a/cosmos_framework/inference/inference_test.py b/cosmos_framework/inference/inference_test.py
index d1f501b..4561fb1 100644
--- a/cosmos_framework/inference/inference_test.py
+++ b/cosmos_framework/inference/inference_test.py
@@ -349,3 +349,49 @@ def test_reasoner_defaults_validate_against_overrides() -> None:
     filtered = {k: v for k, v in defaults.items() if k in OmniSampleOverrides.model_fields}
     assert set(defaults) - set(filtered) == set(), f"defaults has unknown fields: {set(defaults) - set(filtered)}"
     OmniSampleOverrides.model_validate(filtered)
+
+
+# ---------------------------------------------------------------------------
+# _get_reasoner_sample_data: image / video / text-only routing
+# ---------------------------------------------------------------------------
+
+
+def _fake_sa(vision_path: Any, **video_kw: Any) -> SimpleNamespace:
+    base: dict[str, Any] = dict(
+        prompt="describe",
+        vision_path=vision_path,
+        video_fps=None,
+        video_num_frames=None,
+        video_min_frames=None,
+        video_max_frames=None,
+        video_min_pixels=None,
+        video_max_pixels=None,
+    )
+    base.update(video_kw)
+    return SimpleNamespace(**base)
+
+
+_fake_model = SimpleNamespace(input_caption_key="caption")
+
+
+@pytest.mark.L0
+def test_reasoner_sample_data_text_only() -> None:
+    from cosmos_framework.inference.inference import _get_reasoner_sample_data
+
+    out = _get_reasoner_sample_data(_fake_sa(None), _fake_model)
+    assert out["caption"] == ["describe"]
+    assert out["reasoner_images"] == [None]
+    assert "reasoner_videos" not in out
+
+
+@pytest.mark.L0
+def test_reasoner_sample_data_video_routes_to_videos(tmp_path: Path) -> None:
+    from cosmos_framework.inference.inference import _get_reasoner_sample_data
+
+    clip = tmp_path / "clip.mp4"
+    clip.write_bytes(b"\x00")  # not decoded by the builder
+    out = _get_reasoner_sample_data(_fake_sa(str(clip), video_fps=2), _fake_model)
+    assert out["caption"] == ["describe"]
+    assert out["reasoner_videos"] == [str(clip)]
+    assert out["reasoner_images"] == [None]
+    assert out["video_sampling_kwargs"] == {"fps": 2}

From 769f46558bdd62e097dbf158b4c4d239063f204c Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 01:42:23 -0700
Subject: [PATCH 13/20] docs(reasoner): document video input + add
 reasoner_video example

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 docs/inference.md                   | 11 +++++++++++
 inputs/reasoner/reasoner_video.json |  5 +++++
 2 files changed, 16 insertions(+)
 create mode 100644 inputs/reasoner/reasoner_video.json

diff --git a/docs/inference.md b/docs/inference.md
index 1580147..179fd16 100644
--- a/docs/inference.md
+++ b/docs/inference.md
@@ -196,6 +196,17 @@ Generation arguments:
 
 Outputs `vision.jpg` or `vision.mp4` depending on `num_frames`.
 
+### Reasoner
+
+For `model_mode=reasoner`, `vision_path` may point to an **image** (`.jpg`/`.png`/…) or a **video** (`.mp4`). A video is decoded by the Qwen3-VL processor and sampled into frames. Optional frame-sampling controls (all default to the processor's defaults):
+
+- `video_fps`: frames sampled per second (mutually exclusive with `video_num_frames`).
+- `video_num_frames`: fixed number of frames to sample.
+- `video_min_frames` / `video_max_frames`: bounds on the sampled frame count.
+- `video_min_pixels` / `video_max_pixels`: per-frame pixel budget (drives resolution).
+
+Example: [`inputs/reasoner/reasoner_video.json`](../inputs/reasoner/reasoner_video.json).
+
 ### Action
 
 Common arguments:
diff --git a/inputs/reasoner/reasoner_video.json b/inputs/reasoner/reasoner_video.json
new file mode 100644
index 0000000..1c68308
--- /dev/null
+++ b/inputs/reasoner/reasoner_video.json
@@ -0,0 +1,5 @@
+{
+    "model_mode": "reasoner",
+    "prompt": "Describe what is happening in this video in one sentence.",
+    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/2b17a2413bd86b2cf9b03823637108851e4ddf2d/inputs/vision/robot_pouring.mp4"
+}

From 663112ec35d71ee34a7805b776b234ac3ba1ec35 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 01:47:22 -0700
Subject: [PATCH 14/20] docs(reasoner): clarify vision_path comment covers
 video too

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cosmos_framework/inference/args.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cosmos_framework/inference/args.py b/cosmos_framework/inference/args.py
index 29b9fe5..0c12a16 100644
--- a/cosmos_framework/inference/args.py
+++ b/cosmos_framework/inference/args.py
@@ -454,7 +454,7 @@ def _build_vision_data(self, model_config: "OmniMoTModelConfig", sample_meta: Sa
         if self.vision_path and "://" in self.vision_path:
             raise ValueError("Must call `download()` before building vision data")
 
-        # Reasoner mode treats ``vision_path`` as a PIL image source; resolution/fps/num_frames are unused.
+        # Reasoner mode treats ``vision_path`` as an image (PIL) or video (mp4) source; resolution/fps/num_frames are unused.
         if sample_meta.model_mode.is_reasoner:
             self.condition_frame_indexes_vision = self.condition_frame_indexes_vision or []
             self.condition_video_keep = self.condition_video_keep or "first"

From 1d957bb094c372198c9504126ea3ced7109fa82d Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 02:36:22 -0700
Subject: [PATCH 15/20] fix(reasoner): decode video frames for
 Qwen3VLProcessor; reduce knobs to video_fps

---
 cosmos_framework/inference/args.py            | 23 +---------
 cosmos_framework/inference/args_test.py       | 17 ++------
 .../defaults/reasoner/sample_args.json        |  7 +---
 cosmos_framework/inference/inference.py       | 42 ++++++++++---------
 cosmos_framework/inference/inference_test.py  | 24 ++++-------
 cosmos_framework/model/vfm/omni_mot_model.py  | 21 ++++------
 docs/inference.md                             |  7 +---
 7 files changed, 46 insertions(+), 95 deletions(-)

diff --git a/cosmos_framework/inference/args.py b/cosmos_framework/inference/args.py
index 0c12a16..bee239c 100644
--- a/cosmos_framework/inference/args.py
+++ b/cosmos_framework/inference/args.py
@@ -610,11 +610,6 @@ class ReasonerDataArgs(ArgsBase):
     repetition_penalty: _ReasonerRepetitionPenalty | None = None
     presence_penalty: float | None = None
     video_fps: pydantic.PositiveFloat | None = None
-    video_num_frames: pydantic.PositiveInt | None = None
-    video_min_frames: pydantic.PositiveInt | None = None
-    video_max_frames: pydantic.PositiveInt | None = None
-    video_min_pixels: pydantic.PositiveInt | None = None
-    video_max_pixels: pydantic.PositiveInt | None = None
 
 
 class ReasonerDataOverrides(OverridesBase):
@@ -636,23 +631,7 @@ class ReasonerDataOverrides(OverridesBase):
     presence_penalty: float | None = None
     """Additive presence penalty (any sign). ``0.0`` is identity."""
     video_fps: pydantic.PositiveFloat | None = None
-    """Frames per second to sample from a video vision_path. Mutually exclusive with video_num_frames. None -> processor default."""
-    video_num_frames: pydantic.PositiveInt | None = None
-    """Fixed number of frames to sample from a video vision_path. Mutually exclusive with video_fps. None -> processor default."""
-    video_min_frames: pydantic.PositiveInt | None = None
-    """Lower bound on sampled frame count. None -> processor default."""
-    video_max_frames: pydantic.PositiveInt | None = None
-    """Upper bound on sampled frame count. None -> processor default."""
-    video_min_pixels: pydantic.PositiveInt | None = None
-    """Lower bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
-    video_max_pixels: pydantic.PositiveInt | None = None
-    """Upper bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
-
-    @pydantic.model_validator(mode="after")
-    def _validate_video_sampling(self) -> Self:
-        if self.video_fps is not None and self.video_num_frames is not None:
-            raise ValueError("video_fps and video_num_frames are mutually exclusive; set at most one.")
-        return self
+    """Frames per second to sample from a video vision_path. None -> decoder default (2.0)."""
 
     def _build_reasoner_data(self, model_config: "OmniMoTModelConfig", sample_meta: SampleMeta):
         if not sample_meta.model_mode.is_reasoner:
diff --git a/cosmos_framework/inference/args_test.py b/cosmos_framework/inference/args_test.py
index 982d36e..bd78439 100644
--- a/cosmos_framework/inference/args_test.py
+++ b/cosmos_framework/inference/args_test.py
@@ -159,20 +159,11 @@ def test_sample_args(tmp_path: Path):
     assert text2image_args.shift == 3.0
 
 
-def test_reasoner_video_fields_default_none():
+def test_reasoner_video_fps_defaults_none():
     ov = ReasonerDataOverrides()
     assert ov.video_fps is None
-    assert ov.video_num_frames is None
-    assert ov.video_min_frames is None
-    assert ov.video_max_frames is None
-    assert ov.video_min_pixels is None
-    assert ov.video_max_pixels is None
 
 
-def test_reasoner_video_fps_and_num_frames_mutually_exclusive():
-    with pytest.raises(ValueError, match="video_fps.*video_num_frames|mutually exclusive"):
-        ReasonerDataOverrides(video_fps=2, video_num_frames=16)
-
-
-def test_reasoner_video_fps_alone_ok():
-    ReasonerDataOverrides(video_fps=2)  # no raise
+def test_reasoner_video_fps_accepts_positive_float():
+    ov = ReasonerDataOverrides(video_fps=2.0)
+    assert ov.video_fps == 2.0
diff --git a/cosmos_framework/inference/defaults/reasoner/sample_args.json b/cosmos_framework/inference/defaults/reasoner/sample_args.json
index a3d7569..e7a25ad 100644
--- a/cosmos_framework/inference/defaults/reasoner/sample_args.json
+++ b/cosmos_framework/inference/defaults/reasoner/sample_args.json
@@ -7,10 +7,5 @@
     "top_p": null,
     "repetition_penalty": 1.0,
     "presence_penalty": 0.0,
-    "video_fps": null,
-    "video_num_frames": null,
-    "video_min_frames": null,
-    "video_max_frames": null,
-    "video_min_pixels": null,
-    "video_max_pixels": null
+    "video_fps": null
 }
diff --git a/cosmos_framework/inference/inference.py b/cosmos_framework/inference/inference.py
index f659919..bbd5224 100644
--- a/cosmos_framework/inference/inference.py
+++ b/cosmos_framework/inference/inference.py
@@ -21,6 +21,7 @@
 
 from cosmos_framework.configs.base.defaults.compile import CompileConfig
 from cosmos_framework.configs.base.defaults.parallelism import ParallelismConfig
+from cosmos_framework.data.vfm.vlm.video_decoder_qwen import _video_decoder_qwen_func
 from cosmos_framework.inference.args import (
     ModelMode,
     NegativeMetadataMode,
@@ -54,6 +55,7 @@
 from cosmos_framework.model.vfm.vlm.qwen3_vl.utils import _SYSTEM_PROMPT_IMAGE_EDITING
 from cosmos_framework.tools.visualize.video import save_img_or_video
 from cosmos_framework.utils import log
+from cosmos_framework.utils.vfm.video_preprocess import tensor_to_pil_images
 
 if TYPE_CHECKING:
     from cosmos_framework.configs.base.defaults.model_config import OmniMoTModelConfig
@@ -464,13 +466,29 @@ def _get_prompt_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel, *,
     return out
 
 
+def _decode_reasoner_video(vision_path: str, video_fps: float | None) -> dict[str, Any]:
+    """Decode a local video file into the frame-list payload the Qwen3-VL processor expects.
+
+    Returns ``{"frames": [PIL.Image, ...], "fps": float}``. Reuses the dataloader's
+    decode path (``_video_decoder_qwen_func`` + ``tensor_to_pil_images``)."""
+    with open(vision_path, "rb") as f:
+        video_bytes = f.read()
+    decode_kwargs: dict[str, Any] = {}
+    if video_fps is not None:
+        decode_kwargs["target_fps"] = video_fps
+    result = _video_decoder_qwen_func(key="video.mp4", data=video_bytes, **decode_kwargs)
+    if result is None:
+        raise ValueError(f"Failed to decode reasoner video: {vision_path}")
+    return {"frames": tensor_to_pil_images(result["videos"]), "fps": result["fps"]}
+
+
 def _get_reasoner_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel) -> dict[str, Any]:
     """Sample batch for reasoner text generation: prompt + optional conditioning image or video."""
     image: Image.Image | None = None
-    video: str | None = None
+    video: dict[str, Any] | None = None
     if sample_args.vision_path is not None:
         if Path(sample_args.vision_path).suffix.lower() in VIDEO_EXTENSIONS:
-            video = str(sample_args.vision_path)
+            video = _decode_reasoner_video(str(sample_args.vision_path), sample_args.video_fps)
         else:
             image = Image.open(sample_args.vision_path).convert("RGB")
     out: dict[str, Any] = {
@@ -479,18 +497,6 @@ def _get_reasoner_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel)
     }
     if video is not None:
         out["reasoner_videos"] = [video]
-        out["video_sampling_kwargs"] = {
-            k: v
-            for k, v in {
-                "fps": sample_args.video_fps,
-                "num_frames": sample_args.video_num_frames,
-                "min_frames": sample_args.video_min_frames,
-                "max_frames": sample_args.video_max_frames,
-                "min_pixels": sample_args.video_min_pixels,
-                "max_pixels": sample_args.video_max_pixels,
-            }.items()
-            if v is not None
-        }
     return out
 
 
@@ -1675,8 +1681,7 @@ def _generate_reasoner_batch(
 
         prompts: list[str] = data_batch[self.model.input_caption_key]
         raw_images: list[Image.Image | None] = data_batch["reasoner_images"]
-        raw_videos: list[str | None] | None = data_batch.get("reasoner_videos")
-        video_sampling_kwargs: dict[str, Any] = data_batch.get("video_sampling_kwargs", {})
+        raw_videos: list[dict[str, Any] | None] | None = data_batch.get("reasoner_videos")
 
         n_img = sum(img is not None for img in raw_images)
         n_vid = sum(v is not None for v in (raw_videos or []))
@@ -1695,8 +1700,8 @@ def _generate_reasoner_batch(
                 f"({n_vid}/{len(raw_videos)} have a video vision_path). Split into separate batches."
             )
         images: list[Image.Image] | None = cast(list[Image.Image], raw_images) if n_img == len(raw_images) else None
-        videos: list[str] | None = (
-            cast(list[str], raw_videos) if raw_videos is not None and n_vid == len(raw_videos) else None
+        videos: list[dict[str, Any]] | None = (
+            cast(list[dict[str, Any]], raw_videos) if raw_videos is not None and n_vid == len(raw_videos) else None
         )
 
         try:
@@ -1723,7 +1728,6 @@ def _generate_reasoner_batch(
                 max_new_tokens=sample_args_list[0].max_new_tokens,
                 images=images,
                 videos=videos,
-                video_sampling_kwargs=video_sampling_kwargs or None,
                 do_sample=sample_args_list[0].do_sample,
                 temperature=sample_args_list[0].temperature,
                 top_k=sample_args_list[0].top_k,
diff --git a/cosmos_framework/inference/inference_test.py b/cosmos_framework/inference/inference_test.py
index 4561fb1..8e77195 100644
--- a/cosmos_framework/inference/inference_test.py
+++ b/cosmos_framework/inference/inference_test.py
@@ -357,16 +357,7 @@ def test_reasoner_defaults_validate_against_overrides() -> None:
 
 
 def _fake_sa(vision_path: Any, **video_kw: Any) -> SimpleNamespace:
-    base: dict[str, Any] = dict(
-        prompt="describe",
-        vision_path=vision_path,
-        video_fps=None,
-        video_num_frames=None,
-        video_min_frames=None,
-        video_max_frames=None,
-        video_min_pixels=None,
-        video_max_pixels=None,
-    )
+    base: dict[str, Any] = dict(prompt="describe", vision_path=vision_path, video_fps=None)
     base.update(video_kw)
     return SimpleNamespace(**base)
 
@@ -385,13 +376,14 @@ def test_reasoner_sample_data_text_only() -> None:
 
 
 @pytest.mark.L0
-def test_reasoner_sample_data_video_routes_to_videos(tmp_path: Path) -> None:
+def test_reasoner_sample_data_video_routes_to_videos(monkeypatch: pytest.MonkeyPatch) -> None:
+    import cosmos_framework.inference.inference as inf
     from cosmos_framework.inference.inference import _get_reasoner_sample_data
 
-    clip = tmp_path / "clip.mp4"
-    clip.write_bytes(b"\x00")  # not decoded by the builder
-    out = _get_reasoner_sample_data(_fake_sa(str(clip), video_fps=2), _fake_model)
+    sentinel = {"frames": ["F0", "F1"], "fps": 2.0}
+    monkeypatch.setattr(inf, "_decode_reasoner_video", lambda path, fps: sentinel)
+    out = _get_reasoner_sample_data(_fake_sa("/tmp/clip.mp4", video_fps=2.0), _fake_model)
     assert out["caption"] == ["describe"]
-    assert out["reasoner_videos"] == [str(clip)]
+    assert out["reasoner_videos"] == [sentinel]
     assert out["reasoner_images"] == [None]
-    assert out["video_sampling_kwargs"] == {"fps": 2}
+    assert "video_sampling_kwargs" not in out
diff --git a/cosmos_framework/model/vfm/omni_mot_model.py b/cosmos_framework/model/vfm/omni_mot_model.py
index d40e072..429cbfc 100644
--- a/cosmos_framework/model/vfm/omni_mot_model.py
+++ b/cosmos_framework/model/vfm/omni_mot_model.py
@@ -3764,7 +3764,6 @@ def generate_reasoner_text(
         *,
         images: list[Any] | None = None,
         videos: list[Any] | None = None,
-        video_sampling_kwargs: dict[str, Any] | None = None,
         prompt_builder: Callable[[str], list[dict[str, Any]]] | None = None,
         do_sample: bool = False,
         temperature: float | None = 1.0,
@@ -3840,13 +3839,13 @@ def generate_reasoner_text(
                 accepts works (file path ``str``, ``PIL.Image.Image``,
                 ``np.ndarray``, or a CHW / HWC tensor).
             videos: Optional per-prompt conditioning videos (mutually
-                exclusive with ``images``). Each entry is forwarded into a
-                ``{"type": "video", "video": ...}`` chat block; the
-                processor decodes/samples frames and produces
+                exclusive with ``images``). Each entry must be a
+                ``{"frames": [...PIL...], "fps": float}`` payload
+                (pre-decoded by the caller, e.g. via
+                ``_decode_reasoner_video``). The frames list and fps are
+                forwarded into the ``{"type": "video", "video": frames,
+                "fps": fps}`` chat block so the processor produces
                 ``pixel_values_videos`` / ``video_grid_thw``.
-            video_sampling_kwargs: Optional dict of non-None frame-sampling
-                controls (fps, num_frames, min_frames, max_frames,
-                min_pixels, max_pixels) forwarded to the processor.
             prompt_builder: Optional callback that maps a raw prompt
                 string to a chat-style messages list (e.g.
                 :func:`projects.cosmos3.vfm.upsampler.prompts.build_messages`
@@ -3941,8 +3940,6 @@ def generate_reasoner_text(
                     f"{type(self.vlm_processor).__name__!r} does not implement "
                     "apply_chat_template — the live VLM is configured as text-only."
                 )
-        video_kwargs = {k: v for k, v in (video_sampling_kwargs or {}).items() if v is not None}
-
         # Resolve EOS / pad ids internally so callers don't have to know
         # about VLM-specific id wiring.  EOS comes from the cached VLM
         # special-tokens dict (set in ``set_up_tokenizers``); pad mirrors
@@ -3983,7 +3980,7 @@ def generate_reasoner_text(
                 last_user = messages[-1]
                 last_text = last_user["content"] if isinstance(last_user.get("content"), str) else ""
                 if use_video:
-                    media_item: dict[str, Any] = {"type": "video", "video": media[idx]}
+                    media_item: dict[str, Any] = {"type": "video", "video": media[idx]["frames"], "fps": media[idx]["fps"]}
                 else:
                     media_item = {"type": "image", "image": media[idx]}
                 multimodal_messages = list(messages[:-1])
@@ -3993,15 +3990,11 @@ def generate_reasoner_text(
                         "content": [media_item, {"type": "text", "text": last_text}],
                     }
                 )
-                # video_kwargs (fps/num_frames/min_frames/max_frames/min_pixels/max_pixels)
-                # are forwarded to the processor here. The exact kwarg surface depends on the
-                # installed transformers Qwen3VLProcessor; verified on GPU.
                 processor_inputs = self.vlm_processor.apply_chat_template(
                     multimodal_messages,
                     tokenize=True,
                     add_generation_prompt=True,
                     return_tensors="pt",
-                    **(video_kwargs if use_video else {}),
                 )
                 inner_input_ids = processor_inputs["input_ids"].to(device).unsqueeze(0)
                 inner_attention_mask = processor_inputs["attention_mask"].to(device).unsqueeze(0)
diff --git a/docs/inference.md b/docs/inference.md
index 179fd16..ae04065 100644
--- a/docs/inference.md
+++ b/docs/inference.md
@@ -198,12 +198,9 @@ Outputs `vision.jpg` or `vision.mp4` depending on `num_frames`.
 
 ### Reasoner
 
-For `model_mode=reasoner`, `vision_path` may point to an **image** (`.jpg`/`.png`/…) or a **video** (`.mp4`). A video is decoded by the Qwen3-VL processor and sampled into frames. Optional frame-sampling controls (all default to the processor's defaults):
+For `model_mode=reasoner`, `vision_path` may point to an **image** (`.jpg`/`.png`/…) or a **video** (`.mp4`). A video is decoded and sampled into frames by the inference engine, and the resulting frame list is then passed to the Qwen3-VL processor.
 
-- `video_fps`: frames sampled per second (mutually exclusive with `video_num_frames`).
-- `video_num_frames`: fixed number of frames to sample.
-- `video_min_frames` / `video_max_frames`: bounds on the sampled frame count.
-- `video_min_pixels` / `video_max_pixels`: per-frame pixel budget (drives resolution).
+- `video_fps`: frames per second to sample from the video (default: the decoder's default of 2.0).
 
 Example: [`inputs/reasoner/reasoner_video.json`](../inputs/reasoner/reasoner_video.json).
 

From e9aa7f50346136bc82cd5b2915dd1ffad11cdf72 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 02:49:00 -0700
Subject: [PATCH 16/20] fix(reasoner): decode video via torchvision.io +
 smart_nframes (drop decord/pkl_to_media dep)

The repo Qwen3VLProcessor runs do_sample_frames=False and expects a pre-decoded
frame list; decode with the inference-canonical torchvision.io.read_video (no
undeclared decord dep) and sample toward video_fps via Qwen smart_nframes.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cosmos_framework/inference/inference.py | 32 +++++++++++++++----------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/cosmos_framework/inference/inference.py b/cosmos_framework/inference/inference.py
index bbd5224..4554571 100644
--- a/cosmos_framework/inference/inference.py
+++ b/cosmos_framework/inference/inference.py
@@ -14,14 +14,15 @@
 import cattrs.preconf.json
 import safetensors.torch
 import torch
+import torchvision.io
 from PIL import Image
+from qwen_vl_utils.vision_process import smart_nframes
 from torch.utils._pytree import tree_map_only
 from torch.utils.data import Dataset
 from typing_extensions import Self
 
 from cosmos_framework.configs.base.defaults.compile import CompileConfig
 from cosmos_framework.configs.base.defaults.parallelism import ParallelismConfig
-from cosmos_framework.data.vfm.vlm.video_decoder_qwen import _video_decoder_qwen_func
 from cosmos_framework.inference.args import (
     ModelMode,
     NegativeMetadataMode,
@@ -55,7 +56,6 @@
 from cosmos_framework.model.vfm.vlm.qwen3_vl.utils import _SYSTEM_PROMPT_IMAGE_EDITING
 from cosmos_framework.tools.visualize.video import save_img_or_video
 from cosmos_framework.utils import log
-from cosmos_framework.utils.vfm.video_preprocess import tensor_to_pil_images
 
 if TYPE_CHECKING:
     from cosmos_framework.configs.base.defaults.model_config import OmniMoTModelConfig
@@ -469,17 +469,23 @@ def _get_prompt_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel, *,
 def _decode_reasoner_video(vision_path: str, video_fps: float | None) -> dict[str, Any]:
     """Decode a local video file into the frame-list payload the Qwen3-VL processor expects.
 
-    Returns ``{"frames": [PIL.Image, ...], "fps": float}``. Reuses the dataloader's
-    decode path (``_video_decoder_qwen_func`` + ``tensor_to_pil_images``)."""
-    with open(vision_path, "rb") as f:
-        video_bytes = f.read()
-    decode_kwargs: dict[str, Any] = {}
-    if video_fps is not None:
-        decode_kwargs["target_fps"] = video_fps
-    result = _video_decoder_qwen_func(key="video.mp4", data=video_bytes, **decode_kwargs)
-    if result is None:
-        raise ValueError(f"Failed to decode reasoner video: {vision_path}")
-    return {"frames": tensor_to_pil_images(result["videos"]), "fps": result["fps"]}
+    Returns ``{"frames": [PIL.Image, ...], "fps": float}``. Uses the same
+    ``torchvision.io.read_video`` decode the rest of the inference path relies on
+    (no ``decord`` dependency), then uniformly samples frames toward ``video_fps``
+    (default 2.0) via Qwen's ``smart_nframes``. The repo ``Qwen3VLProcessor`` runs
+    with ``do_sample_frames=False``, so it consumes this pre-sampled frame list
+    as-is and handles its own per-frame resize."""
+    frames, _, info = torchvision.io.read_video(str(vision_path), pts_unit="sec")  # [T,H,W,C] uint8
+    total_frames = int(frames.shape[0])
+    if total_frames == 0:
+        raise ValueError(f"Decoded zero frames from reasoner video: {vision_path}")
+    src_fps = float(info.get("video_fps") or 0.0) or 1.0
+    target_fps = video_fps if video_fps is not None else 2.0
+    nframes = smart_nframes({"fps": target_fps}, total_frames=total_frames, video_fps=src_fps)
+    idx = torch.linspace(0, total_frames - 1, nframes).round().long().tolist()
+    pil_frames = [Image.fromarray(frames[i].numpy()) for i in idx]
+    sample_fps = nframes / total_frames * src_fps
+    return {"frames": pil_frames, "fps": sample_fps}
 
 
 def _get_reasoner_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel) -> dict[str, Any]:

From 19bd71695cae16c831069e1bc7be9565826fc656 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 02:56:54 -0700
Subject: [PATCH 17/20] fix(reasoner): emit reasoner_videos uniformly
 per-sample; mark design/plan superseded

Always emit reasoner_videos=[video_or_None] (like reasoner_images) so the batch
homogeneity check aligns positionally and reliably rejects an image/video/text mix.
Add superseded banners to the spec/plan docs (frame-decode + video_fps-only is final).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cosmos_framework/inference/inference.py               |  9 +++++----
 cosmos_framework/inference/inference_test.py          |  2 +-
 .../plans/2026-06-08-video-reasoner-input.md          |  8 ++++++++
 .../specs/2026-06-07-video-reasoner-input-design.md   | 11 ++++++++++-
 4 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/cosmos_framework/inference/inference.py b/cosmos_framework/inference/inference.py
index 4554571..3078e4d 100644
--- a/cosmos_framework/inference/inference.py
+++ b/cosmos_framework/inference/inference.py
@@ -497,13 +497,14 @@ def _get_reasoner_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel)
             video = _decode_reasoner_video(str(sample_args.vision_path), sample_args.video_fps)
         else:
             image = Image.open(sample_args.vision_path).convert("RGB")
-    out: dict[str, Any] = {
+    # Both keys are emitted for every sample (``None`` when absent) so the batch
+    # builder can positionally align them and the three-way homogeneity check in
+    # ``_generate_reasoner_batch`` reliably detects an image/video/text mix.
+    return {
         model.input_caption_key: [sample_args.prompt],
         "reasoner_images": [image],
+        "reasoner_videos": [video],
     }
-    if video is not None:
-        out["reasoner_videos"] = [video]
-    return out
 
 
 def _get_image_edit_sample_data(
diff --git a/cosmos_framework/inference/inference_test.py b/cosmos_framework/inference/inference_test.py
index 8e77195..58da0c0 100644
--- a/cosmos_framework/inference/inference_test.py
+++ b/cosmos_framework/inference/inference_test.py
@@ -372,7 +372,7 @@ def test_reasoner_sample_data_text_only() -> None:
     out = _get_reasoner_sample_data(_fake_sa(None), _fake_model)
     assert out["caption"] == ["describe"]
     assert out["reasoner_images"] == [None]
-    assert "reasoner_videos" not in out
+    assert out["reasoner_videos"] == [None]
 
 
 @pytest.mark.L0
diff --git a/docs/superpowers/plans/2026-06-08-video-reasoner-input.md b/docs/superpowers/plans/2026-06-08-video-reasoner-input.md
index a0edd25..caef03a 100644
--- a/docs/superpowers/plans/2026-06-08-video-reasoner-input.md
+++ b/docs/superpowers/plans/2026-06-08-video-reasoner-input.md
@@ -1,5 +1,13 @@
 # Video input for `reasoner` model-mode — Implementation Plan
 
+> **⚠️ PARTIALLY SUPERSEDED (2026-06-08).** Tasks 1/6/7 below assumed the processor
+> decodes the mp4 path and accepts `video_*` sampling kwargs. Review showed the repo
+> `Qwen3VLProcessor` instead needs a **pre-decoded PIL frame list** (`do_sample_frames=False`)
+> and drops forwarded kwargs. The shipped code decodes frames itself
+> (`torchvision.io.read_video` + Qwen `smart_nframes`) and exposes **only `video_fps`**.
+> The other five `video_*` knobs and the processor-side-sampling approach were dropped.
+> The code is the source of truth.
+
 > **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
 
 **Goal:** Let `model_mode=reasoner` in `python -m cosmos_framework.scripts.inference` accept a local mp4 video as conditioning input (Cosmos3-Nano and Cosmos3-Super), producing text that reasons over the clip.
diff --git a/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md b/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md
index 60b82da..7ddbca8 100644
--- a/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md
+++ b/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md
@@ -1,8 +1,17 @@
 # Video input for the `reasoner` model-mode of inference — design
 
+> **⚠️ SUPERSEDED (2026-06-08) — historical record.** Review found the inference
+> processor is the repo `Qwen3VLProcessor` wrapper, which runs
+> `do_sample_frames=False` and expects a **pre-decoded PIL frame list** (it drops
+> forwarded kwargs) — not a path it samples itself. The shipped implementation
+> therefore decodes frames via `torchvision.io.read_video` + Qwen `smart_nframes`
+> and exposes **only `video_fps`** (the `num_frames`/`min_frames`/`max_frames`/
+> `min_pixels`/`max_pixels` knobs described below were dropped). Treat the sections
+> below as the original design intent; the code is the source of truth.
+
 **Date:** 2026-06-07
 **Branch:** `maoshengl/video_reasoner_inference`
-**Status:** approved design, ready for implementation plan
+**Status:** superseded by implementation (see banner above)
 
 ## Goal
 

From 92e3491038f252aea8c681c9ab59eb6bbb078b24 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 03:38:58 -0700
Subject: [PATCH 18/20] chore(reasoner): untrack video-reasoner spec/plan docs
 (remove from index; keep in working tree, untracked)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../plans/2026-06-08-video-reasoner-input.md  | 924 ------------------
 .../2026-06-07-video-reasoner-input-design.md | 266 -----
 2 files changed, 1190 deletions(-)
 delete mode 100644 docs/superpowers/plans/2026-06-08-video-reasoner-input.md
 delete mode 100644 docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md

diff --git a/docs/superpowers/plans/2026-06-08-video-reasoner-input.md b/docs/superpowers/plans/2026-06-08-video-reasoner-input.md
deleted file mode 100644
index caef03a..0000000
--- a/docs/superpowers/plans/2026-06-08-video-reasoner-input.md
+++ /dev/null
@@ -1,924 +0,0 @@
-# Video input for `reasoner` model-mode — Implementation Plan
-
-> **⚠️ PARTIALLY SUPERSEDED (2026-06-08).** Tasks 1/6/7 below assumed the processor
-> decodes the mp4 path and accepts `video_*` sampling kwargs. Review showed the repo
-> `Qwen3VLProcessor` instead needs a **pre-decoded PIL frame list** (`do_sample_frames=False`)
-> and drops forwarded kwargs. The shipped code decodes frames itself
-> (`torchvision.io.read_video` + Qwen `smart_nframes`) and exposes **only `video_fps`**.
-> The other five `video_*` knobs and the processor-side-sampling approach were dropped.
-> The code is the source of truth.
-
-> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
-
-**Goal:** Let `model_mode=reasoner` in `python -m cosmos_framework.scripts.inference` accept a local mp4 video as conditioning input (Cosmos3-Nano and Cosmos3-Super), producing text that reasons over the clip.
-
-**Architecture:** Additive "video lane" alongside the existing image lane through the reasoner wrapper stack. The vendored Qwen3-VL model + `video_processing_qwen3_vl.py` already implement video end to end (`get_video_features`, `get_rope_index(video_grid_thw=…)`, `get_placeholder_mask(video_features=…)`, `video_token_id`); only the Cosmos wrapper layers are hardcoded to images. We thread optional `pixel_values_videos` / `video_grid_thw` (and a high-level `videos` list + `video_*` sampling knobs) through five layers. A prompt carries either an image, a video, or neither — never both.
-
-**Tech Stack:** Python, PyTorch, pydantic, HuggingFace transformers (`Qwen3VLProcessor`), torchrun. Repo lives at `cosmos-framework/`; spec at `docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md`.
-
-**Verification policy (read before starting):** Per the spec, the **end-to-end video path is verified manually on GPU** (Task 9) — there is no automated GPU test, because it requires real checkpoints + multi-GPU. The two pure-Python/logic changes (args schema in Task 1, builder routing in Task 7) DO get real `pytest` unit tests (CPU-only, no checkpoints). Model-layer tasks (3–6) are verified by import/lint checks plus the Task 9 manual run.
-
-**How to run tests/commands:** Python/pytest must run inside the i4 container (`bob_echo_dev`). Use the `cosmos3-run-env` skill to author the wrapper shell and the `slurm-node` skill to execute. Where a step says `pytest …`, it means "run that inside the container."
-
----
-
-## File Structure
-
-| File | Responsibility | Change |
-| ---- | -------------- | ------ |
-| `cosmos_framework/inference/args.py` | Sample-arg schema | Add `video_*` reasoner fields + mutual-exclusion validation |
-| `cosmos_framework/inference/args_test.py` | Schema unit tests | Add tests for new fields/validation |
-| `cosmos_framework/inference/defaults/reasoner/sample_args.json` | Reasoner defaults | Add `video_*` keys (null) |
-| `cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py` | Multimodal prefill | `prepare_multimodal_reasoner_inputs`: add video branch |
-| `cosmos_framework/model/vfm/mot/unified_mot.py` | Reasoner decode | `_impl_generate_reasoner_text` + 3 wrapper `generate_reasoner_text`: add/forward video params + guards |
-| `cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py` | Network pass-through | `generate_reasoner_text`: forward video params |
-| `cosmos_framework/model/vfm/omni_mot_model.py` | High-level entry | `generate_reasoner_text`: `videos` param, video chat block, sampling kwargs |
-| `cosmos_framework/inference/inference.py` | Inference engine | `_get_reasoner_sample_data` route mp4; `_generate_reasoner_batch` homogeneity + video forward |
-| `cosmos_framework/inference/inference_test.py` | Builder unit test | Add routing test (CPU) |
-| `inputs/reasoner/reasoner_video.json` | Example input | New file |
-| `docs/inference.md` | User docs | Document video input + `video_*` fields |
-
-Implementation order: Task 1 (schema) → Task 2 (defaults) → Tasks 3–6 (model layers, bottom-up) → Task 7 (inference wiring) → Task 8 (docs/example) → Task 9 (manual GPU verification).
-
----
-
-## Task 1: Add `video_*` reasoner sample-arg fields + validation
-
-**Files:**
-- Modify: `cosmos_framework/inference/args.py` (class `ReasonerDataArgs` ~600-611, class `ReasonerDataOverrides` ~614-638)
-- Test: `cosmos_framework/inference/args_test.py`
-
-The new fields control how the Qwen3-VL processor samples frames from the mp4. They are `video_`-prefixed to avoid colliding with the existing output-oriented `fps`/`num_frames` fields. `video_fps` and `video_num_frames` are mutually exclusive (the processor itself raises if both are set).
-
-- [ ] **Step 1: Write the failing tests**
-
-Add to `cosmos_framework/inference/args_test.py` (match the existing test style/imports in that file; these construct a reasoner override and resolve it). If the file already has a helper to build an `OmniSampleOverrides`/model config, reuse it; otherwise mirror the nearest existing reasoner test.
-
-```python
-def test_reasoner_video_fields_default_none():
-    ov = ReasonerDataOverrides()
-    assert ov.video_fps is None
-    assert ov.video_num_frames is None
-    assert ov.video_min_frames is None
-    assert ov.video_max_frames is None
-    assert ov.video_min_pixels is None
-    assert ov.video_max_pixels is None
-
-
-def test_reasoner_video_fps_and_num_frames_mutually_exclusive():
-    import pytest
-    ov = ReasonerDataOverrides(video_fps=2, video_num_frames=16)
-    # _validate_video_sampling is called from _build_reasoner_data; call it directly
-    with pytest.raises(ValueError, match="video_fps.*video_num_frames|mutually exclusive"):
-        ov._validate_video_sampling()
-
-
-def test_reasoner_video_fps_alone_ok():
-    ov = ReasonerDataOverrides(video_fps=2)
-    ov._validate_video_sampling()  # no raise
-```
-
-Add the import for `ReasonerDataOverrides` to the test file's import block if not present:
-
-```python
-from cosmos_framework.inference.args import ReasonerDataOverrides
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-Run (inside container): `pytest cosmos_framework/inference/args_test.py -k reasoner_video -v`
-Expected: FAIL — `ReasonerDataOverrides` has no `video_fps` (AttributeError / unexpected-keyword), and no `_validate_video_sampling`.
-
-- [ ] **Step 3: Add the fields to `ReasonerDataArgs`**
-
-In `args.py`, append to class `ReasonerDataArgs` (after `presence_penalty: float | None = None`, ~line 611):
-
-```python
-    video_fps: float | None = None
-    video_num_frames: pydantic.PositiveInt | None = None
-    video_min_frames: pydantic.PositiveInt | None = None
-    video_max_frames: pydantic.PositiveInt | None = None
-    video_min_pixels: pydantic.PositiveInt | None = None
-    video_max_pixels: pydantic.PositiveInt | None = None
-```
-
-- [ ] **Step 4: Add the fields + validation to `ReasonerDataOverrides`**
-
-In `args.py`, append to class `ReasonerDataOverrides` (after `presence_penalty`, ~line 631, before `_build_reasoner_data`):
-
-```python
-    video_fps: float | None = None
-    """Frames per second to sample from a video vision_path. Mutually exclusive with video_num_frames. None -> processor default."""
-    video_num_frames: pydantic.PositiveInt | None = None
-    """Fixed number of frames to sample from a video vision_path. Mutually exclusive with video_fps. None -> processor default."""
-    video_min_frames: pydantic.PositiveInt | None = None
-    """Lower bound on sampled frame count. None -> processor default."""
-    video_max_frames: pydantic.PositiveInt | None = None
-    """Upper bound on sampled frame count. None -> processor default."""
-    video_min_pixels: pydantic.PositiveInt | None = None
-    """Lower bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
-    video_max_pixels: pydantic.PositiveInt | None = None
-    """Upper bound on per-frame pixel budget (drives smart_resize). None -> processor default."""
-
-    def _validate_video_sampling(self) -> None:
-        if self.video_fps is not None and self.video_num_frames is not None:
-            raise ValueError(
-                "video_fps and video_num_frames are mutually exclusive — set at most one."
-            )
-```
-
-Then call it from `_build_reasoner_data` so resolution-time validation fires. Replace the body of `_build_reasoner_data` (~lines 633-638) with:
-
-```python
-    def _build_reasoner_data(self, model_config: "OmniMoTModelConfig", sample_meta: SampleMeta):
-        if not sample_meta.model_mode.is_reasoner:
-            return
-        self = cast("SampleDataOverrides", self)
-        if not self.prompt.strip():
-            raise ValueError("Reasoner inference requires a non-empty 'prompt'.")
-        self._validate_video_sampling()
-```
-
-- [ ] **Step 5: Run tests to verify they pass**
-
-Run: `pytest cosmos_framework/inference/args_test.py -k reasoner_video -v`
-Expected: PASS (3 tests).
-
-- [ ] **Step 6: Lint + commit**
-
-```bash
-ruff check cosmos_framework/inference/args.py cosmos_framework/inference/args_test.py
-git add cosmos_framework/inference/args.py cosmos_framework/inference/args_test.py
-git commit -m "feat(reasoner): add video_* sampling fields + mutual-exclusion validation"
-```
-
----
-
-## Task 2: Add `video_*` defaults to the reasoner defaults file
-
-**Files:**
-- Modify: `cosmos_framework/inference/defaults/reasoner/sample_args.json`
-
-`None` defaults already live in the schema; adding explicit `null` keys here documents the knobs and keeps the defaults file self-describing.
-
-- [ ] **Step 1: Edit the JSON**
-
-Replace the file contents with:
-
-```json
-{
-    "model_mode": "reasoner",
-    "max_new_tokens": 64,
-    "do_sample": false,
-    "temperature": 1.0,
-    "top_k": null,
-    "top_p": null,
-    "repetition_penalty": 1.0,
-    "presence_penalty": 0.0,
-    "video_fps": null,
-    "video_num_frames": null,
-    "video_min_frames": null,
-    "video_max_frames": null,
-    "video_min_pixels": null,
-    "video_max_pixels": null
-}
-```
-
-- [ ] **Step 2: Verify it loads**
-
-Run (inside container):
-`python -c "from cosmos_framework.inference.args import _load_modality_defaults; print(_load_modality_defaults('reasoner'))"`
-Expected: prints the dict including the `video_*` keys; no exception.
-
-- [ ] **Step 3: Commit**
-
-```bash
-git add cosmos_framework/inference/defaults/reasoner/sample_args.json
-git commit -m "feat(reasoner): add video_* defaults (null) to reasoner sample_args"
-```
-
----
-
-## Task 3: Add a video branch to `prepare_multimodal_reasoner_inputs`
-
-**Files:**
-- Modify: `cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py:497-604`
-
-This is the one real seam. The image recipe (lines 577-604) is: `get_image_features` → `get_placeholder_mask(image_features=…)` → `masked_scatter(image_mask)` → `get_rope_index(image_grid_thw=…)`. The video recipe is identical but uses the video helpers — and `get_video_features` is literally "same implementation as for images" (`qwen3_vl.py:1243`), so we reuse the existing free `get_image_features` helper with the video tensors. `get_placeholder_mask` and `get_rope_index` already accept video arguments.
-
-- [ ] **Step 1: Add optional video params to the signature**
-
-Change the signature (lines 497-509) to add two params after `image_grid_thw`:
-
-```python
-def prepare_multimodal_reasoner_inputs(
-    causal_lm: Any,
-    input_ids: torch.Tensor,  # [B,T_prompt]
-    pixel_values: torch.Tensor | None = None,  # [N_patches,C,H,W]
-    image_grid_thw: torch.Tensor | None = None,  # [num_images,3]
-    pixel_values_videos: torch.Tensor | None = None,  # [N_patches,C,H,W]
-    video_grid_thw: torch.Tensor | None = None,  # [num_videos,3]
-    attention_mask: Optional[torch.Tensor] = None,
-) -> tuple[
-    torch.Tensor,  # inputs_embeds [B,T_prompt,hidden_size]
-    torch.Tensor,  # visual_pos_masks [B,T_prompt] bool
-    list[torch.Tensor],  # deepstack_visual_embeds (per deepstack layer)
-    torch.Tensor,  # position_ids
-    torch.Tensor,  # mrope_position_deltas
-]:
-```
-
-(Note: `pixel_values`/`image_grid_thw` are now defaulted to `None`; existing callers pass them positionally/by keyword so behavior is unchanged.)
-
-- [ ] **Step 2: Replace the body (lines 577-604) with image/video branching**
-
-```python
-    is_video = pixel_values_videos is not None
-    inputs_embeds = causal_lm.model.embed_tokens(input_ids).clone()  # [B,T_prompt,hidden_size]
-
-    if is_video:
-        pixel_values_videos = pixel_values_videos.to(device=inputs_embeds.device)
-        video_grid_thw = video_grid_thw.to(device=inputs_embeds.device)
-        # get_video_features == get_image_features (same visual tower); reuse the free helper.
-        video_embeds, deepstack_visual_embeds = get_image_features(causal_lm, pixel_values_videos, video_grid_thw)
-        video_embeds = torch.cat(video_embeds, dim=0).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
-        _image_mask, video_mask = get_placeholder_mask(
-            causal_lm,
-            input_ids,
-            inputs_embeds=inputs_embeds,
-            video_features=video_embeds,
-        )
-        inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)  # [B,T_prompt,hidden_size]
-        visual_pos_masks = video_mask[..., 0]  # [B,T_prompt]
-    else:
-        pixel_values = pixel_values.to(device=inputs_embeds.device)
-        image_grid_thw = image_grid_thw.to(device=inputs_embeds.device)
-        image_embeds, deepstack_visual_embeds = get_image_features(causal_lm, pixel_values, image_grid_thw)
-        image_embeds = torch.cat(image_embeds, dim=0).to(device=inputs_embeds.device, dtype=inputs_embeds.dtype)
-        image_mask, _video_mask = get_placeholder_mask(
-            causal_lm,
-            input_ids,
-            inputs_embeds=inputs_embeds,
-            image_features=image_embeds,
-        )
-        inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # [B,T_prompt,hidden_size]
-        visual_pos_masks = image_mask[..., 0]  # [B,T_prompt]
-
-    deepstack_visual_embeds = [
-        embed.to(device=inputs_embeds.device, dtype=inputs_embeds.dtype) for embed in deepstack_visual_embeds
-    ]
-
-    position_ids, mrope_position_deltas = get_rope_index(
-        causal_lm,
-        input_ids=input_ids,
-        image_grid_thw=None if is_video else image_grid_thw,
-        video_grid_thw=video_grid_thw if is_video else None,
-        attention_mask=attention_mask,
-    )
-
-    return inputs_embeds, visual_pos_masks, deepstack_visual_embeds, position_ids, mrope_position_deltas
-```
-
-- [ ] **Step 3: Update the docstring**
-
-In the docstring (lines 528-532), replace the sentence "Videos and dual image+video paths are not supported here; only `image_grid_thw` is consumed…" with:
-
-```
-    Either the image pair (``pixel_values`` + ``image_grid_thw``) or the
-    video pair (``pixel_values_videos`` + ``video_grid_thw``) is consumed —
-    not both. The video recipe mirrors the image recipe but routes through
-    the video placeholder mask and ``video_grid_thw`` rope index.
-```
-
-- [ ] **Step 4: Import/lint check (no GPU test — verified end-to-end in Task 9)**
-
-Run (inside container):
-`python -c "import cosmos_framework.model.vfm.vlm.qwen3_vl.utils"`
-Expected: no ImportError / SyntaxError.
-`ruff check cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py`
-
-- [ ] **Step 5: Commit**
-
-```bash
-git add cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py
-git commit -m "feat(reasoner): video branch in prepare_multimodal_reasoner_inputs"
-```
-
----
-
-## Task 4: Thread video params through `_impl_generate_reasoner_text`
-
-**Files:**
-- Modify: `cosmos_framework/model/vfm/mot/unified_mot.py:1490-1675`
-
-- [ ] **Step 1: Add params to the signature**
-
-In `_impl_generate_reasoner_text` (lines 1490-1508), add two params after `image_grid_thw` (line 1496):
-
-```python
-    pixel_values_videos: torch.Tensor | None = None,
-    video_grid_thw: torch.Tensor | None = None,
-```
-
-- [ ] **Step 2: Extend the validation guard**
-
-Replace the guard at lines 1644-1645:
-
-```python
-    if (pixel_values is None) != (image_grid_thw is None):
-        raise ValueError("pixel_values and image_grid_thw must be provided together.")
-```
-
-with:
-
-```python
-    if (pixel_values is None) != (image_grid_thw is None):
-        raise ValueError("pixel_values and image_grid_thw must be provided together.")
-    if (pixel_values_videos is None) != (video_grid_thw is None):
-        raise ValueError("pixel_values_videos and video_grid_thw must be provided together.")
-    if pixel_values is not None and pixel_values_videos is not None:
-        raise ValueError("Reasoner conditions on one medium at a time: pass image OR video, not both.")
-```
-
-- [ ] **Step 3: Route to the prefill helper for both media**
-
-Replace the prefill branch at lines 1650-1667:
-
-```python
-    if pixel_values is None:
-        hidden = model.reasoner_forward(input_ids, cache=cache)  # [B,T_prompt,hidden_size]
-    else:
-        if not hasattr(causal_lm, "visual"):
-            raise ValueError("Combined checkpoint does not include a visual module on the reasoner language model.")
-        (
-            inputs_embeds,
-            visual_pos_masks,
-            deepstack_visual_embeds,
-            position_ids,
-            mrope_position_deltas,
-        ) = prepare_multimodal_reasoner_inputs(
-            causal_lm,
-            input_ids=input_ids,
-            pixel_values=pixel_values,
-            image_grid_thw=image_grid_thw,
-            attention_mask=attention_mask,
-        )
-```
-
-with:
-
-```python
-    if pixel_values is None and pixel_values_videos is None:
-        hidden = model.reasoner_forward(input_ids, cache=cache)  # [B,T_prompt,hidden_size]
-    else:
-        if not hasattr(causal_lm, "visual"):
-            raise ValueError("Combined checkpoint does not include a visual module on the reasoner language model.")
-        (
-            inputs_embeds,
-            visual_pos_masks,
-            deepstack_visual_embeds,
-            position_ids,
-            mrope_position_deltas,
-        ) = prepare_multimodal_reasoner_inputs(
-            causal_lm,
-            input_ids=input_ids,
-            pixel_values=pixel_values,
-            image_grid_thw=image_grid_thw,
-            pixel_values_videos=pixel_values_videos,
-            video_grid_thw=video_grid_thw,
-            attention_mask=attention_mask,
-        )
-```
-
-- [ ] **Step 4: Update the docstring**
-
-In the `pixel_values` docstring (lines 1553-1556), replace "Videos are *not* supported here — this function has no `pixel_values_videos` / `video_grid_thw` parameters; for I2V conditioning, frames must be passed as images." with:
-
-```
-            For video conditioning, pass ``pixel_values_videos`` +
-            ``video_grid_thw`` instead (mutually exclusive with the image
-            pair).
-```
-
-- [ ] **Step 5: Import/lint check**
-
-Run (inside container):
-`python -c "import cosmos_framework.model.vfm.mot.unified_mot"`
-`ruff check cosmos_framework/model/vfm/mot/unified_mot.py`
-Expected: no errors.
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add cosmos_framework/model/vfm/mot/unified_mot.py
-git commit -m "feat(reasoner): accept video tensors in _impl_generate_reasoner_text"
-```
-
----
-
-## Task 5: Forward video params through the wrapper `generate_reasoner_text` pass-throughs
-
-**Files:**
-- Modify: `cosmos_framework/model/vfm/mot/unified_mot.py` — three wrappers at lines 1932 (`Qwen3VLTextForCausalLM`), 2060 (`Qwen3VLMoeTextForCausalLM`), 2184 (`Nemotron3DenseVLTextForCausalLM`)
-- Modify: `cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py:272-341`
-
-All four are pure pass-throughs to `_impl_generate_reasoner_text` (the three unified_mot wrappers) and to `self.language_model.generate_reasoner_text` (the network). Each needs the two new params added to its signature and forwarded.
-
-- [ ] **Step 1: Update the three unified_mot wrappers**
-
-For EACH of the three `generate_reasoner_text` methods (lines 1932, 2060, 2184): add after `image_grid_thw: torch.Tensor | None = None,` in the signature:
-
-```python
-        pixel_values_videos: torch.Tensor | None = None,
-        video_grid_thw: torch.Tensor | None = None,
-```
-
-and add to the `_impl_generate_reasoner_text(...)` call (after `image_grid_thw=image_grid_thw,`):
-
-```python
-            pixel_values_videos=pixel_values_videos,
-            video_grid_thw=video_grid_thw,
-```
-
-(The three methods are textually identical in this region; apply the same two-line additions to each.)
-
-- [ ] **Step 2: Update the network pass-through**
-
-In `cosmos3_vfm_network.py`, add to the `generate_reasoner_text` signature (after `image_grid_thw: torch.Tensor | None = None,`, ~line 278):
-
-```python
-        pixel_values_videos: torch.Tensor | None = None,
-        video_grid_thw: torch.Tensor | None = None,
-```
-
-and to the forwarded call (after `image_grid_thw=image_grid_thw,`, ~line 329):
-
-```python
-            pixel_values_videos=pixel_values_videos,
-            video_grid_thw=video_grid_thw,
-```
-
-- [ ] **Step 3: Import/lint check**
-
-Run (inside container):
-`python -c "import cosmos_framework.model.vfm.mot.unified_mot, cosmos_framework.model.vfm.mot.cosmos3_vfm_network"`
-`ruff check cosmos_framework/model/vfm/mot/unified_mot.py cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py`
-Expected: no errors.
-
-- [ ] **Step 4: Commit**
-
-```bash
-git add cosmos_framework/model/vfm/mot/unified_mot.py cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py
-git commit -m "feat(reasoner): forward video tensors through generate_reasoner_text pass-throughs"
-```
-
----
-
-## Task 6: Add `videos` + sampling kwargs to `OmniMoTModel.generate_reasoner_text`
-
-**Files:**
-- Modify: `cosmos_framework/model/vfm/omni_mot_model.py:3760-4007`
-
-This builds a `{"type":"video", ...}` chat block (parallel to the existing image block at lines 3959-4008), extracts `pixel_values_videos` / `video_grid_thw` from `apply_chat_template`, and passes them down.
-
-- [ ] **Step 1: Add params to the signature**
-
-In `generate_reasoner_text` (lines 3760-3774), add after `images: list[Any] | None = None,` (line 3765):
-
-```python
-        videos: list[Any] | None = None,
-        video_sampling_kwargs: dict[str, Any] | None = None,
-```
-
-- [ ] **Step 2: Validate not-both and set the multimodal flag**
-
-Replace the validation block at lines 3907-3922 (`use_multimodal = images is not None` … through the `apply_chat_template` RuntimeError) with:
-
-```python
-        if images is not None and videos is not None:
-            raise ValueError("generate_reasoner_text conditions on one medium at a time: pass `images` OR `videos`, not both.")
-        use_image = images is not None
-        use_video = videos is not None
-        use_multimodal = use_image or use_video
-        media = images if use_image else videos
-        if use_multimodal:
-            assert media is not None  # narrowed by `use_multimodal`
-            if len(media) != len(inputs):
-                raise ValueError(
-                    f"generate_reasoner_text: media length ({len(media)}) "
-                    f"must equal `inputs` length ({len(inputs)}) for the "
-                    "vision-conditioned flow."
-                )
-            if not callable(getattr(self.vlm_processor, "apply_chat_template", None)):
-                raise RuntimeError(
-                    "generate_reasoner_text(images=/videos=...) requires a multimodal "
-                    "VLM processor (e.g. Qwen3VLProcessor) but the live processor "
-                    f"{type(self.vlm_processor).__name__!r} does not implement "
-                    "apply_chat_template — the live VLM is configured as text-only."
-                )
-        video_kwargs = {k: v for k, v in (video_sampling_kwargs or {}).items() if v is not None}
-```
-
-- [ ] **Step 3: Build the image-or-video chat block and extract tensors**
-
-Replace the multimodal block construction at lines 3959-4008 (`if use_multimodal:` … through the `out_ids = self.net.generate_reasoner_text(...)` image call) with:
-
-```python
-            if use_multimodal:
-                assert media is not None  # narrowed by `use_multimodal`
-                # Replace the LAST user message's content with a Qwen3-VL
-                # multimodal block. Earlier messages (system, prior turns)
-                # are kept verbatim.
-                last_user = messages[-1]
-                last_text = last_user["content"] if isinstance(last_user.get("content"), str) else ""
-                if use_video:
-                    media_item: dict[str, Any] = {"type": "video", "video": media[idx]}
-                else:
-                    media_item = {"type": "image", "image": media[idx]}
-                multimodal_messages = list(messages[:-1])
-                multimodal_messages.append(
-                    {
-                        "role": "user",
-                        "content": [media_item, {"type": "text", "text": last_text}],
-                    }
-                )
-                # NOTE: `video_kwargs` (fps/num_frames/min_frames/max_frames/
-                # min_pixels/max_pixels) are forwarded to the processor here.
-                # The exact kwarg surface depends on the installed transformers
-                # Qwen3VLProcessor; if a key is rejected, route via the
-                # processor's video-loading kwargs. Verified manually in the
-                # plan's Task 9.
-                processor_inputs = self.vlm_processor.apply_chat_template(
-                    multimodal_messages,
-                    tokenize=True,
-                    add_generation_prompt=True,
-                    return_tensors="pt",
-                    **(video_kwargs if use_video else {}),
-                )
-                inner_input_ids = processor_inputs["input_ids"].to(device).unsqueeze(0)
-                inner_attention_mask = processor_inputs["attention_mask"].to(device).unsqueeze(0)
-                if use_video:
-                    inner_pixel_values_videos = processor_inputs["pixel_values_videos"].to(device)
-                    inner_video_grid_thw = processor_inputs["video_grid_thw"].to(device)
-                    out_ids = self.net.generate_reasoner_text(
-                        input_ids=inner_input_ids,
-                        max_new_tokens=max_new_tokens,
-                        pixel_values_videos=inner_pixel_values_videos,
-                        video_grid_thw=inner_video_grid_thw,
-                        attention_mask=inner_attention_mask,
-                        eos_token_id=eos_id,
-                        pad_token_id=pad_id,
-                        do_sample=do_sample,
-                        temperature=temperature if temperature is not None else 1.0,
-                        top_k=top_k,
-                        top_p=top_p,
-                        repetition_penalty=repetition_penalty,
-                        presence_penalty=presence_penalty,
-                        seed=seed,
-                        return_only_new_tokens=True,
-                    )
-                else:
-                    inner_pixel_values = processor_inputs["pixel_values"].to(device)  # [N_patches,C,H,W]
-                    inner_image_grid_thw = processor_inputs["image_grid_thw"].to(device)  # [num_images,3]
-                    out_ids = self.net.generate_reasoner_text(
-                        input_ids=inner_input_ids,
-                        max_new_tokens=max_new_tokens,
-                        pixel_values=inner_pixel_values,
-                        image_grid_thw=inner_image_grid_thw,
-                        attention_mask=inner_attention_mask,
-                        eos_token_id=eos_id,
-                        pad_token_id=pad_id,
-                        do_sample=do_sample,
-                        temperature=temperature if temperature is not None else 1.0,
-                        top_k=top_k,
-                        top_p=top_p,
-                        repetition_penalty=repetition_penalty,
-                        presence_penalty=presence_penalty,
-                        seed=seed,
-                        return_only_new_tokens=True,
-                    )
-```
-
-(The text-only `else:` branch at lines 4009+ is unchanged.)
-
-- [ ] **Step 4: Update the docstring**
-
-In the `images:` Args entry (~lines 3828-3837), add a sibling paragraph:
-
-```
-            videos: Optional per-prompt conditioning videos (mutually
-                exclusive with ``images``). Each entry is forwarded into a
-                ``{"type": "video", "video": ...}`` chat block; the
-                processor decodes/samples frames and produces
-                ``pixel_values_videos`` / ``video_grid_thw``.
-            video_sampling_kwargs: Optional dict of non-None frame-sampling
-                controls (fps, num_frames, min_frames, max_frames,
-                min_pixels, max_pixels) forwarded to the processor.
-```
-
-- [ ] **Step 5: Import/lint check**
-
-Run (inside container):
-`python -c "import cosmos_framework.model.vfm.omni_mot_model"`
-`ruff check cosmos_framework/model/vfm/omni_mot_model.py`
-Expected: no errors.
-
-- [ ] **Step 6: Commit**
-
-```bash
-git add cosmos_framework/model/vfm/omni_mot_model.py
-git commit -m "feat(reasoner): videos param + video chat block in OmniMoTModel.generate_reasoner_text"
-```
-
----
-
-## Task 7: Wire mp4 routing into the inference engine
-
-**Files:**
-- Modify: `cosmos_framework/inference/inference.py` — `_get_reasoner_sample_data:466-474`, `_generate_reasoner_batch:1644-1696`
-- Test: `cosmos_framework/inference/inference_test.py`
-
-The builder detects an mp4 `vision_path` by extension and returns it under a `reasoner_videos` key (path string, not decoded) plus the resolved `video_*` sampling kwargs; the batch method routes videos to `generate_reasoner_text(videos=…)`.
-
-- [ ] **Step 1: Write the failing routing test**
-
-Add to `cosmos_framework/inference/inference_test.py` (use `types.SimpleNamespace` to avoid constructing a full model/args; the builder only reads `vision_path`, `prompt`, and `video_*` off `sample_args`, and `input_caption_key` off `model`):
-
-```python
-import types
-from cosmos_framework.inference.inference import _get_reasoner_sample_data
-
-
-def _fake_sa(vision_path, **video_kw):
-    base = dict(
-        prompt="describe",
-        vision_path=vision_path,
-        video_fps=None, video_num_frames=None, video_min_frames=None,
-        video_max_frames=None, video_min_pixels=None, video_max_pixels=None,
-    )
-    base.update(video_kw)
-    return types.SimpleNamespace(**base)
-
-
-_fake_model = types.SimpleNamespace(input_caption_key="caption")
-
-
-def test_reasoner_sample_data_text_only():
-    out = _get_reasoner_sample_data(_fake_sa(None), _fake_model)
-    assert out["caption"] == ["describe"]
-    assert out["reasoner_images"] == [None]
-    assert "reasoner_videos" not in out
-
-
-def test_reasoner_sample_data_video_routes_to_videos(tmp_path):
-    clip = tmp_path / "clip.mp4"
-    clip.write_bytes(b"\x00")  # not decoded by the builder
-    out = _get_reasoner_sample_data(_fake_sa(str(clip), video_fps=2), _fake_model)
-    assert out["caption"] == ["describe"]
-    assert out["reasoner_videos"] == [str(clip)]
-    assert out["reasoner_images"] == [None]
-    assert out["video_sampling_kwargs"] == {"fps": 2}
-```
-
-- [ ] **Step 2: Run tests to verify they fail**
-
-Run (inside container): `pytest cosmos_framework/inference/inference_test.py -k reasoner_sample_data -v`
-Expected: FAIL — current builder always calls `Image.open` and has no `reasoner_videos`/`video_sampling_kwargs` keys.
-
-- [ ] **Step 3: Add the `VIDEO_EXTENSIONS` import**
-
-`VIDEO_EXTENSIONS` is exported from `cosmos_framework.inference.common.args` (the same module `args.py` imports it from). `inference.py` already imports `Path`, `Any`, `cast`, and `Image`, so this is the only new import. Add near the top of `inference.py`:
-
-```python
-from cosmos_framework.inference.common.args import VIDEO_EXTENSIONS
-```
-
-(If `inference.py` already imports other names from `cosmos_framework.inference.common.args`, append `VIDEO_EXTENSIONS` to that existing import instead of adding a new line.)
-
-- [ ] **Step 4: Rewrite `_get_reasoner_sample_data`**
-
-Replace lines 466-474:
-
-```python
-def _get_reasoner_sample_data(sample_args: OmniSampleArgs, model: OmniMoTModel) -> dict[str, Any]:
-    """Sample batch for reasoner text generation: prompt + optional conditioning image or video."""
-    image: Image.Image | None = None
-    video: str | None = None
-    if sample_args.vision_path is not None:
-        if Path(sample_args.vision_path).suffix.lower() in VIDEO_EXTENSIONS:
-            video = str(sample_args.vision_path)
-        else:
-            image = Image.open(sample_args.vision_path).convert("RGB")
-    out: dict[str, Any] = {
-        model.input_caption_key: [sample_args.prompt],
-        "reasoner_images": [image],
-    }
-    if video is not None:
-        out["reasoner_videos"] = [video]
-        out["video_sampling_kwargs"] = {
-            k: v
-            for k, v in {
-                "fps": sample_args.video_fps,
-                "num_frames": sample_args.video_num_frames,
-                "min_frames": sample_args.video_min_frames,
-                "max_frames": sample_args.video_max_frames,
-                "min_pixels": sample_args.video_min_pixels,
-                "max_pixels": sample_args.video_max_pixels,
-            }.items()
-            if v is not None
-        }
-    return out
-```
-
-- [ ] **Step 5: Run the routing tests**
-
-Run: `pytest cosmos_framework/inference/inference_test.py -k reasoner_sample_data -v`
-Expected: PASS (2 tests).
-
-- [ ] **Step 6: Update `_generate_reasoner_batch` to route videos**
-
-In `_generate_reasoner_batch` (lines 1656-1696), after `raw_images: list[...] = data_batch["reasoner_images"]` (line 1657), add video extraction and a three-way homogeneity check, then branch the model call. Replace lines 1656-1696 (`prompts = ...` through the `generate_reasoner_text(...)` call) with:
-
-```python
-        prompts: list[str] = data_batch[self.model.input_caption_key]
-        raw_images: list[Image.Image | None] = data_batch["reasoner_images"]
-        raw_videos: list[str | None] | None = data_batch.get("reasoner_videos")
-        video_sampling_kwargs: dict[str, Any] = data_batch.get("video_sampling_kwargs", {})
-
-        n_img = sum(img is not None for img in raw_images)
-        n_vid = sum(v is not None for v in (raw_videos or []))
-        if n_img and n_vid:
-            raise ValueError(
-                "Reasoner batch mixes image- and video-conditioned samples. Split into separate batches."
-            )
-        if 0 < n_img < len(raw_images):
-            raise ValueError(
-                "Reasoner batch mixes image-conditioned and text-only samples "
-                f"({n_img}/{len(raw_images)} have an image vision_path). Split into separate batches."
-            )
-        if raw_videos is not None and 0 < n_vid < len(raw_videos):
-            raise ValueError(
-                "Reasoner batch mixes video-conditioned and text-only samples "
-                f"({n_vid}/{len(raw_videos)} have a video vision_path). Split into separate batches."
-            )
-        images: list[Image.Image] | None = cast(list[Image.Image], raw_images) if n_img == len(raw_images) else None
-        videos: list[str] | None = (
-            cast(list[str], raw_videos) if raw_videos is not None and n_vid == len(raw_videos) else None
-        )
-
-        try:
-            with sync_distributed_errors():
-                for sa, prompt in zip(sample_args_list, prompts):
-                    if self.should_process_sample(sa) and not warmup:
-                        log.debug(f"{sa.__class__.__name__}({sa})")
-                        assert sa.output_dir is not None
-                        sa.output_dir.mkdir(parents=True, exist_ok=True)
-                        (sa.output_dir / "sample_args.json").write_text(sa.model_dump_json())
-                        self._run_text_guardrail(str(sa.output_dir), prompt)
-        except Exception as e:
-            return [
-                self._handle_sample_exception(sa, e)
-                for sa in sample_args_list
-                if self.should_process_sample(sa) and not warmup
-            ]
-
-        with self._get_timer(f"{self.model.__class__.__name__}.generate_reasoner_text"):
-            texts = self.model.generate_reasoner_text(
-                prompts,
-                max_new_tokens=sample_args_list[0].max_new_tokens,
-                images=images,
-                videos=videos,
-                video_sampling_kwargs=video_sampling_kwargs or None,
-                do_sample=sample_args_list[0].do_sample,
-                temperature=sample_args_list[0].temperature,
-                top_k=sample_args_list[0].top_k,
-                top_p=sample_args_list[0].top_p,
-                repetition_penalty=sample_args_list[0].repetition_penalty,
-                presence_penalty=sample_args_list[0].presence_penalty,
-                seed=sample_args_list[0].seed,
-            )
-```
-
-(Confirm `Any` and `cast` are already imported in `inference.py`; both are used elsewhere in the file, so no new import is needed.)
-
-- [ ] **Step 7: Import/lint check + run builder tests again**
-
-Run (inside container):
-`python -c "import cosmos_framework.inference.inference"`
-`ruff check cosmos_framework/inference/inference.py cosmos_framework/inference/inference_test.py`
-`pytest cosmos_framework/inference/inference_test.py -k reasoner_sample_data -v`
-Expected: no errors; tests PASS.
-
-- [ ] **Step 8: Commit**
-
-```bash
-git add cosmos_framework/inference/inference.py cosmos_framework/inference/inference_test.py
-git commit -m "feat(reasoner): route mp4 vision_path to video conditioning in inference engine"
-```
-
----
-
-## Task 8: Example input + user docs
-
-**Files:**
-- Create: `inputs/reasoner/reasoner_video.json`
-- Modify: `docs/inference.md`
-
-- [ ] **Step 1: Create the example input**
-
-`inputs/reasoner/reasoner_video.json`:
-
-```json
-{
-    "model_mode": "reasoner",
-    "prompt": "Describe what happens in this video in one sentence.",
-    "vision_path": "https://github.com/nvidia-cosmos/cosmos-dependencies/raw/2b17a2413bd86b2cf9b03823637108851e4ddf2d/inputs/vision/robot_153.jpg"
-}
-```
-
-NOTE: replace the placeholder `vision_path` with a real `.mp4` URL or local path before running. If a canonical sample mp4 exists under the cosmos-dependencies repo, use that; otherwise leave a local-path example and document it. (Confirm a sample clip during Task 9; update this file to point at it.)
-
-- [ ] **Step 2: Document in `docs/inference.md`**
-
-In the Modes table (around line 138-146), the reasoner mode is currently text/image only. Add a row or note documenting video input for `reasoner`. Find the reasoner documentation block and add:
-
-```markdown
-For `model_mode=reasoner`, `vision_path` may point to an **image** (`.jpg`/`.png`/…) or a **video** (`.mp4`/…). A video is decoded by the Qwen3-VL processor and sampled into frames. Optional frame-sampling controls (all default to the processor's defaults):
-
-- `video_fps`: frames sampled per second (mutually exclusive with `video_num_frames`).
-- `video_num_frames`: fixed number of frames to sample.
-- `video_min_frames` / `video_max_frames`: bounds on the sampled frame count.
-- `video_min_pixels` / `video_max_pixels`: per-frame pixel budget (drives resolution).
-
-Example: [`inputs/reasoner/reasoner_video.json`](../inputs/reasoner/reasoner_video.json).
-```
-
-- [ ] **Step 3: Verify the example JSON parses**
-
-Run: `python -c "import json; json.load(open('inputs/reasoner/reasoner_video.json'))"`
-Expected: no exception.
-
-- [ ] **Step 4: Commit**
-
-```bash
-git add inputs/reasoner/reasoner_video.json docs/inference.md
-git commit -m "docs(reasoner): document video input + add reasoner_video example"
-```
-
----
-
-## Task 9: Manual end-to-end GPU verification
-
-**Files:** none (verification only). Use the `cosmos3-run-env` skill to author the wrapper and `slurm-node` to run on a GPU node in the i4 container.
-
-This is the real correctness gate (per the spec: manual verification only). Do NOT mark the feature complete until this passes.
-
-- [ ] **Step 1: Obtain a short sample mp4**
-
-Place a short clip at a known path, e.g. `tmp_inputs/clip.mp4` (a few seconds is enough). Update `inputs/reasoner/reasoner_video.json`'s `vision_path` to that absolute path (or a real mp4 URL).
-
-- [ ] **Step 2: Run reasoner video inference on Cosmos3-Nano**
-
-```bash
-torchrun --nproc-per-node=8 -m cosmos_framework.scripts.inference \
-    --parallelism-preset=throughput --dp-shard-size=8 --dp-replicate-size=1 \
-    --cp-size=1 --cfgp-size=1 \
-    -i "inputs/reasoner/reasoner_video.json" \
-    -o outputs/reasoner_video --checkpoint-path Cosmos3-Nano --seed=0
-```
-
-Expected: completes without error; `outputs/reasoner_video/reasoner_video/reasoner_text.txt` exists and contains non-empty, on-topic text describing the clip.
-
-- [ ] **Step 3: Repeat for Cosmos3-Super**
-
-Same command with `--checkpoint-path Cosmos3-Super`. Expected: same success criteria.
-
-- [ ] **Step 4: Regression — confirm image and text-only reasoner still work**
-
-```bash
-torchrun --nproc-per-node=8 -m cosmos_framework.scripts.inference \
-    --parallelism-preset=throughput --dp-shard-size=8 --dp-replicate-size=1 \
-    --cp-size=1 --cfgp-size=1 \
-    -i "inputs/reasoner/reasoner.json" -i "inputs/reasoner/reasoner_image.json" \
-    -o outputs/reasoner_regress --checkpoint-path Cosmos3-Nano --seed=0
-```
-
-Expected: both produce `reasoner_text.txt` with non-empty text, unchanged from pre-change behavior.
-
-- [ ] **Step 5: Sampling-knob smoke check**
-
-Add `"video_fps": 1` (then separately `"video_num_frames": 8`) to the input JSON and re-run Step 2. Expected: still succeeds. Confirm `video_fps` + `video_num_frames` together is rejected with the mutual-exclusion error (validates Task 1 end-to-end). If the processor rejects a kwarg name, adjust the forwarding in `omni_mot_model.py` Task 6 Step 3 (route via the processor's video-loading kwargs) and re-run.
-
-- [ ] **Step 6: Record results**
-
-Note in the PR description: which checkpoints were run, the generated text samples, and confirmation that image/text-only paths are unaffected.
-
----
-
-## Self-review notes
-
-- **Spec coverage:** schema fields + mutual exclusion (Task 1, spec §args), defaults (Task 2), `prepare_multimodal_reasoner_inputs` video branch (Task 3, spec §component 1), `_impl` + guards (Task 4, spec §component 2 + §validation), pass-throughs (Task 5, spec §component 3), `OmniMoTModel` video block (Task 6, spec §component 4), inference routing + batch homogeneity (Task 7, spec §component 5 + §validation), example + docs (Task 8, spec §files-touched), manual verification (Task 9, spec §verification). All spec sections mapped.
-- **Naming consistency:** `pixel_values_videos` / `video_grid_thw` (model layers), `reasoner_videos` / `video_sampling_kwargs` (data_batch keys), `videos` / `video_sampling_kwargs` (`OmniMoTModel.generate_reasoner_text` params), `video_*` (sample-arg fields) — used consistently across tasks.
-- **Known flag:** the exact `apply_chat_template` video-sampling kwarg surface (Task 6 Step 3) is transformers-version-dependent and confirmed in Task 9 Step 5; fallback documented inline.
-- **Import paths (resolved):** `VIDEO_EXTENSIONS` is exported from `cosmos_framework.inference.common.args`; `inference.py` already imports `Path`/`Any`/`cast`/`Image`. No other new imports required.
-```
\ No newline at end of file
diff --git a/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md b/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md
deleted file mode 100644
index 7ddbca8..0000000
--- a/docs/superpowers/specs/2026-06-07-video-reasoner-input-design.md
+++ /dev/null
@@ -1,266 +0,0 @@
-# Video input for the `reasoner` model-mode of inference — design
-
-> **⚠️ SUPERSEDED (2026-06-08) — historical record.** Review found the inference
-> processor is the repo `Qwen3VLProcessor` wrapper, which runs
-> `do_sample_frames=False` and expects a **pre-decoded PIL frame list** (it drops
-> forwarded kwargs) — not a path it samples itself. The shipped implementation
-> therefore decodes frames via `torchvision.io.read_video` + Qwen `smart_nframes`
-> and exposes **only `video_fps`** (the `num_frames`/`min_frames`/`max_frames`/
-> `min_pixels`/`max_pixels` knobs described below were dropped). Treat the sections
-> below as the original design intent; the code is the source of truth.
-
-**Date:** 2026-06-07
-**Branch:** `maoshengl/video_reasoner_inference`
-**Status:** superseded by implementation (see banner above)
-
-## Goal
-
-Let `model_mode=reasoner` in the Cosmos inference engine
-(`python -m cosmos_framework.scripts.inference`) accept a **local mp4 video**
-as conditioning input, producing text that reasons over the clip — for both
-`Cosmos3-Nano` and `Cosmos3-Super`. Today the reasoner accepts only a text
-prompt or a single still image.
-
-## Background: why this is a gap
-
-The reasoner text-generation path runs entirely inside the Cosmos engine:
-
-```
-inference.py:_get_reasoner_sample_data        # loads ONE PIL image via Image.open
-  -> OmniMoTModel.generate_reasoner_text      # builds {"type":"image",...} chat block
-    -> net.generate_reasoner_text             # pass-through
-      -> unified_mot._impl_generate_reasoner_text   # pixel_values + image_grid_thw only
-        -> prepare_multimodal_reasoner_inputs       # image recipe only
-```
-
-Two hard blocks:
-
-1. `_get_reasoner_sample_data` (`cosmos_framework/inference/inference.py`) calls
-   `Image.open(vision_path)` unconditionally — PIL cannot decode mp4.
-2. `_impl_generate_reasoner_text` and `prepare_multimodal_reasoner_inputs`
-   **explicitly reject video** ("for I2V conditioning, frames must be passed as
-   images" — they have no `pixel_values_videos` / `video_grid_thw` params).
-
-Separately, `cosmos_framework/scripts/vlm/eval_videophy2.py` *does* consume
-video, but through a **different, standalone path**: a raw HuggingFace
-`Qwen3VLForConditionalGeneration` + `processor.apply_chat_template([{"type":
-"video",...}])` + `model.generate()`. It never touches the Cosmos engine, so it
-does not satisfy the goal of supporting `model_mode=reasoner` in
-`scripts.inference`.
-
-**Key enabling fact:** the vendored Qwen3-VL model under
-`cosmos_framework/model/vfm/vlm/qwen3_vl/` already implements video end to end —
-`get_video_features`, `get_rope_index(video_grid_thw=...)`,
-`get_placeholder_mask(pixel_values_videos=...)`, a `video_token_id`, and a full
-`video_processing_qwen3_vl.py`. Only the Cosmos reasoner **wrapper layers** are
-hardcoded to images. So the change is additive plumbing, not new model logic.
-
-## Approach (chosen)
-
-**B1 — add a parallel video lane through the existing reasoner stack.**
-
-Add optional video parameters alongside the existing image parameters through
-the wrapper layers, leaving the image and text-only paths bit-identical. A given
-prompt carries **either** an image, **or** a video, **or** neither — never both.
-No mixed image+video support (not needed).
-
-Approaches considered and rejected:
-
-- **B2 — unify image+video into one "media item" abstraction.** Cleaner
-  long-term and enables mixed media in one prompt, but larger blast radius, more
-  validation/tests, and supports a capability not requested (YAGNI).
-- **B3 — expose the HF `Qwen3VLForConditionalGeneration` route instead.** Bypasses
-  the Cosmos engine entirely (no `model_mode=reasoner`, no parallelism /
-  guardrails / output plumbing) — does not meet the goal.
-
-## Data flow
-
-```
-inputs/reasoner/reasoner_video.json
-  { model_mode: "reasoner", prompt, vision_path: "clip.mp4", video_*: ... }
-        |
-        v  args.py: vision_path resolves; extension -> ConditionVisionMode.VIDEO (already detected)
-_get_reasoner_sample_data()
-        |  detect .mp4 -> {prompt, "reasoner_videos": [path], "<video sampling kwargs>"}
-        v                (vs "reasoner_images" for the image branch)
-_generate_reasoner_batch()
-        |  route videos -> model.generate_reasoner_text(videos=[...], video_* kwargs)
-        v
-OmniMoTModel.generate_reasoner_text(videos=..., video_* kwargs)
-        |  build {"type":"video","video":path, <sampling kwargs>} chat block
-        |  apply_chat_template -> pixel_values_videos, video_grid_thw
-        v
-net.generate_reasoner_text(pixel_values_videos=..., video_grid_thw=...)   [pass-through]
-        v
-unified_mot._impl_generate_reasoner_text(... video tensors ...)
-        v
-prepare_multimodal_reasoner_inputs(...)   NEW video branch:
-        get_video_features -> get_placeholder_mask(video) -> get_rope_index(video_grid_thw)
-        v
-reasoner_forward -> AR decode -> text   (unchanged)
-```
-
-## Component changes
-
-All new params are optional and default to `None`/absent, so existing callers
-and the image/text-only paths are unchanged.
-
-### 1. `qwen3_vl/utils.py` — `prepare_multimodal_reasoner_inputs` (the one real seam)
-
-Add optional `pixel_values_videos` / `video_grid_thw` params. When they are set
-(and the image params are not), run the video recipe using helpers that already
-exist:
-
-- `get_video_features(causal_lm, pixel_values_videos, video_grid_thw)` instead of
-  `get_image_features`
-- `get_placeholder_mask(..., video_features=video_embeds)` -> use the returned
-  `_video_mask`
-- `get_rope_index(..., video_grid_thw=video_grid_thw)` instead of the image grid
-
-The `masked_scatter`, `visual_pos_masks`, deepstack alignment, and return shape
-all stay identical — only which features and which grid feed in change. The
-image branch is untouched. Update the docstring that currently says videos are
-not supported.
-
-### 2. `unified_mot.py` — `_impl_generate_reasoner_text`
-
-Add `pixel_values_videos` / `video_grid_thw` params. Extend the pairing guard
-(currently `(pixel_values is None) != (image_grid_thw is None)`) to also validate
-the video pair and to reject image+video supplied together. Branch: if video
-tensors present -> call `prepare_multimodal_reasoner_inputs` with them; else
-existing behavior. Update the "Videos are not supported" docstring.
-
-### 3. `unified_mot.py` + `cosmos3_vfm_network.py` — the two `generate_reasoner_text` pass-throughs
-
-Add the two video params and forward verbatim. Pure plumbing.
-
-### 4. `omni_mot_model.py` — `OmniMoTModel.generate_reasoner_text`
-
-Add `videos: list[Any] | None = None` (parallel to `images`) plus the optional
-video sampling kwargs (see schema below). Validate not-both (image and video).
-When `videos` is set, build the last user message with a
-`{"type": "video", "video": videos[idx], <sampling kwargs>}` block instead of the
-image block, then read `pixel_values_videos` / `video_grid_thw` out of the
-`apply_chat_template` output and pass them down. Same per-prompt `B=1` loop, same
-CP/CFGP output broadcast.
-
-### 5. `inference.py` — `_get_reasoner_sample_data` + `_generate_reasoner_batch`
-
-- Builder: branch on `Path(vision_path).suffix`. Image extension keeps
-  `Image.open` + `reasoner_images`. Video extension passes the **path string**
-  under `reasoner_videos` (the processor decodes it — see "Frame sampling"
-  below), and carries the resolved `video_*` sampling kwargs.
-- Batch: read whichever key is present, apply the homogeneity check (no mixing
-  within a batch), and call `generate_reasoner_text(videos=...)` with the
-  sampling kwargs when videos are present.
-
-### 6. `args.py` — schema (`SamplingArgs` / `SamplingOverrides`) + reasoner `sample_args.json`
-
-Add the input-video sampling knobs. They are named with a `video_` prefix to
-avoid colliding with the existing **output**-oriented `fps` / `num_frames`
-fields (which mean output rate/length and are otherwise unused by the reasoner).
-
-| New reasoner sample-arg | Maps to processor kwarg | Default     |
-| ----------------------- | ----------------------- | ----------- |
-| `video_fps`             | `fps`                   | `None` (->2)|
-| `video_num_frames`      | `num_frames`            | `None`      |
-| `video_min_frames`      | `min_frames`            | `None` (->4)|
-| `video_max_frames`      | `max_frames`            | `None`(->768)|
-| `video_min_pixels`      | `min_pixels`            | `None`      |
-| `video_max_pixels`      | `max_pixels`            | `None`      |
-
-`None` means "use the processor default," so the no-override behavior is
-identical to relying purely on processor defaults. Only non-`None` values are
-forwarded into the video block / processor kwargs.
-
-## Frame sampling
-
-The Qwen3-VL processor decodes the mp4 and samples frames itself; we pass the
-**path string** straight into the `{"type":"video",...}` block (matching
-`eval_videophy2.py`) rather than pre-decoding frames ourselves. The optional
-`video_*` knobs above tune that sampling.
-
-## Validation & error handling (fail fast, clear messages)
-
-- **Image + video together** — rejected at `_impl_generate_reasoner_text` and at
-  `OmniMoTModel.generate_reasoner_text`. The reasoner conditions on one medium at
-  a time.
-- **Video pairing** — `pixel_values_videos` and `video_grid_thw` must both be
-  present or both absent (mirrors the existing image-pair guard).
-- **`video_fps` + `video_num_frames` together** — rejected in the schema,
-  mirroring the processor's own mutual-exclusion rule.
-- **Batch homogeneity** — extend the current "no mixing image-conditioned and
-  text-only" check in `_generate_reasoner_batch` to three kinds: a batch is
-  all-text, all-image, or all-video. Mixed -> `ValueError` telling the user to
-  split inputs.
-- **No vision tower** — already handled: `_impl` raises if `causal_lm.visual` is
-  missing.
-- **Placeholder-token mismatch** — already handled: `get_placeholder_mask`
-  raises if the video token count != produced features.
-- **Extension routing** — relies on the existing `VIDEO_EXTENSIONS` /
-  `IMAGE_EXTENSIONS` sets in `args.py`; an unrecognized extension already raises
-  `Invalid vision extension`.
-
-## Non-goals / notes
-
-- **Mixed image+video in one prompt** — out of scope.
-- **Input-video content-safety guardrail** — none today; not added. The reasoner
-  emits only text, never video, so the text guardrail on prompt and output is
-  unchanged and sufficient.
-- **Video decode backend** — the processor needs a video backend
-  (decord / torchvision) to read the mp4; if missing, the failure surfaces inside
-  `apply_chat_template`. We do not add our own decode path. This is an
-  environment dependency to document, not code we write.
-- **Unused output vision fields** — `fps` / `num_frames` / resolution remain
-  unused by the reasoner (already defaulted in `args.py`).
-
-## Verification (manual only)
-
-No automated test for now. The implementation ships the artifacts to verify by
-hand.
-
-Example input `inputs/reasoner/reasoner_video.json`:
-
-```json
-{
-    "model_mode": "reasoner",
-    "prompt": "Describe what happens in this video in one sentence.",
-    "vision_path": "/abs/path/to/clip.mp4",
-    "video_fps": 2,
-    "video_max_pixels": 200704
-}
-```
-
-(`video_*` fields optional — omit to use processor defaults.)
-
-Run (Nano; Super identical but `--checkpoint-path Cosmos3-Super`):
-
-```bash
-torchrun --nproc-per-node=8 -m cosmos_framework.scripts.inference \
-    --parallelism-preset=throughput --dp-shard-size=8 --dp-replicate-size=1 \
-    --cp-size=1 --cfgp-size=1 \
-    -i "inputs/reasoner/reasoner_video.json" \
-    -o outputs/reasoner_video --checkpoint-path Cosmos3-Nano --seed=0
-```
-
-Expected: `outputs/reasoner_video/reasoner_video/reasoner_text.txt` contains
-non-empty, on-topic text describing the clip; no crash; image and text-only
-reasoner inputs still work unchanged.
-
-A parity check against the HF `eval_videophy2.py` path is a possible future
-hardening step, out of scope here.
-
-## Files touched
-
-| File | Change |
-| ---- | ------ |
-| `cosmos_framework/model/vfm/vlm/qwen3_vl/utils.py` | `prepare_multimodal_reasoner_inputs`: add video branch |
-| `cosmos_framework/model/vfm/mot/unified_mot.py` | `_impl_generate_reasoner_text` + wrapper `generate_reasoner_text`: add/forward video params |
-| `cosmos_framework/model/vfm/mot/cosmos3_vfm_network.py` | `generate_reasoner_text`: forward video params |
-| `cosmos_framework/model/vfm/omni_mot_model.py` | `generate_reasoner_text`: `videos` param, video chat block, sampling kwargs |
-| `cosmos_framework/inference/inference.py` | `_get_reasoner_sample_data` + `_generate_reasoner_batch`: route mp4 |
-| `cosmos_framework/inference/args.py` | add `video_*` sampling fields + mutual-exclusion validation |
-| `cosmos_framework/inference/defaults/reasoner/sample_args.json` | add `video_*` defaults (`null`) |
-| `inputs/reasoner/reasoner_video.json` | new example input |
-| `docs/inference.md` | document video input + `video_*` fields for `reasoner` mode |
-```
\ No newline at end of file

From eb203478f95efe82e6766b9e09e0e12383f58334 Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Mon, 8 Jun 2026 03:41:53 -0700
Subject: [PATCH 19/20] docs(reasoner): regenerate inference.md TOC for
 Reasoner section (pre-commit)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 docs/inference.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/inference.md b/docs/inference.md
index ae04065..aa46b75 100644
--- a/docs/inference.md
+++ b/docs/inference.md
@@ -19,6 +19,7 @@ ______________________________________________________________________
 - [Sample Arguments](#sample-arguments)
   - [Text](#text)
   - [Vision (Image/Video)](#vision-imagevideo)
+  - [Reasoner](#reasoner)
   - [Action](#action)
   - [Custom Defaults](#custom-defaults)
 - [Guardrails](#guardrails)

From 6e5830a87269b18e8e1ac16e7675ac4022addc6b Mon Sep 17 00:00:00 2001
From: Maosheng Liao <maoshengl@nvidia.com>
Date: Tue, 9 Jun 2026 05:25:42 -0700
Subject: [PATCH 20/20] test(reasoner): cover video modality in get_sample_data
 tests; account for reasoner_videos key

- _make_reasoner_sample_args gains video_fps
- text-only / with-image get_sample_data tests assert the always-present reasoner_videos:[None]
- add test_get_sample_data_reasoner_with_video (monkeypatched decoder)
- drop redundant lower-level _get_reasoner_sample_data duplicates (the public get_sample_data tests already cover them)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cosmos_framework/inference/inference_test.py | 67 +++++++++-----------
 1 file changed, 29 insertions(+), 38 deletions(-)

diff --git a/cosmos_framework/inference/inference_test.py b/cosmos_framework/inference/inference_test.py
index 58da0c0..b8ec16b 100644
--- a/cosmos_framework/inference/inference_test.py
+++ b/cosmos_framework/inference/inference_test.py
@@ -169,6 +169,7 @@ def _make_reasoner_sample_args(**overrides: Any) -> SimpleNamespace:
         model_mode=ModelMode.REASONER,
         prompt="Describe a robotic arm.",
         vision_path=None,
+        video_fps=None,
         max_new_tokens=8,
         do_sample=False,
         temperature=1.0,
@@ -189,7 +190,11 @@ def test_get_sample_data_reasoner_text_only() -> None:
 
     out = inference.get_sample_data(sample_args, model, device="cpu")
 
-    assert out == {"caption": ["Describe a robotic arm."], "reasoner_images": [None]}
+    assert out == {
+        "caption": ["Describe a robotic arm."],
+        "reasoner_images": [None],
+        "reasoner_videos": [None],
+    }
 
 
 @pytest.mark.L0
@@ -205,13 +210,35 @@ def test_get_sample_data_reasoner_with_image(tmp_path: Path) -> None:
 
     out = inference.get_sample_data(sample_args, model, device="cpu")
 
-    assert list(out) == ["caption", "reasoner_images"]
+    assert list(out) == ["caption", "reasoner_images", "reasoner_videos"]
     assert out["caption"] == ["Describe a robotic arm."]
+    assert out["reasoner_videos"] == [None]
     assert len(out["reasoner_images"]) == 1
     assert out["reasoner_images"][0].size == (8, 8)
     assert out["reasoner_images"][0].mode == "RGB"
 
 
+@pytest.mark.L0
+def test_get_sample_data_reasoner_with_video(monkeypatch: pytest.MonkeyPatch) -> None:
+    """A video ``vision_path`` routes through ``_decode_reasoner_video`` into ``reasoner_videos``.
+
+    The decoder is monkeypatched (real decode needs torchvision + an actual clip);
+    this asserts the routing/contract, not the decode itself."""
+    from cosmos_framework.inference import inference
+
+    decoded = {"frames": ["F0", "F1"], "fps": 2.0}
+    monkeypatch.setattr(inference, "_decode_reasoner_video", lambda path, fps: decoded)
+    model = SimpleNamespace(input_caption_key="caption")
+    sample_args = _make_reasoner_sample_args(vision_path="/tmp/clip.mp4", video_fps=2.0)
+
+    out = inference.get_sample_data(sample_args, model, device="cpu")
+
+    assert out["caption"] == ["Describe a robotic arm."]
+    assert out["reasoner_videos"] == [decoded]
+    assert out["reasoner_images"] == [None]
+    assert "video_sampling_kwargs" not in out
+
+
 @pytest.mark.L0
 def test_reasoner_defaults_json_round_trip() -> None:
     import json as _json
@@ -351,39 +378,3 @@ def test_reasoner_defaults_validate_against_overrides() -> None:
     OmniSampleOverrides.model_validate(filtered)
 
 
-# ---------------------------------------------------------------------------
-# _get_reasoner_sample_data: image / video / text-only routing
-# ---------------------------------------------------------------------------
-
-
-def _fake_sa(vision_path: Any, **video_kw: Any) -> SimpleNamespace:
-    base: dict[str, Any] = dict(prompt="describe", vision_path=vision_path, video_fps=None)
-    base.update(video_kw)
-    return SimpleNamespace(**base)
-
-
-_fake_model = SimpleNamespace(input_caption_key="caption")
-
-
-@pytest.mark.L0
-def test_reasoner_sample_data_text_only() -> None:
-    from cosmos_framework.inference.inference import _get_reasoner_sample_data
-
-    out = _get_reasoner_sample_data(_fake_sa(None), _fake_model)
-    assert out["caption"] == ["describe"]
-    assert out["reasoner_images"] == [None]
-    assert out["reasoner_videos"] == [None]
-
-
-@pytest.mark.L0
-def test_reasoner_sample_data_video_routes_to_videos(monkeypatch: pytest.MonkeyPatch) -> None:
-    import cosmos_framework.inference.inference as inf
-    from cosmos_framework.inference.inference import _get_reasoner_sample_data
-
-    sentinel = {"frames": ["F0", "F1"], "fps": 2.0}
-    monkeypatch.setattr(inf, "_decode_reasoner_video", lambda path, fps: sentinel)
-    out = _get_reasoner_sample_data(_fake_sa("/tmp/clip.mp4", video_fps=2.0), _fake_model)
-    assert out["caption"] == ["describe"]
-    assert out["reasoner_videos"] == [sentinel]
-    assert out["reasoner_images"] == [None]
-    assert "video_sampling_kwargs" not in out