24 changes: 7 additions & 17 deletions .pre-commit-config.yaml
@@ -57,15 +57,18 @@ repos:
# intend to merge. Without it, you might run experiments with one config, but when merging upstream,
# the config could silently fall back to the base defaults—resulting in different hyperparameters.
#
# For example, weve seen cases where an SFT recipe runs without a custom chat_template. When merged,
# it unexpectedly picks up the default recommended chat_template from upstream, which doesnt match
# For example, we've seen cases where an SFT recipe runs without a custom chat_template. When merged,
# it unexpectedly picks up the default recommended chat_template from upstream, which doesn't match
# the original experiment setup.
#
# If this check is disruptive, you can disable the pre-commit hook locally. However, before a recipe
# is accepted upstream, we expect the config to be minimized.
#
# The minimize-check command infers the base config from each recipe's `defaults` key, so it
# correctly handles inheritance chains (e.g., child → parent → grandparent).
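# As a sketch (hypothetical file names), a two-level chain might look like:
#   recipes/llm/sft-demo-1n4g.yaml  ->  defaults: ./sft-demo.yaml
#   recipes/llm/sft-demo.yaml       ->  defaults: ../../sft.yaml
# minimize-check resolves the full chain, then flags any key in the recipe
# that merely restates a value already inherited from its base.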
- repo: local
hooks:
- id: configs-minimize-check-llm
- id: configs-minimize-check
name: minimize-check llm recipes
language: system
pass_filenames: false
@@ -74,17 +74,4 @@ repos:
- -lc
- |
set -euo pipefail
base="examples/configs/dpo.yaml"; for f in examples/configs/recipes/llm/dpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
base="examples/configs/grpo_math_1B.yaml"; for f in examples/configs/recipes/llm/grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
base="examples/configs/sft.yaml"; for f in examples/configs/recipes/llm/sft-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
base="examples/configs/distillation_math.yaml"; for f in examples/configs/recipes/llm/distillation-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
- id: configs-minimize-check-vlm
name: minimize-check vlm recipes
language: system
pass_filenames: false
entry: bash
args:
- -lc
- |
set -euo pipefail
base="examples/configs/vlm_grpo_3B.yaml"; for f in examples/configs/recipes/vlm/vlm_grpo-*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$base" "$f"; done
for f in examples/configs/recipes/{llm,vlm}/*.yaml; do [ -e "$f" ] && ./tools/config_cli.py minimize-check "$f"; done
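The consolidated loop relies on two bash behaviors: brace expansion of `{llm,vlm}`, and the fact that an unmatched glob pattern is left as a literal string, which the existence guard then filters out. A minimal self-contained sketch of that pattern, using temporary stand-in paths and a plain `echo` in place of `./tools/config_cli.py`:

```shell
#!/usr/bin/env bash
# Sketch of the hook's iteration pattern (stand-in paths, not the real CLI).
set -uo pipefail
root=$(mktemp -d)
mkdir -p "$root"/{llm,vlm}
printf 'defaults: ../../base.yaml\n' > "$root/llm/sft-demo.yaml"

checked=0
for f in "$root"/{llm,vlm}/*.yaml; do
  # vlm/ is empty, so its glob stays a literal string; the guard skips it.
  [ -e "$f" ] || continue
  echo "minimize-check $f"   # the real hook runs: ./tools/config_cli.py minimize-check "$f"
  checked=$((checked+1))
done
echo "checked $checked recipe(s)"
```

Run this under bash (brace expansion is not POSIX sh). Note that under `set -e`, a loop body ending in a false `[ -e "$f" ] && …` list can abort the whole script, which is why the sketch prefers `|| continue`.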
19 changes: 19 additions & 0 deletions README.md
@@ -286,6 +286,10 @@ sbatch \
--gres=gpu:8 \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

The required `CONTAINER` can be built by following the instructions in the [Docker documentation](docs/docker.md).

#### GRPO Qwen2.5-32B
@@ -313,6 +317,9 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

#### GRPO Multi-Turn

We also support multi-turn generation and training (tool use, games, etc.).
@@ -361,6 +368,9 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

## Supervised Fine-Tuning (SFT)

We provide example SFT experiments using various datasets including [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/), OpenAI format datasets (with tool calling support), and custom JSONL datasets. For detailed documentation on supported datasets and configurations, see the [SFT documentation](docs/guides/sft.md).
@@ -406,6 +416,9 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

## DPO

We provide a sample DPO experiment that uses the [HelpSteer3 dataset](https://huggingface.co/datasets/nvidia/HelpSteer3) for preference-based training.
@@ -464,6 +477,9 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

## RM

We provide a sample RM experiment that uses the [HelpSteer3 dataset](https://huggingface.co/datasets/nvidia/HelpSteer3) for preference-based training.
@@ -508,6 +524,9 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

## Evaluation

We provide evaluation tools to assess model capabilities.
3 changes: 3 additions & 0 deletions docs/about/algorithms/dapo.md
@@ -64,6 +64,9 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

## Configuration

DAPO uses the same configuration structure as GRPO. The key parameters are:
2 changes: 2 additions & 0 deletions docs/about/algorithms/dpo.md
@@ -56,3 +56,5 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.
6 changes: 6 additions & 0 deletions docs/about/algorithms/grpo.md
@@ -64,6 +64,9 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

The required `CONTAINER` can be built by following the instructions in the [Docker documentation](../../docker.md).

## GRPO Qwen2.5-32B
@@ -92,6 +95,9 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

## GRPO Multi-Turn

We also support multi-turn generation and training (tool use, games, etc.). Reference example for training to play a Sliding Puzzle Game:
2 changes: 2 additions & 0 deletions docs/about/algorithms/on-policy-distillation.md
@@ -41,3 +41,5 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.
2 changes: 2 additions & 0 deletions docs/about/algorithms/rm.md
@@ -42,3 +42,5 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.
2 changes: 2 additions & 0 deletions docs/about/algorithms/sft.md
@@ -43,3 +43,5 @@ sbatch \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.
7 changes: 7 additions & 0 deletions docs/cluster.md
@@ -28,6 +28,9 @@ sbatch \
> [!TIP]
> Depending on your Slurm cluster configuration, you may or may not need to include the `--gres=gpu:8` option in the `sbatch` command.

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead of `--gres=gpu:8`.

Upon successful submission, Slurm will print the `SLURM_JOB_ID`:
```text
Submitted batch job 1980204
@@ -58,6 +61,10 @@ sbatch \
--gres=gpu:8 \
ray.sub
```

> [!NOTE]
> For GB200 systems with 4 GPUs per node, use `--gres=gpu:4` instead.

Upon successful submission, Slurm will print the `SLURM_JOB_ID`:
```text
Submitted batch job 1980204
@@ -0,0 +1,12 @@
defaults: ./dapo-qwen2.5-7b.yaml
policy:
dtensor_cfg:
context_parallel_size: 2
checkpointing:
checkpoint_dir: results/dapo-qwen2.5-7b-16n4g-fsdp2cp2
logger:
log_dir: logs/dapo-qwen2.5-7b-16n4g-fsdp2cp2
wandb:
name: dapo-qwen2.5-7b-16n4g-fsdp2cp2
cluster:
gpus_per_node: 4
10 changes: 1 addition & 9 deletions examples/configs/recipes/llm/dapo-qwen2.5-7b.yaml
@@ -1,22 +1,16 @@
defaults: ../../grpo_math_1B.yaml
grpo:
num_prompts_per_step: 512
num_generations_per_prompt: 16
batch_multiplier: 3 # Multiplier for dataloader batch size calculation (batch_multiplier × num_prompts_per_step). Following DAPO dynamic sampling, the actual training batch size equals num_prompts_per_step × num_generations_per_prompt.
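# Worked example with the values above: dataloader batch = 3 x 512 = 1536,
# while the DAPO training batch = 512 x 16 = 8192 generated sequences.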
max_rollout_turns: 1 # for multi-turn rollouts. Math Environments just have 1 turn (answering the question)
max_num_steps: 10000
use_leave_one_out_baseline: false
val_period: 20
max_val_samples: 960
val_batch_size: 960
use_dynamic_sampling: true
dynamic_sampling_max_gen_batches: 10
reward_scaling:
enabled: true
source_min: 0.0
source_max: 1.0
target_min: -1.0
target_max: 1.0
reward_shaping:
enabled: true
overlong_buffer_length: 2048
@@ -41,7 +35,6 @@ policy:
_v2: false
context_parallel_size: 4
megatron_cfg:
empty_unused_memory_level: 1
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 2
context_parallel_size: 2
@@ -87,8 +80,7 @@ data:
env:
math:
num_workers: 16
math_verify_impl: "dapo_math_verify"

math_verify_impl: dapo_math_verify
logger:
monitor_gpus: false
wandb:
@@ -0,0 +1,9 @@
defaults: ./distillation-qwen3-32b-to-1.7b-base-1n8g-fsdp2tp1.v1.yaml
checkpointing:
checkpoint_dir: checkpoints/distillation-qwen3-32b-to-1.7b-base-1n4g-fsdp2tp1
logger:
log_dir: logs/distillation-qwen3-32b-to-1.7b-base-1n4g-fsdp2tp1
wandb:
name: distillation-qwen3-32b-to-1.7b-base-1n4g-fsdp2tp1
cluster:
gpus_per_node: 4
@@ -0,0 +1,16 @@
defaults: ./distillation-qwen3-32b-to-1.7b-base-1n8g-megatron-tp2pp2cp2-pack.yaml
policy:
megatron_cfg:
tensor_model_parallel_size: 1
teacher:
megatron_cfg:
tensor_model_parallel_size: 2
checkpointing:
checkpoint_dir: checkpoints/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack
logger:
log_dir: logs/distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack
wandb:
name: distillation-qwen3-32b-to-1.7b-base-1n4g-megatron-tp1pp2cp2-pack
cluster:
gpus_per_node: 4

@@ -0,0 +1,14 @@
defaults: ./distillation-qwen3-32b-to-4b-base-2n8g-fsdp2tp2-long.v1.yaml
policy:
generation:
vllm_cfg:
tensor_parallel_size: 1
checkpointing:
checkpoint_dir: checkpoints/distillation-qwen3-32b-to-4b-base-2n4g-fsdp2tp1-long
logger:
log_dir: logs/distillation-qwen3-32b-to-4b-base-2n4g-fsdp2tp1-long
wandb:
name: distillation-qwen3-32b-to-4b-base-2n4g-fsdp2tp1-long
cluster:
gpus_per_node: 4

@@ -0,0 +1,10 @@
defaults: ./dpo-llama3.1-8b-instruct-4n8g-fsdp2tp2-quick.v2.yaml
policy:
dtensor_cfg:
tensor_parallel_size: 1
logger:
wandb:
name: dpo-llama3.1-8b-instruct-4n4g-fsdp2tp1-quick
cluster:
gpus_per_node: 4

@@ -0,0 +1,11 @@
defaults: ./dpo-llama3.1-8b-instruct-4n8g-megatrontp2pp2-quick.yaml
policy:
megatron_cfg:
tensor_model_parallel_size: 1
sequence_parallel: false
logger:
wandb:
name: dpo-llama3.1-8b-instruct-4n4g-megatrontp1pp2-quick
cluster:
gpus_per_node: 4

@@ -0,0 +1,10 @@
defaults: ./dpo-llama3.2-1b-instruct-1n8g-fsdp2tp1.v2.yaml
checkpointing:
checkpoint_dir: results/dpo-llama3.2-1b-instruct-1n4g-fsdp2tp1
logger:
log_dir: logs/dpo-llama3.2-1b-instruct-1n4g-fsdp2tp1
wandb:
name: dpo-llama3.2-1b-instruct-1n4g-fsdp2tp1
cluster:
gpus_per_node: 4

@@ -0,0 +1,23 @@
defaults: ./grpo-dapomath17k-dsv3-megatron.yaml
policy:
megatron_cfg:
tensor_model_parallel_size: 4
expert_model_parallel_size: 16
pipeline_model_parallel_size: 4
context_parallel_size: 2
num_layers_in_first_pipeline_stage: 15
num_layers_in_last_pipeline_stage: 14
make_sequence_length_divisible_by: 4
generation:
vllm_cfg:
tensor_parallel_size: 16
checkpointing:
checkpoint_dir: results/grpo-dapomath17k-dsv3-32n4g-megatron
logger:
wandb:
name: grpo-dapomath17k-dsv3-32n4g-megatron
mlflow:
run_name: grpo-dapomath17k-dsv3-32n4g-megatron
cluster:
gpus_per_node: 4

@@ -1,14 +1,9 @@
defaults:
- ../../grpo_math_1B.yaml
- grpo-deepscaler-1.5b-8K.yaml
defaults: grpo-deepscaler-1.5b-8K.yaml
loss_fn:
reference_policy_kl_penalty: 0.001
ratio_clip_max: 0.28
policy:
max_total_sequence_length: 16384
logprob_batch_size: 2
dtensor_cfg:
cpu_offload: true
activation_checkpointing: true
tensor_parallel_size: 2
_v2: false
10 changes: 10 additions & 0 deletions examples/configs/recipes/llm/grpo-deepscaler-1.5b-1n4g-8K.yaml
@@ -0,0 +1,10 @@
defaults: ./grpo-deepscaler-1.5b-8K.yaml
checkpointing:
checkpoint_dir: results/grpo-deepscaler-1.5b-1n4g-8K
logger:
log_dir: logs/grpo-deepscaler-1.5b-1n4g-8K
wandb:
name: grpo-deepscaler-1.5b-1n4g-8K
cluster:
gpus_per_node: 4

@@ -1,19 +1,12 @@
defaults:
- ../../grpo_math_1B.yaml
- grpo-deepscaler-1.5b-8K.yaml
defaults: grpo-deepscaler-1.5b-8K.yaml
loss_fn:
reference_policy_kl_penalty: 0.0001
ratio_clip_max: 0.28
policy:
max_total_sequence_length: 24576
logprob_batch_size: 2
dtensor_cfg:
cpu_offload: true
activation_checkpointing: true
tensor_parallel_size: 2
_v2: false
sequence_packing:
enabled: false
optimizer:
kwargs:
lr: 5.0e-07
@@ -0,0 +1,10 @@
defaults: ./grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml
checkpointing:
checkpoint_dir: results/grpo-gemma3-1b-it-1n4g-fsdp2tp1
logger:
log_dir: logs/grpo-gemma3-1b-it-1n4g-fsdp2tp1
wandb:
name: grpo-gemma3-1b-it-1n4g-fsdp2tp1
cluster:
gpus_per_node: 4
