NVIDIA-NeMo · anubhutivyas · Jun 17, 2026 · coderabbitai · Jun 17, 2026
@@ -38,6 +38,10 @@ class LoRAParams(BaseModel):
     dropout: float = Field(default=0.0, ge=0.0, le=1.0, description="LoRA dropout probability for regularization.")
     merge: bool = False
     target_modules: list[str] | None = None
+    exclude_modules: list[str] | None = Field(
+        default=None, description="Module name patterns to exclude from LoRA (e.g. ['*.out_proj'])."
+    )
+    use_triton: bool = Field(default=True, description="Use the optimized Triton LoRA kernel.")
 
 
 class DatasetSpec(BaseModel):
@@ -59,6 +63,10 @@ class TrainingSpec(BaseModel):
         default=None,
         description="Model precision for training. Auto-detected from the checkpoint when unset.",
     )
+    attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] = Field(
+        default="sdpa",
+        description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2', or 'eager'.",
+    )
     execution_profile: str | None = Field(default=None, min_length=1)
     teacher_model: str | None = None
     distillation_ratio: float = Field(default=0.5, ge=0.0, le=1.0)
@@ -90,6 +98,12 @@ class BatchSpec(BaseModel):
     global_batch_size: int = Field(default=8, gt=0)
     micro_batch_size: int = Field(default=1, gt=0)
     sequence_packing: bool = False
+    sequence_packing_max_samples: int = Field(
+        default=1000, gt=0, description="Samples analyzed to estimate the optimal pack size when packing is enabled."
+    )
+    split_across_pack: bool = Field(
+        default=False, description="Allow a single sample to span two packs when sequence packing."
+    )
 
 
 class OptimizerSpec(BaseModel):
@@ -103,6 +117,11 @@ class OptimizerSpec(BaseModel):
     adam_beta1: float = Field(default=0.9, ge=0.0, lt=1.0, description="Adam optimizer beta1.")
     adam_beta2: float = Field(default=0.999, ge=0.0, lt=1.0, description="Adam optimizer beta2.")
     warmup_steps: int = Field(default=0, ge=0)
+    adam_eps: float = Field(default=1e-8, gt=0.0, description="Adam/AdamW epsilon for numerical stability.")
+    optimizer: Literal["adam", "adamw"] = Field(default="adam", description="Optimizer algorithm.")
+    lr_decay_style: Literal["cosine", "linear", "constant"] = Field(
+        default="cosine", description="Learning-rate decay schedule."
+    )
 
 
 class ParallelismSpec(BaseModel):

@@ -117,10 +117,13 @@ Full template:
       "alpha": 32,
       "dropout": 0.0,
       "merge": false,
-      "target_modules": null
+      "target_modules": null,
+      "exclude_modules": null,
+      "use_triton": true
     },
     "max_seq_length": 2048,
     "precision": null,
+    "attn_implementation": "sdpa",
     "execution_profile": null
   },
   "schedule": {
@@ -132,14 +135,19 @@ Full template:
   "batch": {
     "global_batch_size": 4,
     "micro_batch_size": 1,
-    "sequence_packing": false
+    "sequence_packing": false,
+    "sequence_packing_max_samples": 1000,
+    "split_across_pack": false
   },
   "optimizer": {
     "learning_rate": 5e-5,
     "min_learning_rate": null,
     "weight_decay": 0.01,
     "adam_beta1": 0.9,
     "adam_beta2": 0.999,
+    "adam_eps": 1e-8,
+    "optimizer": "adam",
+    "lr_decay_style": "cosine",
     "warmup_steps": 0
   },
   "parallelism": {
@@ -171,8 +179,11 @@ Full template:
 | `lora.dropout` | `0.0` | LoRA dropout (0.0–1.0) for regularization |
 | `lora.merge` | `false` | If true with `lora_merged`, output is full weights not adapter |
 | `lora.target_modules` | `null` | e.g. `["q_proj","v_proj"]`; null = platform default targets |
+| `lora.exclude_modules` | `null` | Patterns to exclude from LoRA, e.g. `["*.out_proj"]` |
+| `lora.use_triton` | `true` | Use the optimized Triton LoRA kernel |
 | `max_seq_length` | `2048` | Truncate/pack to this length; lower if OOM |
 | `precision` | `null` | `bf16` \| `fp16` \| `fp32` \| `fp8`; null auto-detects from the checkpoint |
+| `attn_implementation` | `sdpa` | `sdpa` (PyTorch native) \| `flash_attention_2` \| `eager` |
 | `teacher_model` | — | **Model entity ref** (not HF id). Required for distillation; see below |
 | `distillation_ratio` | `0.5` | KD blend (0–1) |
 | `distillation_temperature` | `1.0` | KD temperature |
@@ -199,6 +210,8 @@ LoRA block is auto-created when `finetuning_type` is `lora` or `lora_merged`.
 | `global_batch_size` | `8` (schema) | Effective batch across all GPUs; **≥48 GB LoRA tables → `SKILL.md`** |
 | `micro_batch_size` | `1` (schema) | **Per GPU**; same SKILL tables for single- and multi-GPU (TP=1) |
 | `sequence_packing` | `false` | Pack short sequences for throughput (needs compatible data) |
+| `sequence_packing_max_samples` | `1000` | Samples analyzed to estimate the optimal pack size (only when packing) |
+| `split_across_pack` | `false` | Allow a single sample to span two packs (only when packing) |
 
 **Validation:** `global_batch_size` must be divisible by `micro_batch_size × data_parallel_size`, where:
 
@@ -215,6 +228,9 @@ Example: 1 node, 2 GPUs, TP=1 → DP=2 → GBS must be a multiple of `2 × micro
 | `weight_decay` | `0.01` | L2-style regularization |
 | `adam_beta1` | `0.9` | Adam optimizer beta1 |
 | `adam_beta2` | `0.999` | Adam optimizer beta2 |
+| `adam_eps` | `1e-8` | Adam/AdamW epsilon for numerical stability |
+| `optimizer` | `adam` | `adam` (`torch.optim.Adam`, L2-style weight decay) \| `adamw` (`torch.optim.AdamW`, decoupled weight decay) |
+| `lr_decay_style` | `cosine` | `cosine` \| `linear` \| `constant` |
 | `warmup_steps` | `0` | Linear warmup; try ~10% of total steps for long runs |
 
 ### `parallelism`
@@ -403,7 +419,9 @@ Full template (every section, defaults inline):
     "load_in_4bit": true,
     "load_in_8bit": false,
     "dtype": "auto",
-    "trust_remote_code": false
+    "trust_remote_code": false,
+    "device_map": null,
+    "rope_scaling": null
   },
   "dataset": {
     "path": "default/<dataset-fileset>",
@@ -422,7 +440,13 @@ Full template (every section, defaults inline):
       "target_modules": ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
       "bias": "none",
       "use_rslora": false,
-      "random_state": 3407
+      "random_state": 3407,
+      "use_dora": false,
+      "loftq_config": null,
+      "modules_to_save": null,
+      "layers_to_transform": null,
+      "layer_replication": null,
+      "init_lora_weights": true
     },
     "use_gradient_checkpointing": "unsloth"
   },
@@ -432,6 +456,7 @@ Full template (every section, defaults inline):
     "warmup_steps": 0,
     "warmup_ratio": null,
     "lr_scheduler_type": "linear",
+    "lr_scheduler_kwargs": null,
     "logging_steps": 1,
     "save_steps": null,
     "eval_steps": null,
@@ -444,7 +469,13 @@ Full template (every section, defaults inline):
   "optimizer": {
     "learning_rate": 5e-5,
     "weight_decay": 0.0,
-    "optim": "adamw_8bit"
+    "optim": "adamw_8bit",
+    "adam_beta1": 0.9,
+    "adam_beta2": 0.999,
+    "adam_epsilon": 1e-8,
+    "max_grad_norm": 1.0,
+    "label_smoothing_factor": 0.0,
+    "neftune_noise_alpha": null
   },
   "hardware": {
     "gpus": "0",
@@ -474,6 +505,7 @@ Full template (every section, defaults inline):
 | `dtype` | `"auto"` | One of `"auto"`, `"bfloat16"`, `"float16"`, `"float32"`. |
 | `trust_remote_code` | `false` | HF `trust_remote_code` flag for custom model code (required by some hybrid Mamba/MoE models, e.g. Nemotron-H). |
 | `device_map` | `null` | Placement for `FastLanguageModel.from_pretrained`. `null` pins the whole model to the single visible GPU (`{"": 0}`) — the right default for this single-GPU backend. Leave unset unless experimenting; `"auto"`/`"balanced"`/`"sequential"` can spill layers to CPU on unified-memory hosts (GB10 / DGX Spark) and abort 4-bit loads. |
+| `rope_scaling` | `null` | RoPE scaling for long-context extension, e.g. `{"type": "linear", "factor": 2.0}`. `null` uses the model's native context length. |
 
 **Mutex:** `load_in_4bit` xor `load_in_8bit`. Both quantization flags are also **incompatible with `training.finetuning_type: "all_weights"`** — full SFT must use a non-quantized base.
 
@@ -511,6 +543,12 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
 | `bias` | `"none"` | `"none"` / `"all"` / `"lora_only"`. |
 | `use_rslora` | `false` | Rank-stabilized LoRA. |
 | `random_state` | `3407` | Reproducibility seed for the LoRA init. |
+| `use_dora` | `false` | DoRA (weight-decomposed LoRA). Better quality at low ranks; adds overhead. |
+| `loftq_config` | `null` | LoftQ init config for quantized bases. `null` disables. |
+| `modules_to_save` | `null` | Extra non-LoRA modules trained & saved in full, e.g. `["embed_tokens","lm_head"]` (vocab changes / continued pretraining). |
+| `layers_to_transform` | `null` | Restrict LoRA to specific layer index(es). `null` = all layers. |
+| `layer_replication` | `null` | Layer-replication ranges for stacking, e.g. `[[0,16],[8,24]]`. |
+| `init_lora_weights` | `true` | Init scheme. `true` = PEFT default; `"gaussian"`/`"pissa"`/`"olora"`/`"loftq"` for advanced inits. |
 
 `lora` is auto-filled with these defaults when `finetuning_type: "lora"` and the user omits the block. Must be `null` / omitted when `finetuning_type: "all_weights"`.
 
@@ -523,6 +561,7 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
 | `warmup_steps` | `0` | Linear warmup. Mutex with `warmup_ratio`. |
 | `warmup_ratio` | `null` | Fractional warmup over total steps. Mutex with `warmup_steps`. |
 | `lr_scheduler_type` | `"linear"` | `"linear"`, `"cosine"`, `"constant"`, `"constant_with_warmup"`, `"cosine_with_restarts"`. |
+| `lr_scheduler_kwargs` | `null` | Extra scheduler kwargs, e.g. `{"num_cycles": 3}` for `cosine_with_restarts`. `null` uses defaults. |
 | `logging_steps` | `1` | Loss-log cadence. |
 | `save_steps` | `null` | If set, save checkpoint every N steps. |
 | `eval_steps` | `null` | If set with `validation_path`, eval every N steps. When `null` and `validation_path` is set, the training driver defaults to **one validation pass per effective epoch** at `max(1, effective_steps - 1)` (same effective-step cap as automodel's default `val_check_interval`). |
@@ -546,6 +585,12 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
 | `learning_rate` | `2e-4` (schema default; skill uses `5e-5` for LoRA SFT) | See LR table below. |
 | `weight_decay` | `0.0` | L2-style regularization. |
 | `optim` | `"adamw_8bit"` | `"adamw_torch"`, `"adamw_torch_fused"` (Hopper+), `"adamw_8bit"`, `"paged_adamw_8bit"`, `"sgd"`. `adamw_8bit` has the smallest optimizer state and is Unsloth's notebook default. |
+| `adam_beta1` | `0.9` | Adam/AdamW beta1. |
+| `adam_beta2` | `0.999` | Adam/AdamW beta2. |
+| `adam_epsilon` | `1e-8` | Adam/AdamW epsilon. |
+| `max_grad_norm` | `1.0` | Gradient-clipping max norm (TRL default). |
+| `label_smoothing_factor` | `0.0` | Label smoothing for the CE loss. `0.0` disables. |
+| `neftune_noise_alpha` | `null` | NEFTune embedding-noise alpha (quality boost). `null` disables. |
 
 `warmup_steps` is on `schedule`, not on `optimizer` (different from the automodel schema).
 

@@ -38,6 +38,8 @@ def _build_peft(training: dict[str, Any]) -> LoRAParams | None:
         dropout=lora.get("dropout", 0.0),
         merge=ft == "lora_merged" or lora.get("merge", False),
         target_modules=lora.get("target_modules"),
+        exclude_modules=lora.get("exclude_modules"),
+        use_triton=lora.get("use_triton", True),
     )
 
 
@@ -55,15 +57,21 @@ def _build_training_block(spec: dict[str, Any]) -> SFTTraining | DistillationTra
         "weight_decay": optimizer.get("weight_decay", 0.01),
         "adam_beta1": optimizer.get("adam_beta1", 0.9),
         "adam_beta2": optimizer.get("adam_beta2", 0.999),
+        "adam_eps": optimizer.get("adam_eps", 1e-8),
+        "optimizer": optimizer.get("optimizer", "adam"),
+        "lr_decay_style": optimizer.get("lr_decay_style", "cosine"),
         "warmup_steps": optimizer.get("warmup_steps", 0),
         "epochs": schedule.get("epochs", 1),
         "max_steps": schedule.get("max_steps"),
         "val_check_interval": schedule.get("val_check_interval"),
         "batch_size": batch.get("global_batch_size", 8),
         "micro_batch_size": batch.get("micro_batch_size", 1),
         "sequence_packing": batch.get("sequence_packing", False),
+        "sequence_packing_max_samples": batch.get("sequence_packing_max_samples", 1000),
+        "split_across_pack": batch.get("split_across_pack", False),
         "max_seq_length": training.get("max_seq_length", 2048),
         "precision": training.get("precision"),
+        "attn_implementation": training.get("attn_implementation", "sdpa"),
         "seed": schedule.get("seed"),
         "parallelism": ParallelismParams(
             num_nodes=parallelism.get("num_nodes", 1),

@@ -88,6 +88,14 @@ class LoRAParams(_PEFTParams):
         description="Module name patterns to apply LoRA to (e.g., ['*.q_proj', '*.v_proj']). "
         "If not set, applies to all '*proj' linear layers.",
     )
+    exclude_modules: Optional[list[str]] = Field(
+        default=None,
+        description="Module name patterns to exclude from LoRA (e.g., ['*.out_proj']).",
+    )
+    use_triton: bool = Field(
+        default=True,
+        description="Use the optimized Triton LoRA kernel.",
+    )
     merge: bool = Field(
         default=False,
         description="Merge LoRA weights into base model after training. "
@@ -173,12 +181,20 @@ class _TrainingBase(BaseModel):
         default=0.999,
         description="Adam beta2 parameter. Adjust for optimizer tuning.",
     )
+    adam_eps: float = Field(
+        default=1e-8,
+        gt=0.0,
+        description="Adam/AdamW epsilon for numerical stability.",
+    )
     warmup_steps: int = Field(
         default=0,
         ge=0,
         description="Linear warmup steps. Recommended: 10% of total training steps for stable training.",
     )
-    optimizer: Optional[str] = Field(default=None, description="Optimizer name (e.g., 'adamw').")
+    optimizer: Literal["adam", "adamw"] = Field(default="adam", description="Optimizer algorithm.")
+    lr_decay_style: Literal["cosine", "linear", "constant"] = Field(
+        default="cosine", description="Learning-rate decay schedule."
+    )
 
     # --- Schedule ---
     epochs: int = Field(
@@ -218,6 +234,15 @@ class _TrainingBase(BaseModel):
         default=False,
         description="Enable sequence packing for efficiency. Can improve training speed.",
     )
+    sequence_packing_max_samples: int = Field(
+        default=1000,
+        gt=0,
+        description="Samples analyzed to estimate the optimal pack size when sequence packing is enabled.",
+    )
+    split_across_pack: bool = Field(
+        default=False,
+        description="Allow a single sample to span two packs when sequence packing.",
+    )
 
     # --- Model ---
     max_seq_length: int = Field(
@@ -229,6 +254,10 @@ class _TrainingBase(BaseModel):
         default=None,
         description="Model precision for training. Auto-detected if unset.",
     )
+    attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] = Field(
+        default="sdpa",
+        description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2', or 'eager'.",
+    )
     seed: Optional[int] = Field(
         default=None,
         description="Random seed for reproducibility. Optional.",

@@ -157,10 +157,15 @@ def compile_training_step(
             global_batch_size=training.batch_size,
             micro_batch_size=training.micro_batch_size,
             sequence_packing=training.sequence_packing,
+            sequence_packing_max_samples=training.sequence_packing_max_samples,
+            split_across_pack=training.split_across_pack,
         ),
         optimizer=TrainingStepConfig.OptimizerConfig(
+            optimizer_name=training.optimizer,
+            lr_decay_style=training.lr_decay_style,
             learning_rate=training.learning_rate,
             min_learning_rate=training.min_learning_rate,
+            eps=training.adam_eps,
             weight_decay=training.weight_decay,
             beta1=training.adam_beta1,
             beta2=training.adam_beta2,
@@ -241,6 +246,7 @@ def _translate_model_config(
         name=_extract_model_name(job_spec),
         max_seq_length=training.max_seq_length,
         precision=training.precision,
+        attn_implementation=training.attn_implementation,
         trust_remote_code=trust_remote_code,
         is_embedding_model=is_embedding_model,
         chat_template=chat_template,
@@ -293,7 +299,8 @@ def _translate_lora_config(api_lora: LoRAParams, me: ModelEntity) -> LoRAConfig:
         alpha=api_lora.alpha,
         dropout=api_lora.dropout,
         target_modules=api_lora.target_modules,
-        use_triton=True,
+        exclude_modules=api_lora.exclude_modules,
+        use_triton=api_lora.use_triton,
     )
 
     if not lora.target_modules:

@@ -183,9 +183,12 @@ class BatchConfig(BaseModel):
         micro_batch_size: int = Field(default=1, gt=0)
         sequence_packing: bool = False
         sequence_packing_max_samples: int = 1000
+        split_across_pack: bool = False
 
     class OptimizerConfig(BaseModel):
         optimizer_type: Optional[OptimizerType] = Field(default=None)
+        optimizer_name: str = "adam"
+        lr_decay_style: str = "cosine"
         learning_rate: float = 1e-4
         min_learning_rate: Optional[float] = None
         eps: float = 1e-8

@@ -254,16 +254,18 @@ def compile_automodel_config(
             )
 
     # === Optimizer ===
+    # Map the optimizer choice to its torch class; unrecognized names fall back to Adam.
+    optimizer_targets = {"adam": "torch.optim.Adam", "adamw": "torch.optim.AdamW"}
     cfg["optimizer"] = {
-        "_target_": "torch.optim.Adam",
+        "_target_": optimizer_targets.get(customizer_config.optimizer.optimizer_name, "torch.optim.Adam"),
         "lr": customizer_config.optimizer.learning_rate,
         "weight_decay": customizer_config.optimizer.weight_decay,
         "betas": [customizer_config.optimizer.beta1, customizer_config.optimizer.beta2],
         "eps": customizer_config.optimizer.eps,  # Adam epsilon for numerical stability
     }
 
     cfg["lr_scheduler"] = {
-        "lr_decay_style": "cosine",
+        "lr_decay_style": customizer_config.optimizer.lr_decay_style,
         "lr_warmup_steps": customizer_config.optimizer.warmup_steps,
     }
     if customizer_config.optimizer.min_learning_rate:
@@ -310,7 +312,7 @@ def compile_automodel_config(
 
             cfg["packed_sequence"] = {
                 "packed_sequence_size": optimal_pack_size,
-                "split_across_pack": False,
+                "split_across_pack": customizer_config.batch.split_across_pack,
             }
 
             # Use pack size as the effective sequence length for datasets
@@ -356,9 +358,8 @@ def compile_automodel_config(
             "use_triton": lora.use_triton,
             "target_modules": lora.target_modules,
         }
-        # TODO: Support exclude_modules via the API
-        # if lora.exclude_modules:
-        #     peft_cfg["exclude_modules"] = lora.exclude_modules
+        if lora.exclude_modules:
+            peft_cfg["exclude_modules"] = lora.exclude_modules
         cfg["peft"] = peft_cfg
 
     # === Loss ===