diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py
index 3355e01c33..02133023bc 100644
--- a/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py
+++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py
@@ -38,6 +38,10 @@ class LoRAParams(BaseModel):
     dropout: float = Field(default=0.0, ge=0.0, le=1.0, description="LoRA dropout probability for regularization.")
     merge: bool = False
     target_modules: list[str] | None = None
+    exclude_modules: list[str] | None = Field(
+        default=None, description="Module name patterns to exclude from LoRA (e.g. ['*.out_proj'])."
+    )
+    use_triton: bool = Field(default=True, description="Use the optimized Triton LoRA kernel.")
 
 
 class DatasetSpec(BaseModel):
@@ -59,6 +63,10 @@ class TrainingSpec(BaseModel):
         default=None,
         description="Model precision for training. Auto-detected from the checkpoint when unset.",
     )
+    attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] = Field(
+        default="sdpa",
+        description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2', or 'eager'.",
+    )
     execution_profile: str | None = Field(default=None, min_length=1)
     teacher_model: str | None = None
     distillation_ratio: float = Field(default=0.5, ge=0.0, le=1.0)
@@ -90,6 +98,12 @@ class BatchSpec(BaseModel):
     global_batch_size: int = Field(default=8, gt=0)
     micro_batch_size: int = Field(default=1, gt=0)
     sequence_packing: bool = False
+    sequence_packing_max_samples: int = Field(
+        default=1000, gt=0, description="Samples analyzed to estimate the optimal pack size when packing is enabled."
+    )
+    split_across_pack: bool = Field(
+        default=False, description="Allow a single sample to span two packs when sequence packing."
+    )
 
 
 class OptimizerSpec(BaseModel):
@@ -103,6 +117,11 @@ class OptimizerSpec(BaseModel):
     adam_beta1: float = Field(default=0.9, ge=0.0, lt=1.0, description="Adam optimizer beta1.")
     adam_beta2: float = Field(default=0.999, ge=0.0, lt=1.0, description="Adam optimizer beta2.")
     warmup_steps: int = Field(default=0, ge=0)
+    adam_eps: float = Field(default=1e-8, gt=0.0, description="Adam/AdamW epsilon for numerical stability.")
+    optimizer: Literal["adam", "adamw"] = Field(default="adam", description="Optimizer algorithm.")
+    lr_decay_style: Literal["cosine", "linear", "constant"] = Field(
+        default="cosine", description="Learning-rate decay schedule."
+    )
 
 
 class ParallelismSpec(BaseModel):
diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md
index 2b6e5e0495..a04a5e000b 100644
--- a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md
+++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md
@@ -117,10 +117,13 @@ Full template:
       "alpha": 32,
       "dropout": 0.0,
       "merge": false,
-      "target_modules": null
+      "target_modules": null,
+      "exclude_modules": null,
+      "use_triton": true
     },
     "max_seq_length": 2048,
     "precision": null,
+    "attn_implementation": "sdpa",
     "execution_profile": null
   },
   "schedule": {
@@ -132,7 +135,9 @@ Full template:
   "batch": {
     "global_batch_size": 4,
     "micro_batch_size": 1,
-    "sequence_packing": false
+    "sequence_packing": false,
+    "sequence_packing_max_samples": 1000,
+    "split_across_pack": false
   },
   "optimizer": {
     "learning_rate": 5e-5,
@@ -140,6 +145,9 @@ Full template:
     "weight_decay": 0.01,
     "adam_beta1": 0.9,
     "adam_beta2": 0.999,
+    "adam_eps": 1e-8,
+    "optimizer": "adam",
+    "lr_decay_style": "cosine",
     "warmup_steps": 0
   },
   "parallelism": {
@@ -171,8 +179,11 @@ Full template:
 | `lora.dropout` | `0.0` | LoRA dropout (0.0–1.0) for regularization |
 | `lora.merge` | `false` | If true with `lora_merged`, output is full weights not adapter |
 | `lora.target_modules` | `null` | e.g. `["q_proj","v_proj"]`; null = platform default targets |
+| `lora.exclude_modules` | `null` | Module name patterns to exclude from LoRA, e.g. `["*.out_proj"]`; null = no exclusions |
+| `lora.use_triton` | `true` | Use the optimized Triton LoRA kernel |
 | `max_seq_length` | `2048` | Truncate/pack to this length; lower if OOM |
 | `precision` | `null` | `bf16` \| `fp16` \| `fp32` \| `fp8`; null auto-detects from the checkpoint |
+| `attn_implementation` | `sdpa` | `sdpa` (PyTorch native) \| `flash_attention_2` \| `eager` |
 | `teacher_model` | — | **Model entity ref** (not HF id). Required for distillation; see below |
 | `distillation_ratio` | `0.5` | KD blend (0–1) |
 | `distillation_temperature` | `1.0` | KD temperature |
@@ -199,6 +210,8 @@ LoRA block is auto-created when `finetuning_type` is `lora` or `lora_merged`.
 | `global_batch_size` | `8` (schema) | Effective batch across all GPUs; **≥48 GB LoRA tables → `SKILL.md`** |
 | `micro_batch_size` | `1` (schema) | **Per GPU**; same SKILL tables for single- and multi-GPU (TP=1) |
 | `sequence_packing` | `false` | Pack short sequences for throughput (needs compatible data) |
+| `sequence_packing_max_samples` | `1000` | Samples analyzed to estimate the optimal pack size (only used when `sequence_packing` is true) |
+| `split_across_pack` | `false` | Allow a single sample to span two packs (only used when `sequence_packing` is true) |
 
 **Validation:** `global_batch_size` must be divisible by `micro_batch_size × data_parallel_size`, where:
 
@@ -215,6 +228,9 @@ Example: 1 node, 2 GPUs, TP=1 → DP=2 → GBS must be a multiple of `2 × micro
 | `weight_decay` | `0.01` | L2-style regularization |
 | `adam_beta1` | `0.9` | Adam optimizer beta1 |
 | `adam_beta2` | `0.999` | Adam optimizer beta2 |
+| `adam_eps` | `1e-8` | Adam/AdamW epsilon for numerical stability |
+| `optimizer` | `adam` | `adam` (`torch.optim.Adam`) \| `adamw` (`torch.optim.AdamW`) |
+| `lr_decay_style` | `cosine` | `cosine` \| `linear` \| `constant` |
 | `warmup_steps` | `0` | Linear warmup; try ~10% of total steps for long runs |
 
 ### `parallelism`
@@ -403,7 +419,9 @@ Full template (every section, defaults inline):
     "load_in_4bit": true,
     "load_in_8bit": false,
     "dtype": "auto",
-    "trust_remote_code": false
+    "trust_remote_code": false,
+    "device_map": null,
+    "rope_scaling": null
   },
   "dataset": {
     "path": "default/<dataset-fileset>",
@@ -422,7 +440,13 @@ Full template (every section, defaults inline):
       "target_modules": ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
       "bias": "none",
       "use_rslora": false,
-      "random_state": 3407
+      "random_state": 3407,
+      "use_dora": false,
+      "loftq_config": null,
+      "modules_to_save": null,
+      "layers_to_transform": null,
+      "layer_replication": null,
+      "init_lora_weights": true
     },
     "use_gradient_checkpointing": "unsloth"
   },
@@ -432,6 +456,7 @@ Full template (every section, defaults inline):
     "warmup_steps": 0,
     "warmup_ratio": null,
     "lr_scheduler_type": "linear",
+    "lr_scheduler_kwargs": null,
     "logging_steps": 1,
     "save_steps": null,
     "eval_steps": null,
@@ -444,7 +469,13 @@ Full template (every section, defaults inline):
   "optimizer": {
     "learning_rate": 5e-5,
     "weight_decay": 0.0,
-    "optim": "adamw_8bit"
+    "optim": "adamw_8bit",
+    "adam_beta1": 0.9,
+    "adam_beta2": 0.999,
+    "adam_epsilon": 1e-8,
+    "max_grad_norm": 1.0,
+    "label_smoothing_factor": 0.0,
+    "neftune_noise_alpha": null
   },
   "hardware": {
     "gpus": "0",
@@ -474,6 +505,7 @@ Full template (every section, defaults inline):
 | `dtype` | `"auto"` | One of `"auto"`, `"bfloat16"`, `"float16"`, `"float32"`. |
 | `trust_remote_code` | `false` | HF `trust_remote_code` flag for custom model code (required by some hybrid Mamba/MoE models, e.g. Nemotron-H). |
 | `device_map` | `null` | Placement for `FastLanguageModel.from_pretrained`. `null` pins the whole model to the single visible GPU (`{"": 0}`) — the right default for this single-GPU backend. Leave unset unless experimenting; `"auto"`/`"balanced"`/`"sequential"` can spill layers to CPU on unified-memory hosts (GB10 / DGX Spark) and abort 4-bit loads. |
+| `rope_scaling` | `null` | RoPE scaling for long-context extension, e.g. `{"type": "linear", "factor": 2.0}`. `null` uses the model's native context length. |
 
 **Mutex:** `load_in_4bit` xor `load_in_8bit`. Both quantization flags are also **incompatible with `training.finetuning_type: "all_weights"`** — full SFT must use a non-quantized base.
 
@@ -511,6 +543,12 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
 | `bias` | `"none"` | `"none"` / `"all"` / `"lora_only"`. |
 | `use_rslora` | `false` | Rank-stabilized LoRA. |
 | `random_state` | `3407` | Reproducibility seed for the LoRA init. |
+| `use_dora` | `false` | DoRA (weight-decomposed LoRA). Better quality at low ranks; adds overhead. |
+| `loftq_config` | `null` | LoftQ init config for quantized bases. `null` disables. |
+| `modules_to_save` | `null` | Extra non-LoRA modules trained & saved in full, e.g. `["embed_tokens","lm_head"]` (vocab changes / continued pretraining). |
+| `layers_to_transform` | `null` | Restrict LoRA to specific layer index(es): an int or a list of ints. `null` = all layers. |
+| `layer_replication` | `null` | Layer-replication ranges for stacking, e.g. `[[0,16],[8,24]]`. |
+| `init_lora_weights` | `true` | Init scheme. `true` = PEFT default; `"gaussian"`/`"pissa"`/`"olora"`/`"loftq"` for advanced inits. |
 
 `lora` is auto-filled with these defaults when `finetuning_type: "lora"` and the user omits the block. Must be `null` / omitted when `finetuning_type: "all_weights"`.
 
@@ -523,6 +561,7 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
 | `warmup_steps` | `0` | Linear warmup. Mutex with `warmup_ratio`. |
 | `warmup_ratio` | `null` | Fractional warmup over total steps. Mutex with `warmup_steps`. |
 | `lr_scheduler_type` | `"linear"` | `"linear"`, `"cosine"`, `"constant"`, `"constant_with_warmup"`, `"cosine_with_restarts"`. |
+| `lr_scheduler_kwargs` | `null` | Extra scheduler kwargs, e.g. `{"num_cycles": 3}` for `cosine_with_restarts`. `null` uses defaults. |
 | `logging_steps` | `1` | Loss-log cadence. |
 | `save_steps` | `null` | If set, save checkpoint every N steps. |
 | `eval_steps` | `null` | If set with `validation_path`, eval every N steps. When `null` and `validation_path` is set, the training driver defaults to **one validation pass per effective epoch** at `max(1, effective_steps - 1)` (same effective-step cap as automodel's default `val_check_interval`). |
@@ -546,6 +585,12 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
 | `learning_rate` | `2e-4` (schema default; skill uses `5e-5` for LoRA SFT) | See LR table below. |
 | `weight_decay` | `0.0` | L2-style regularization. |
 | `optim` | `"adamw_8bit"` | `"adamw_torch"`, `"adamw_torch_fused"` (Hopper+), `"adamw_8bit"`, `"paged_adamw_8bit"`, `"sgd"`. `adamw_8bit` has the smallest optimizer state and is Unsloth's notebook default. |
+| `adam_beta1` | `0.9` | Adam/AdamW beta1. |
+| `adam_beta2` | `0.999` | Adam/AdamW beta2. |
+| `adam_epsilon` | `1e-8` | Adam/AdamW epsilon. |
+| `max_grad_norm` | `1.0` | Gradient-clipping max norm (TRL default). |
+| `label_smoothing_factor` | `0.0` | Label smoothing for the CE loss. `0.0` disables. |
+| `neftune_noise_alpha` | `null` | NEFTune embedding-noise alpha (quality boost). `null` disables. |
 
 `warmup_steps` is on `schedule`, not on `optimizer` (different from the automodel schema).
 
diff --git a/services/automodel/src/nmp/automodel/adapter.py b/services/automodel/src/nmp/automodel/adapter.py
index fd16abc676..fda1d69d75 100644
--- a/services/automodel/src/nmp/automodel/adapter.py
+++ b/services/automodel/src/nmp/automodel/adapter.py
@@ -38,6 +38,8 @@ def _build_peft(training: dict[str, Any]) -> LoRAParams | None:
         dropout=lora.get("dropout", 0.0),
         merge=ft == "lora_merged" or lora.get("merge", False),
         target_modules=lora.get("target_modules"),
+        exclude_modules=lora.get("exclude_modules"),
+        use_triton=lora.get("use_triton", True),
     )
 
 
@@ -55,6 +57,9 @@ def _build_training_block(spec: dict[str, Any]) -> SFTTraining | DistillationTra
         "weight_decay": optimizer.get("weight_decay", 0.01),
         "adam_beta1": optimizer.get("adam_beta1", 0.9),
         "adam_beta2": optimizer.get("adam_beta2", 0.999),
+        "adam_eps": optimizer.get("adam_eps", 1e-8),
+        "optimizer": optimizer.get("optimizer", "adam"),
+        "lr_decay_style": optimizer.get("lr_decay_style", "cosine"),
         "warmup_steps": optimizer.get("warmup_steps", 0),
         "epochs": schedule.get("epochs", 1),
         "max_steps": schedule.get("max_steps"),
@@ -62,8 +67,11 @@ def _build_training_block(spec: dict[str, Any]) -> SFTTraining | DistillationTra
         "batch_size": batch.get("global_batch_size", 8),
         "micro_batch_size": batch.get("micro_batch_size", 1),
         "sequence_packing": batch.get("sequence_packing", False),
+        "sequence_packing_max_samples": batch.get("sequence_packing_max_samples", 1000),
+        "split_across_pack": batch.get("split_across_pack", False),
         "max_seq_length": training.get("max_seq_length", 2048),
         "precision": training.get("precision"),
+        "attn_implementation": training.get("attn_implementation", "sdpa"),
         "seed": schedule.get("seed"),
         "parallelism": ParallelismParams(
             num_nodes=parallelism.get("num_nodes", 1),
diff --git a/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py b/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py
index 620781ba8b..517e38be4f 100644
--- a/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py
+++ b/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py
@@ -88,6 +88,14 @@ class LoRAParams(_PEFTParams):
         description="Module name patterns to apply LoRA to (e.g., ['*.q_proj', '*.v_proj']). "
         "If not set, applies to all '*proj' linear layers.",
     )
+    exclude_modules: Optional[list[str]] = Field(
+        default=None,
+        description="Module name patterns to exclude from LoRA (e.g., ['*.out_proj']).",
+    )
+    use_triton: bool = Field(
+        default=True,
+        description="Use the optimized Triton LoRA kernel.",
+    )
     merge: bool = Field(
         default=False,
         description="Merge LoRA weights into base model after training. "
@@ -173,12 +181,20 @@ class _TrainingBase(BaseModel):
         default=0.999,
         description="Adam beta2 parameter. Adjust for optimizer tuning.",
     )
+    adam_eps: float = Field(
+        default=1e-8,
+        gt=0.0,
+        description="Adam/AdamW epsilon for numerical stability.",
+    )
     warmup_steps: int = Field(
         default=0,
         ge=0,
         description="Linear warmup steps. Recommended: 10% of total training steps for stable training.",
     )
-    optimizer: Optional[str] = Field(default=None, description="Optimizer name (e.g., 'adamw').")
+    optimizer: Literal["adam", "adamw"] = Field(default="adam", description="Optimizer algorithm.")
+    lr_decay_style: Literal["cosine", "linear", "constant"] = Field(
+        default="cosine", description="Learning-rate decay schedule."
+    )
 
     # --- Schedule ---
     epochs: int = Field(
@@ -218,6 +234,15 @@ class _TrainingBase(BaseModel):
         default=False,
         description="Enable sequence packing for efficiency. Can improve training speed.",
     )
+    sequence_packing_max_samples: int = Field(
+        default=1000,
+        gt=0,
+        description="Samples analyzed to estimate the optimal pack size when sequence packing is enabled.",
+    )
+    split_across_pack: bool = Field(
+        default=False,
+        description="Allow a single sample to span two packs when sequence packing.",
+    )
 
     # --- Model ---
     max_seq_length: int = Field(
@@ -229,6 +254,10 @@ class _TrainingBase(BaseModel):
         default=None,
         description="Model precision for training. Auto-detected if unset.",
     )
+    attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] = Field(
+        default="sdpa",
+        description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2', or 'eager'.",
+    )
     seed: Optional[int] = Field(
         default=None,
         description="Random seed for reproducibility. Optional.",
diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py
index 9eb1120cca..93169a7f84 100644
--- a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py
+++ b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py
@@ -157,10 +157,15 @@ def compile_training_step(
             global_batch_size=training.batch_size,
             micro_batch_size=training.micro_batch_size,
             sequence_packing=training.sequence_packing,
+            sequence_packing_max_samples=training.sequence_packing_max_samples,
+            split_across_pack=training.split_across_pack,
         ),
         optimizer=TrainingStepConfig.OptimizerConfig(
+            optimizer_name=training.optimizer,
+            lr_decay_style=training.lr_decay_style,
             learning_rate=training.learning_rate,
             min_learning_rate=training.min_learning_rate,
+            eps=training.adam_eps,
             weight_decay=training.weight_decay,
             beta1=training.adam_beta1,
             beta2=training.adam_beta2,
@@ -241,6 +246,7 @@ def _translate_model_config(
         name=_extract_model_name(job_spec),
         max_seq_length=training.max_seq_length,
         precision=training.precision,
+        attn_implementation=training.attn_implementation,
         trust_remote_code=trust_remote_code,
         is_embedding_model=is_embedding_model,
         chat_template=chat_template,
@@ -293,7 +299,8 @@ def _translate_lora_config(api_lora: LoRAParams, me: ModelEntity) -> LoRAConfig:
         alpha=api_lora.alpha,
         dropout=api_lora.dropout,
         target_modules=api_lora.target_modules,
-        use_triton=True,
+        exclude_modules=api_lora.exclude_modules,
+        use_triton=api_lora.use_triton,
     )
 
     if not lora.target_modules:
diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py b/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py
index 6add112c70..b686515231 100644
--- a/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py
+++ b/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py
@@ -183,9 +183,12 @@ class BatchConfig(BaseModel):
         micro_batch_size: int = Field(default=1, gt=0)
         sequence_packing: bool = False
         sequence_packing_max_samples: int = 1000
+        split_across_pack: bool = False
 
     class OptimizerConfig(BaseModel):
         optimizer_type: Optional[OptimizerType] = Field(default=None)
+        optimizer_name: str = "adam"
+        lr_decay_style: str = "cosine"
         learning_rate: float = 1e-4
         min_learning_rate: Optional[float] = None
         eps: float = 1e-8
diff --git a/services/automodel/src/nmp/automodel/tasks/training/backends/config.py b/services/automodel/src/nmp/automodel/tasks/training/backends/config.py
index 7b7852a99f..a1d1a4150d 100644
--- a/services/automodel/src/nmp/automodel/tasks/training/backends/config.py
+++ b/services/automodel/src/nmp/automodel/tasks/training/backends/config.py
@@ -254,8 +254,10 @@ def compile_automodel_config(
             )
 
     # === Optimizer ===
+    # Map the optimizer name to its torch class; unrecognized names fall back to Adam.
+    optimizer_targets = {"adam": "torch.optim.Adam", "adamw": "torch.optim.AdamW"}
     cfg["optimizer"] = {
-        "_target_": "torch.optim.Adam",
+        "_target_": optimizer_targets.get(customizer_config.optimizer.optimizer_name, "torch.optim.Adam"),
         "lr": customizer_config.optimizer.learning_rate,
         "weight_decay": customizer_config.optimizer.weight_decay,
         "betas": [customizer_config.optimizer.beta1, customizer_config.optimizer.beta2],
@@ -263,7 +265,7 @@ def compile_automodel_config(
     }
 
     cfg["lr_scheduler"] = {
-        "lr_decay_style": "cosine",
+        "lr_decay_style": customizer_config.optimizer.lr_decay_style,
         "lr_warmup_steps": customizer_config.optimizer.warmup_steps,
     }
     if customizer_config.optimizer.min_learning_rate:
@@ -310,7 +312,7 @@ def compile_automodel_config(
 
             cfg["packed_sequence"] = {
                 "packed_sequence_size": optimal_pack_size,
-                "split_across_pack": False,
+                "split_across_pack": customizer_config.batch.split_across_pack,
             }
 
             # Use pack size as the effective sequence length for datasets
@@ -356,9 +358,8 @@ def compile_automodel_config(
             "use_triton": lora.use_triton,
             "target_modules": lora.target_modules,
         }
-        # TODO: Support exclude_modules via the API
-        # if lora.exclude_modules:
-        #     peft_cfg["exclude_modules"] = lora.exclude_modules
+        if lora.exclude_modules:
+            peft_cfg["exclude_modules"] = lora.exclude_modules
         cfg["peft"] = peft_cfg
 
     # === Loss ===
diff --git a/services/automodel/tests/test_adapter.py b/services/automodel/tests/test_adapter.py
index 7cc6becc5f..603b978530 100644
--- a/services/automodel/tests/test_adapter.py
+++ b/services/automodel/tests/test_adapter.py
@@ -72,6 +72,57 @@ def test_adapter_new_fields_default_when_omitted() -> None:
     assert spec.training.peft.dropout == 0.0
 
 
+def test_adapter_plumbs_pass2_hyperparameters() -> None:
+    """Pass-2 fields (optimizer, attention, packing, LoRA) set on the plugin spec must reach the v2 SFTTraining."""
+    spec = automodel_spec_to_compiler_output(
+        {
+            "model": "meta/llama",
+            "dataset": {"training": "default/train"},
+            "training": {
+                "training_type": "sft",
+                "finetuning_type": "lora",
+                "attn_implementation": "flash_attention_2",
+                "lora": {"rank": 8, "exclude_modules": ["*.out_proj"], "use_triton": False},
+            },
+            "optimizer": {"adam_eps": 1e-6, "optimizer": "adamw", "lr_decay_style": "linear"},
+            "batch": {"sequence_packing": True, "sequence_packing_max_samples": 256, "split_across_pack": True},
+            "output": {"name": "out", "type": "adapter", "fileset": "out-fs"},
+        },
+    )
+    assert isinstance(spec.training, SFTTraining)
+    assert spec.training.adam_eps == 1e-6
+    assert spec.training.optimizer == "adamw"
+    assert spec.training.lr_decay_style == "linear"
+    assert spec.training.attn_implementation == "flash_attention_2"
+    assert spec.training.sequence_packing_max_samples == 256
+    assert spec.training.split_across_pack is True
+    assert spec.training.peft is not None  # peft is optional; narrow before the attribute checks
+    assert spec.training.peft.exclude_modules == ["*.out_proj"]
+    assert spec.training.peft.use_triton is False
+
+
+def test_adapter_pass2_defaults_when_omitted() -> None:
+    """Omitted pass-2 fields keep the old hardcoded defaults (Adam, cosine decay, Triton LoRA, no pack splitting)."""
+    spec = automodel_spec_to_compiler_output(
+        {
+            "model": "meta/llama",
+            "dataset": {"training": "default/train"},
+            "training": {"training_type": "sft", "finetuning_type": "lora"},
+            "output": {"name": "out", "type": "adapter", "fileset": "out-fs"},
+        },
+    )
+    assert isinstance(spec.training, SFTTraining)
+    assert spec.training.adam_eps == 1e-8
+    assert spec.training.optimizer == "adam"
+    assert spec.training.lr_decay_style == "cosine"
+    assert spec.training.attn_implementation == "sdpa"
+    assert spec.training.sequence_packing_max_samples == 1000
+    assert spec.training.split_across_pack is False
+    assert spec.training.peft is not None  # the LoRA block is built even though "lora" was omitted
+    assert spec.training.peft.exclude_modules is None
+    assert spec.training.peft.use_triton is True
+
+
 def test_adapter_distillation() -> None:
     spec = automodel_spec_to_compiler_output(
         {
diff --git a/services/automodel/tests/test_compiler.py b/services/automodel/tests/test_compiler.py
index f0d4f0b4b0..fbdf5b09f6 100644
--- a/services/automodel/tests/test_compiler.py
+++ b/services/automodel/tests/test_compiler.py
@@ -70,6 +70,42 @@ def test_build_file_download_config_rejects_missing_model_fileset() -> None:
         _build_file_download_config(_make_job_output(), _make_mock_model_entity(fileset=None))
 
 
+def test_compile_training_step_carries_pass2_fields() -> None:
+    """Pass-2 fields on the v2 SFTTraining reach the step config (optimizer -> optimizer_name, adam_eps -> eps)."""
+    from nmp.automodel.app.jobs.training.compiler import compile_training_step
+
+    job_output = CustomizationJobOutput(
+        model="default/test-target",
+        dataset="default/my-dataset",
+        training=SFTTraining(
+            peft=LoRAParams(rank=8, alpha=32, merge=False, exclude_modules=["*.out_proj"], use_triton=False),
+            learning_rate=1e-4,
+            adam_eps=1e-6,
+            optimizer="adamw",
+            lr_decay_style="linear",
+            attn_implementation="flash_attention_2",
+            batch_size=4,
+            micro_batch_size=1,
+            sequence_packing=True,
+            sequence_packing_max_samples=256,
+            split_across_pack=True,
+            max_seq_length=2048,
+        ),
+        output=OutputResponse(name="out", type="adapter", fileset="out-fs"),
+    )
+    step = compile_training_step(job_output, base_env=[], me=_make_mock_model_entity())
+    cfg = step.config if hasattr(step, "config") else step["config"]  # cfg is indexed as a nested dict below
+
+    assert cfg["optimizer"]["optimizer_name"] == "adamw"
+    assert cfg["optimizer"]["lr_decay_style"] == "linear"
+    assert cfg["optimizer"]["eps"] == 1e-6
+    assert cfg["model"]["attn_implementation"] == "flash_attention_2"
+    assert cfg["batch"]["sequence_packing_max_samples"] == 256
+    assert cfg["batch"]["split_across_pack"] is True
+    assert cfg["training"]["lora"]["exclude_modules"] == ["*.out_proj"]
+    assert cfg["training"]["lora"]["use_triton"] is False
+
+
 @pytest.mark.asyncio
 async def test_platform_job_config_compiler_sft_lora(mock_sdk, monkeypatch):
     monkeypatch.setattr(
diff --git a/services/unsloth/src/nmp/unsloth/schemas.py b/services/unsloth/src/nmp/unsloth/schemas.py
index 413be132fa..6ecc5d79cc 100644
--- a/services/unsloth/src/nmp/unsloth/schemas.py
+++ b/services/unsloth/src/nmp/unsloth/schemas.py
@@ -29,7 +29,7 @@
 
 from __future__ import annotations
 
-from typing import Literal
+from typing import Any, Literal
 
 from nemo_platform_plugin.integrations import IntegrationsSpec
 from pydantic import BaseModel, ConfigDict, Field
@@ -74,6 +74,14 @@ class ModelLoadSpec(BaseModel):
             "multi-device experiments."
         ),
     )
+    rope_scaling: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "RoPE scaling config for long-context extension, passed to "
+            "FastLanguageModel.from_pretrained (e.g. {'type': 'linear', 'factor': 2.0}). "
+            "None uses the model's native context length."
+        ),
+    )
 
 
 class LoRAParams(BaseModel):
@@ -99,6 +107,33 @@ class LoRAParams(BaseModel):
     bias: Literal["none", "all", "lora_only"] = "none"
     use_rslora: bool = False
     random_state: int = 3407
+    use_dora: bool = Field(
+        default=False,
+        description="DoRA (weight-decomposed LoRA). Improves quality at low ranks; adds training overhead.",
+    )
+    loftq_config: dict[str, Any] | None = Field(
+        default=None,
+        description="LoftQ initialization config for quantized bases. None disables LoftQ.",
+    )
+    modules_to_save: list[str] | None = Field(
+        default=None,
+        description=(
+            "Extra non-LoRA modules to train and save in full (e.g. ['embed_tokens', 'lm_head']). "
+            "Needed for vocab changes / continued pretraining."
+        ),
+    )
+    layers_to_transform: int | list[int] | None = Field(
+        default=None,
+        description="Restrict LoRA to specific layer index(es). None applies to all layers.",
+    )
+    layer_replication: list[list[int]] | None = Field(
+        default=None,
+        description="Layer-replication ranges for stacking, e.g. [[0, 16], [8, 24]]. None disables.",
+    )
+    init_lora_weights: bool | Literal["gaussian", "pissa", "olora", "loftq"] = Field(
+        default=True,
+        description="LoRA weight init scheme. True = PEFT default; 'pissa'/'olora'/'loftq' for advanced inits.",
+    )
 
 
 class TrainingSpec(BaseModel):
@@ -170,6 +205,13 @@ class ScheduleSpec(BaseModel):
     save_steps: int | None = Field(default=None, gt=0)
     eval_steps: int | None = Field(default=None, gt=0)
     seed: int = 3407
+    lr_scheduler_kwargs: dict[str, Any] | None = Field(
+        default=None,
+        description=(
+            "Extra kwargs for the LR scheduler, e.g. {'num_cycles': 3} for cosine_with_restarts. "
+            "None uses scheduler defaults."
+        ),
+    )
 
 
 class BatchSpec(BaseModel):
@@ -194,6 +236,18 @@ class OptimizerSpec(BaseModel):
         "paged_adamw_8bit",
         "sgd",
     ] = "adamw_8bit"
+    adam_beta1: float = Field(default=0.9, ge=0.0, lt=1.0, description="Adam/AdamW beta1.")
+    adam_beta2: float = Field(default=0.999, ge=0.0, lt=1.0, description="Adam/AdamW beta2.")
+    adam_epsilon: float = Field(default=1e-8, gt=0.0, description="Adam/AdamW epsilon for numerical stability.")
+    max_grad_norm: float = Field(default=1.0, ge=0.0, description="Gradient-clipping max norm (TRL default 1.0).")
+    label_smoothing_factor: float = Field(
+        default=0.0, ge=0.0, lt=1.0, description="Label smoothing for the cross-entropy loss. 0.0 disables."
+    )
+    neftune_noise_alpha: float | None = Field(
+        default=None,
+        ge=0.0,
+        description="NEFTune embedding-noise alpha (quality boost). None disables.",
+    )
 
 
 class HardwareSpec(BaseModel):
diff --git a/services/unsloth/src/nmp/unsloth/tasks/training/backends/unsloth_sft.py b/services/unsloth/src/nmp/unsloth/tasks/training/backends/unsloth_sft.py
index 2ca3532560..c26dfcd6fb 100644
--- a/services/unsloth/src/nmp/unsloth/tasks/training/backends/unsloth_sft.py
+++ b/services/unsloth/src/nmp/unsloth/tasks/training/backends/unsloth_sft.py
@@ -67,7 +67,7 @@ def build_model_load_kwargs(spec: UnslothJobOutput, resolved_model: str) -> dict
     takes its default LoRA-optimized load path and warns that full finetuning
     was not requested, leaving the all-weights run mis-configured.
     """
-    return {
+    kwargs: dict[str, Any] = {
         "model_name": resolved_model,
         "max_seq_length": spec.model.max_seq_length,
         "load_in_4bit": spec.model.load_in_4bit,
@@ -76,6 +76,45 @@ def build_model_load_kwargs(spec: UnslothJobOutput, resolved_model: str) -> dict
         "trust_remote_code": spec.model.trust_remote_code,
         "device_map": spec.model.device_map if spec.model.device_map is not None else {"": 0},
     }
+    # Only pass rope_scaling when set — None lets Unsloth use the model's native context length.
+    if spec.model.rope_scaling is not None:
+        kwargs["rope_scaling"] = spec.model.rope_scaling
+    return kwargs
+
+
+def build_peft_kwargs(spec: UnslothJobOutput, *, gradient_checkpointing: bool | str) -> dict[str, Any]:
+    """Assemble ``FastLanguageModel.get_peft_model`` kwargs for a LoRA run.
+
+    Torch-free (unit-testable). Caller resolves ``gradient_checkpointing`` from
+    ``spec.training.use_gradient_checkpointing`` (the JSON literal → ``True`` /
+    ``False`` / ``"unsloth"`` mapping). Optional knobs (``loftq_config``,
+    ``modules_to_save``, ``layers_to_transform``, ``layer_replication``) are only
+    emitted when set so PEFT/Unsloth see absence, not ``None``.
+
+    Raises ``ValueError`` if ``spec.training.lora`` is ``None`` (no LoRA config to read).
+    """
+    lora = spec.training.lora
+    if lora is None:
+        # Explicit raise instead of ``assert``: asserts are stripped under ``python -O``.
+        raise ValueError("build_peft_kwargs requires spec.training.lora to be set")
+    kwargs: dict[str, Any] = {
+        "r": lora.rank,
+        "lora_alpha": lora.alpha,
+        "lora_dropout": lora.dropout,
+        "target_modules": list(lora.target_modules),
+        "bias": lora.bias,
+        "use_rslora": lora.use_rslora,
+        "random_state": lora.random_state,
+        "use_dora": lora.use_dora,
+        "init_lora_weights": lora.init_lora_weights,
+        "use_gradient_checkpointing": gradient_checkpointing,
+        "max_seq_length": spec.model.max_seq_length,
+    }
+    for knob in ("loftq_config", "modules_to_save", "layers_to_transform", "layer_replication"):
+        value = getattr(lora, knob)
+        if value is not None:
+            kwargs[knob] = value
+    return kwargs
 
 
 def train_sft(
@@ -183,15 +222,7 @@ def train_sft(
             gc_value = False
         model = FastLanguageModel.get_peft_model(
             model,
-            r=spec.training.lora.rank,
-            lora_alpha=spec.training.lora.alpha,
-            lora_dropout=spec.training.lora.dropout,
-            target_modules=list(spec.training.lora.target_modules),
-            bias=spec.training.lora.bias,
-            use_rslora=spec.training.lora.use_rslora,
-            random_state=spec.training.lora.random_state,
-            use_gradient_checkpointing=gc_value,
-            max_seq_length=spec.model.max_seq_length,
+            **build_peft_kwargs(spec, gradient_checkpointing=gc_value),
         )
     # All-weights FT: leave `model` as-is. `build_model_load_kwargs` passed
     # `full_finetuning=True`, so `from_pretrained` routed through Unsloth's
@@ -251,6 +282,11 @@ def train_sft(
         "learning_rate": spec.optimizer.learning_rate,
         "weight_decay": spec.optimizer.weight_decay,
         "optim": spec.optimizer.optim,
+        "adam_beta1": spec.optimizer.adam_beta1,
+        "adam_beta2": spec.optimizer.adam_beta2,
+        "adam_epsilon": spec.optimizer.adam_epsilon,
+        "max_grad_norm": spec.optimizer.max_grad_norm,
+        "label_smoothing_factor": spec.optimizer.label_smoothing_factor,
         "lr_scheduler_type": spec.schedule.lr_scheduler_type,
         "warmup_steps": spec.schedule.warmup_steps,
         "logging_steps": spec.schedule.logging_steps,
@@ -263,6 +299,11 @@ def train_sft(
         "max_length": spec.model.max_seq_length,
         "packing": spec.dataset.packing,
     }
+    # Optional knobs: only set when provided so trl/transformers keep their defaults.
+    if spec.optimizer.neftune_noise_alpha is not None:
+        args_kwargs["neftune_noise_alpha"] = spec.optimizer.neftune_noise_alpha
+    if spec.schedule.lr_scheduler_kwargs is not None:
+        args_kwargs["lr_scheduler_kwargs"] = spec.schedule.lr_scheduler_kwargs
     if spec.schedule.warmup_ratio is not None:
         args_kwargs["warmup_ratio"] = spec.schedule.warmup_ratio
     # epochs always set (defaults to 1); max_steps, when present, caps/overrides it (trl semantics).
diff --git a/services/unsloth/tests/test_model_load_kwargs.py b/services/unsloth/tests/test_model_load_kwargs.py
index 6f4c22a0be..0cc812316b 100644
--- a/services/unsloth/tests/test_model_load_kwargs.py
+++ b/services/unsloth/tests/test_model_load_kwargs.py
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-"""Tests for ``build_model_load_kwargs`` in the unsloth SFT driver.
+"""Tests for ``build_model_load_kwargs`` / ``build_peft_kwargs`` in the unsloth SFT driver.
 
 These exercise the torch-free kwargs assembly only, so the module imports
 fine on a CPU box without ``unsloth``/``torch`` installed.
@@ -9,21 +9,30 @@
 
 from __future__ import annotations
 
+from typing import Any
+
 from nmp.unsloth.schemas import (
     DatasetSpec,
+    LoRAParams,
     ModelLoadSpec,
     OutputResponse,
     TrainingSpec,
     UnslothJobOutput,
 )
-from nmp.unsloth.tasks.training.backends.unsloth_sft import build_model_load_kwargs
+from nmp.unsloth.tasks.training.backends.unsloth_sft import build_model_load_kwargs, build_peft_kwargs
 
 
-def _spec(*, finetuning_type: str, load_in_4bit: bool) -> UnslothJobOutput:
+def _spec(
+    *,
+    finetuning_type: str = "lora",
+    load_in_4bit: bool = True,
+    model_extra: dict[str, Any] | None = None,
+    lora: LoRAParams | None = None,
+) -> UnslothJobOutput:
     return UnslothJobOutput(
-        model=ModelLoadSpec(name="meta/llama-3.1-8b", load_in_4bit=load_in_4bit),
+        model=ModelLoadSpec(name="meta/llama-3.1-8b", load_in_4bit=load_in_4bit, **(model_extra or {})),
         dataset=DatasetSpec(path="ws/train"),
-        training=TrainingSpec(finetuning_type=finetuning_type),
+        training=TrainingSpec(finetuning_type=finetuning_type, lora=lora),
         output=OutputResponse(name="out", type="model", save_method="lora", fileset="out"),
     )
 
@@ -47,3 +56,44 @@ def test_dtype_not_included() -> None:
     # dtype mapping needs torch and stays in train_sft; the helper must not emit it.
     kwargs = build_model_load_kwargs(_spec(finetuning_type="lora", load_in_4bit=True), "/local/model")
     assert "dtype" not in kwargs
+
+
+def test_rope_scaling_omitted_when_none() -> None:
+    kwargs = build_model_load_kwargs(_spec(), "/local/model")
+    assert "rope_scaling" not in kwargs
+
+
+def test_rope_scaling_passed_when_set() -> None:
+    spec = _spec(model_extra={"rope_scaling": {"type": "linear", "factor": 2.0}})
+    kwargs = build_model_load_kwargs(spec, "/local/model")
+    assert kwargs["rope_scaling"] == {"type": "linear", "factor": 2.0}
+
+
+def test_peft_kwargs_defaults_preserve_behavior() -> None:
+    # Default LoRAParams → library-default knobs; optional ones omitted (not None).
+    kwargs = build_peft_kwargs(_spec(lora=LoRAParams()), gradient_checkpointing="unsloth")
+    assert kwargs["r"] == 16
+    assert kwargs["use_dora"] is False
+    assert kwargs["init_lora_weights"] is True
+    assert kwargs["use_gradient_checkpointing"] == "unsloth"
+    for omitted in ("loftq_config", "modules_to_save", "layers_to_transform", "layer_replication"):
+        assert omitted not in kwargs
+
+
+def test_peft_kwargs_emits_optional_fields_when_set() -> None:
+    lora = LoRAParams(
+        use_dora=True,
+        init_lora_weights="pissa",
+        modules_to_save=["embed_tokens", "lm_head"],
+        layers_to_transform=[0, 1, 2],
+        layer_replication=[[0, 16], [8, 24]],
+        loftq_config={"loftq_bits": 4},
+    )
+    kwargs = build_peft_kwargs(_spec(lora=lora), gradient_checkpointing=True)
+    assert kwargs["use_dora"] is True
+    assert kwargs["init_lora_weights"] == "pissa"
+    assert kwargs["modules_to_save"] == ["embed_tokens", "lm_head"]
+    assert kwargs["layers_to_transform"] == [0, 1, 2]
+    assert kwargs["layer_replication"] == [[0, 16], [8, 24]]
+    assert kwargs["loftq_config"] == {"loftq_bits": 4}
+    assert kwargs["use_gradient_checkpointing"] is True