Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ class LoRAParams(BaseModel):
dropout: float = Field(default=0.0, ge=0.0, le=1.0, description="LoRA dropout probability for regularization.")
merge: bool = False
target_modules: list[str] | None = None
exclude_modules: list[str] | None = Field(
default=None, description="Module name patterns to exclude from LoRA (e.g. ['*.out_proj'])."
)
use_triton: bool = Field(default=True, description="Use the optimized Triton LoRA kernel.")


class DatasetSpec(BaseModel):
Expand All @@ -59,6 +63,10 @@ class TrainingSpec(BaseModel):
default=None,
description="Model precision for training. Auto-detected from the checkpoint when unset.",
)
attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] = Field(
default="sdpa",
description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2', or 'eager'.",
)
execution_profile: str | None = Field(default=None, min_length=1)
teacher_model: str | None = None
distillation_ratio: float = Field(default=0.5, ge=0.0, le=1.0)
Expand Down Expand Up @@ -90,6 +98,12 @@ class BatchSpec(BaseModel):
global_batch_size: int = Field(default=8, gt=0)
micro_batch_size: int = Field(default=1, gt=0)
sequence_packing: bool = False
sequence_packing_max_samples: int = Field(
default=1000, gt=0, description="Samples analyzed to estimate the optimal pack size when packing is enabled."
)
split_across_pack: bool = Field(
default=False, description="Allow a single sample to span two packs when sequence packing."
)


class OptimizerSpec(BaseModel):
Expand All @@ -103,6 +117,11 @@ class OptimizerSpec(BaseModel):
adam_beta1: float = Field(default=0.9, ge=0.0, lt=1.0, description="Adam optimizer beta1.")
adam_beta2: float = Field(default=0.999, ge=0.0, lt=1.0, description="Adam optimizer beta2.")
warmup_steps: int = Field(default=0, ge=0)
adam_eps: float = Field(default=1e-8, gt=0.0, description="Adam/AdamW epsilon for numerical stability.")
optimizer: Literal["adam", "adamw"] = Field(default="adam", description="Optimizer algorithm.")
lr_decay_style: Literal["cosine", "linear", "constant"] = Field(
default="cosine", description="Learning-rate decay schedule."
)


class ParallelismSpec(BaseModel):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,10 +117,13 @@ Full template:
"alpha": 32,
"dropout": 0.0,
"merge": false,
"target_modules": null
"target_modules": null,
"exclude_modules": null,
"use_triton": true
},
"max_seq_length": 2048,
"precision": null,
"attn_implementation": "sdpa",
"execution_profile": null
},
"schedule": {
Expand All @@ -132,14 +135,19 @@ Full template:
"batch": {
"global_batch_size": 4,
"micro_batch_size": 1,
"sequence_packing": false
"sequence_packing": false,
"sequence_packing_max_samples": 1000,
"split_across_pack": false
},
"optimizer": {
"learning_rate": 5e-5,
"min_learning_rate": null,
"weight_decay": 0.01,
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_eps": 1e-8,
"optimizer": "adam",
"lr_decay_style": "cosine",
"warmup_steps": 0
},
"parallelism": {
Expand Down Expand Up @@ -171,8 +179,11 @@ Full template:
| `lora.dropout` | `0.0` | LoRA dropout (0.0–1.0) for regularization |
| `lora.merge` | `false` | If true with `lora_merged`, output is full weights not adapter |
| `lora.target_modules` | `null` | e.g. `["q_proj","v_proj"]`; null = platform default targets |
| `lora.exclude_modules` | `null` | Module name patterns to exclude from LoRA, e.g. `["*.out_proj"]`; null or empty = exclude nothing |
| `lora.use_triton` | `true` | Use the optimized Triton LoRA kernel |
| `max_seq_length` | `2048` | Truncate/pack to this length; lower if OOM |
| `precision` | `null` | `bf16` \| `fp16` \| `fp32` \| `fp8`; null auto-detects from the checkpoint |
| `attn_implementation` | `sdpa` | `sdpa` (PyTorch native) \| `flash_attention_2` \| `eager` |
| `teacher_model` | — | **Model entity ref** (not HF id). Required for distillation; see below |
| `distillation_ratio` | `0.5` | KD blend (0–1) |
| `distillation_temperature` | `1.0` | KD temperature |
Expand All @@ -199,6 +210,8 @@ LoRA block is auto-created when `finetuning_type` is `lora` or `lora_merged`.
| `global_batch_size` | `8` (schema) | Effective batch across all GPUs; **≥48 GB LoRA tables → `SKILL.md`** |
| `micro_batch_size` | `1` (schema) | **Per GPU**; same SKILL tables for single- and multi-GPU (TP=1) |
| `sequence_packing` | `false` | Pack short sequences for throughput (needs compatible data) |
| `sequence_packing_max_samples` | `1000` | Number of samples analyzed to estimate the optimal pack size (only used when `sequence_packing` is `true`) |
| `split_across_pack` | `false` | Allow a single sample to span two packs (only used when `sequence_packing` is `true`) |

**Validation:** `global_batch_size` must be divisible by `micro_batch_size × data_parallel_size`, where:

Expand All @@ -215,6 +228,9 @@ Example: 1 node, 2 GPUs, TP=1 → DP=2 → GBS must be a multiple of `2 × micro
| `weight_decay` | `0.01` | L2-style regularization |
| `adam_beta1` | `0.9` | Adam optimizer beta1 |
| `adam_beta2` | `0.999` | Adam optimizer beta2 |
| `adam_eps` | `1e-8` | Adam/AdamW epsilon for numerical stability |
| `optimizer` | `adam` | `adam` (`torch.optim.Adam`) \| `adamw` (`torch.optim.AdamW`) |
| `lr_decay_style` | `cosine` | `cosine` \| `linear` \| `constant` |
| `warmup_steps` | `0` | Linear warmup; try ~10% of total steps for long runs |

### `parallelism`
Expand Down Expand Up @@ -403,7 +419,9 @@ Full template (every section, defaults inline):
"load_in_4bit": true,
"load_in_8bit": false,
"dtype": "auto",
"trust_remote_code": false
"trust_remote_code": false,
"device_map": null,
"rope_scaling": null
},
"dataset": {
"path": "default/<dataset-fileset>",
Expand All @@ -422,7 +440,13 @@ Full template (every section, defaults inline):
"target_modules": ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
"bias": "none",
"use_rslora": false,
"random_state": 3407
"random_state": 3407,
"use_dora": false,
"loftq_config": null,
"modules_to_save": null,
"layers_to_transform": null,
"layer_replication": null,
"init_lora_weights": true
},
"use_gradient_checkpointing": "unsloth"
},
Expand All @@ -432,6 +456,7 @@ Full template (every section, defaults inline):
"warmup_steps": 0,
"warmup_ratio": null,
"lr_scheduler_type": "linear",
"lr_scheduler_kwargs": null,
"logging_steps": 1,
"save_steps": null,
"eval_steps": null,
Expand All @@ -444,7 +469,13 @@ Full template (every section, defaults inline):
"optimizer": {
"learning_rate": 5e-5,
"weight_decay": 0.0,
"optim": "adamw_8bit"
"optim": "adamw_8bit",
"adam_beta1": 0.9,
"adam_beta2": 0.999,
"adam_epsilon": 1e-8,
"max_grad_norm": 1.0,
"label_smoothing_factor": 0.0,
"neftune_noise_alpha": null
},
"hardware": {
"gpus": "0",
Expand Down Expand Up @@ -474,6 +505,7 @@ Full template (every section, defaults inline):
| `dtype` | `"auto"` | One of `"auto"`, `"bfloat16"`, `"float16"`, `"float32"`. |
| `trust_remote_code` | `false` | HF `trust_remote_code` flag for custom model code (required by some hybrid Mamba/MoE models, e.g. Nemotron-H). |
| `device_map` | `null` | Placement for `FastLanguageModel.from_pretrained`. `null` pins the whole model to the single visible GPU (`{"": 0}`) — the right default for this single-GPU backend. Leave unset unless experimenting; `"auto"`/`"balanced"`/`"sequential"` can spill layers to CPU on unified-memory hosts (GB10 / DGX Spark) and abort 4-bit loads. |
| `rope_scaling` | `null` | RoPE scaling for long-context extension, e.g. `{"type": "linear", "factor": 2.0}`. `null` uses the model's native context length. |

**Mutex:** `load_in_4bit` xor `load_in_8bit`. Both quantization flags are also **incompatible with `training.finetuning_type: "all_weights"`** — full SFT must use a non-quantized base.

Expand Down Expand Up @@ -511,6 +543,12 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
| `bias` | `"none"` | `"none"` / `"all"` / `"lora_only"`. |
| `use_rslora` | `false` | Rank-stabilized LoRA. |
| `random_state` | `3407` | Reproducibility seed for the LoRA init. |
| `use_dora` | `false` | DoRA (weight-decomposed LoRA). Better quality at low ranks; adds overhead. |
| `loftq_config` | `null` | LoftQ init config for quantized bases. `null` disables. |
| `modules_to_save` | `null` | Extra non-LoRA modules trained & saved in full, e.g. `["embed_tokens","lm_head"]` (vocab changes / continued pretraining). |
| `layers_to_transform` | `null` | Restrict LoRA to specific layer index(es). `null` = all layers. |
| `layer_replication` | `null` | Layer-replication ranges for stacking, e.g. `[[0,16],[8,24]]`. |
| `init_lora_weights` | `true` | Init scheme. `true` = PEFT default; `"gaussian"`/`"pissa"`/`"olora"`/`"loftq"` for advanced inits. |

`lora` is auto-filled with these defaults when `finetuning_type: "lora"` and the user omits the block. Must be `null` / omitted when `finetuning_type: "all_weights"`.

Expand All @@ -523,6 +561,7 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
| `warmup_steps` | `0` | Linear warmup. Mutex with `warmup_ratio`. |
| `warmup_ratio` | `null` | Fractional warmup over total steps. Mutex with `warmup_steps`. |
| `lr_scheduler_type` | `"linear"` | `"linear"`, `"cosine"`, `"constant"`, `"constant_with_warmup"`, `"cosine_with_restarts"`. |
| `lr_scheduler_kwargs` | `null` | Extra scheduler kwargs, e.g. `{"num_cycles": 3}` for `cosine_with_restarts`. `null` uses defaults. |
| `logging_steps` | `1` | Loss-log cadence. |
| `save_steps` | `null` | If set, save checkpoint every N steps. |
| `eval_steps` | `null` | If set with `validation_path`, eval every N steps. When `null` and `validation_path` is set, the training driver defaults to **one validation pass per effective epoch** at `max(1, effective_steps - 1)` (same effective-step cap as automodel's default `val_check_interval`). |
Expand All @@ -546,6 +585,12 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules.
| `learning_rate` | `2e-4` (schema default; skill uses `5e-5` for LoRA SFT) | See LR table below. |
| `weight_decay` | `0.0` | L2-style regularization. |
| `optim` | `"adamw_8bit"` | `"adamw_torch"`, `"adamw_torch_fused"` (Hopper+), `"adamw_8bit"`, `"paged_adamw_8bit"`, `"sgd"`. `adamw_8bit` has the smallest optimizer state and is Unsloth's notebook default. |
| `adam_beta1` | `0.9` | Adam/AdamW beta1. |
| `adam_beta2` | `0.999` | Adam/AdamW beta2. |
| `adam_epsilon` | `1e-8` | Adam/AdamW epsilon. |
| `max_grad_norm` | `1.0` | Gradient-clipping max norm (TRL default). |
| `label_smoothing_factor` | `0.0` | Label smoothing for the CE loss. `0.0` disables. |
| `neftune_noise_alpha` | `null` | NEFTune embedding-noise alpha (quality boost). `null` disables. |

`warmup_steps` is on `schedule`, not on `optimizer` (different from the automodel schema).

Expand Down
8 changes: 8 additions & 0 deletions services/automodel/src/nmp/automodel/adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ def _build_peft(training: dict[str, Any]) -> LoRAParams | None:
dropout=lora.get("dropout", 0.0),
merge=ft == "lora_merged" or lora.get("merge", False),
target_modules=lora.get("target_modules"),
exclude_modules=lora.get("exclude_modules"),
use_triton=lora.get("use_triton", True),
)


Expand All @@ -55,15 +57,21 @@ def _build_training_block(spec: dict[str, Any]) -> SFTTraining | DistillationTra
"weight_decay": optimizer.get("weight_decay", 0.01),
"adam_beta1": optimizer.get("adam_beta1", 0.9),
"adam_beta2": optimizer.get("adam_beta2", 0.999),
"adam_eps": optimizer.get("adam_eps", 1e-8),
"optimizer": optimizer.get("optimizer", "adam"),
"lr_decay_style": optimizer.get("lr_decay_style", "cosine"),
"warmup_steps": optimizer.get("warmup_steps", 0),
"epochs": schedule.get("epochs", 1),
"max_steps": schedule.get("max_steps"),
"val_check_interval": schedule.get("val_check_interval"),
"batch_size": batch.get("global_batch_size", 8),
"micro_batch_size": batch.get("micro_batch_size", 1),
"sequence_packing": batch.get("sequence_packing", False),
"sequence_packing_max_samples": batch.get("sequence_packing_max_samples", 1000),
"split_across_pack": batch.get("split_across_pack", False),
"max_seq_length": training.get("max_seq_length", 2048),
"precision": training.get("precision"),
"attn_implementation": training.get("attn_implementation", "sdpa"),
"seed": schedule.get("seed"),
"parallelism": ParallelismParams(
num_nodes=parallelism.get("num_nodes", 1),
Expand Down
31 changes: 30 additions & 1 deletion services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,14 @@ class LoRAParams(_PEFTParams):
description="Module name patterns to apply LoRA to (e.g., ['*.q_proj', '*.v_proj']). "
"If not set, applies to all '*proj' linear layers.",
)
exclude_modules: Optional[list[str]] = Field(
default=None,
description="Module name patterns to exclude from LoRA (e.g., ['*.out_proj']).",
)
use_triton: bool = Field(
default=True,
description="Use the optimized Triton LoRA kernel.",
)
merge: bool = Field(
default=False,
description="Merge LoRA weights into base model after training. "
Expand Down Expand Up @@ -173,12 +181,20 @@ class _TrainingBase(BaseModel):
default=0.999,
description="Adam beta2 parameter. Adjust for optimizer tuning.",
)
adam_eps: float = Field(
default=1e-8,
gt=0.0,
description="Adam/AdamW epsilon for numerical stability.",
)
warmup_steps: int = Field(
default=0,
ge=0,
description="Linear warmup steps. Recommended: 10% of total training steps for stable training.",
)
optimizer: Optional[str] = Field(default=None, description="Optimizer name (e.g., 'adamw').")
optimizer: Literal["adam", "adamw"] = Field(default="adam", description="Optimizer algorithm.")
lr_decay_style: Literal["cosine", "linear", "constant"] = Field(
default="cosine", description="Learning-rate decay schedule."
)

# --- Schedule ---
epochs: int = Field(
Expand Down Expand Up @@ -218,6 +234,15 @@ class _TrainingBase(BaseModel):
default=False,
description="Enable sequence packing for efficiency. Can improve training speed.",
)
sequence_packing_max_samples: int = Field(
default=1000,
gt=0,
description="Samples analyzed to estimate the optimal pack size when sequence packing is enabled.",
)
split_across_pack: bool = Field(
default=False,
description="Allow a single sample to span two packs when sequence packing.",
)

# --- Model ---
max_seq_length: int = Field(
Expand All @@ -229,6 +254,10 @@ class _TrainingBase(BaseModel):
default=None,
description="Model precision for training. Auto-detected if unset.",
)
attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] = Field(
default="sdpa",
description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2', or 'eager'.",
)
seed: Optional[int] = Field(
default=None,
description="Random seed for reproducibility. Optional.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,15 @@ def compile_training_step(
global_batch_size=training.batch_size,
micro_batch_size=training.micro_batch_size,
sequence_packing=training.sequence_packing,
sequence_packing_max_samples=training.sequence_packing_max_samples,
split_across_pack=training.split_across_pack,
),
optimizer=TrainingStepConfig.OptimizerConfig(
optimizer_name=training.optimizer,
lr_decay_style=training.lr_decay_style,
learning_rate=training.learning_rate,
min_learning_rate=training.min_learning_rate,
eps=training.adam_eps,
weight_decay=training.weight_decay,
beta1=training.adam_beta1,
beta2=training.adam_beta2,
Expand Down Expand Up @@ -241,6 +246,7 @@ def _translate_model_config(
name=_extract_model_name(job_spec),
max_seq_length=training.max_seq_length,
precision=training.precision,
attn_implementation=training.attn_implementation,
trust_remote_code=trust_remote_code,
is_embedding_model=is_embedding_model,
chat_template=chat_template,
Expand Down Expand Up @@ -293,7 +299,8 @@ def _translate_lora_config(api_lora: LoRAParams, me: ModelEntity) -> LoRAConfig:
alpha=api_lora.alpha,
dropout=api_lora.dropout,
target_modules=api_lora.target_modules,
use_triton=True,
exclude_modules=api_lora.exclude_modules,
use_triton=api_lora.use_triton,
)

if not lora.target_modules:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,12 @@ class BatchConfig(BaseModel):
micro_batch_size: int = Field(default=1, gt=0)
sequence_packing: bool = False
sequence_packing_max_samples: int = 1000
split_across_pack: bool = False

class OptimizerConfig(BaseModel):
optimizer_type: Optional[OptimizerType] = Field(default=None)
optimizer_name: str = "adam"
lr_decay_style: str = "cosine"
learning_rate: float = 1e-4
min_learning_rate: Optional[float] = None
eps: float = 1e-8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -254,16 +254,18 @@ def compile_automodel_config(
)

# === Optimizer ===
# Map the optimizer choice to its torch class; unrecognized names fall back to Adam.
optimizer_targets = {"adam": "torch.optim.Adam", "adamw": "torch.optim.AdamW"}
cfg["optimizer"] = {
"_target_": "torch.optim.Adam",
"_target_": optimizer_targets.get(customizer_config.optimizer.optimizer_name, "torch.optim.Adam"),
"lr": customizer_config.optimizer.learning_rate,
Comment on lines +257 to 261

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Fail fast on unknown optimizer names instead of silently using Adam.

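The `.get(..., "torch.optim.Adam")` fallback on line 260 masks bad config values: an unrecognized `optimizer_name` silently trains with Adam instead of the requested optimizer, and no error is raised. The API schema restricts `optimizer` to `adam`/`adamw`, but `OptimizerConfig.optimizer_name` is a plain `str`, so any caller that builds the config directly can still pass an unsupported value.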

Suggested fix
     # Map the optimizer choice to its torch class. Defaults to Adam
     optimizer_targets = {"adam": "torch.optim.Adam", "adamw": "torch.optim.AdamW"}
+    optimizer_target = optimizer_targets.get(customizer_config.optimizer.optimizer_name)
+    if optimizer_target is None:
+        raise ValueError(
+            f"Unsupported optimizer '{customizer_config.optimizer.optimizer_name}'. "
+            f"Expected one of: {', '.join(sorted(optimizer_targets))}."
+        )
     cfg["optimizer"] = {
-        "_target_": optimizer_targets.get(customizer_config.optimizer.optimizer_name, "torch.optim.Adam"),
+        "_target_": optimizer_target,
         "lr": customizer_config.optimizer.learning_rate,
         "weight_decay": customizer_config.optimizer.weight_decay,
         "betas": [customizer_config.optimizer.beta1, customizer_config.optimizer.beta2],
         "eps": customizer_config.optimizer.eps,  # Adam epsilon for numerical stability
     }
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@services/automodel/src/nmp/automodel/tasks/training/backends/config.py`
around lines 257 - 261, The optimizer_targets.get() call with a default fallback
to Adam silently masks configuration errors when an unknown optimizer_name is
provided. Instead of using the get method with a default, explicitly check if
customizer_config.optimizer.optimizer_name exists in the optimizer_targets
dictionary and raise a clear error if it does not, ensuring invalid optimizer
configurations fail immediately rather than silently defaulting to Adam.

"weight_decay": customizer_config.optimizer.weight_decay,
"betas": [customizer_config.optimizer.beta1, customizer_config.optimizer.beta2],
"eps": customizer_config.optimizer.eps, # Adam epsilon for numerical stability
}

cfg["lr_scheduler"] = {
"lr_decay_style": "cosine",
"lr_decay_style": customizer_config.optimizer.lr_decay_style,
"lr_warmup_steps": customizer_config.optimizer.warmup_steps,
}
if customizer_config.optimizer.min_learning_rate:
Expand Down Expand Up @@ -310,7 +312,7 @@ def compile_automodel_config(

cfg["packed_sequence"] = {
"packed_sequence_size": optimal_pack_size,
"split_across_pack": False,
"split_across_pack": customizer_config.batch.split_across_pack,
}

# Use pack size as the effective sequence length for datasets
Expand Down Expand Up @@ -356,9 +358,8 @@ def compile_automodel_config(
"use_triton": lora.use_triton,
"target_modules": lora.target_modules,
}
# TODO: Support exclude_modules via the API
# if lora.exclude_modules:
# peft_cfg["exclude_modules"] = lora.exclude_modules
if lora.exclude_modules:
peft_cfg["exclude_modules"] = lora.exclude_modules
cfg["peft"] = peft_cfg

# === Loss ===
Expand Down
Loading
Loading