diff --git a/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py b/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py index 3355e01c33..02133023bc 100644 --- a/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py +++ b/plugins/nemo-automodel/src/nemo_automodel_plugin/schema.py @@ -38,6 +38,10 @@ class LoRAParams(BaseModel): dropout: float = Field(default=0.0, ge=0.0, le=1.0, description="LoRA dropout probability for regularization.") merge: bool = False target_modules: list[str] | None = None + exclude_modules: list[str] | None = Field( + default=None, description="Module name patterns to exclude from LoRA (e.g. ['*.out_proj'])." + ) + use_triton: bool = Field(default=True, description="Use the optimized Triton LoRA kernel.") class DatasetSpec(BaseModel): @@ -59,6 +63,10 @@ class TrainingSpec(BaseModel): default=None, description="Model precision for training. Auto-detected from the checkpoint when unset.", ) + attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] = Field( + default="sdpa", + description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2', or 'eager'.", + ) execution_profile: str | None = Field(default=None, min_length=1) teacher_model: str | None = None distillation_ratio: float = Field(default=0.5, ge=0.0, le=1.0) @@ -90,6 +98,12 @@ class BatchSpec(BaseModel): global_batch_size: int = Field(default=8, gt=0) micro_batch_size: int = Field(default=1, gt=0) sequence_packing: bool = False + sequence_packing_max_samples: int = Field( + default=1000, gt=0, description="Samples analyzed to estimate the optimal pack size when packing is enabled." + ) + split_across_pack: bool = Field( + default=False, description="Allow a single sample to span two packs when sequence packing." 
+ ) class OptimizerSpec(BaseModel): @@ -103,6 +117,11 @@ class OptimizerSpec(BaseModel): adam_beta1: float = Field(default=0.9, ge=0.0, lt=1.0, description="Adam optimizer beta1.") adam_beta2: float = Field(default=0.999, ge=0.0, lt=1.0, description="Adam optimizer beta2.") warmup_steps: int = Field(default=0, ge=0) + adam_eps: float = Field(default=1e-8, gt=0.0, description="Adam/AdamW epsilon for numerical stability.") + optimizer: Literal["adam", "adamw"] = Field(default="adam", description="Optimizer algorithm.") + lr_decay_style: Literal["cosine", "linear", "constant"] = Field( + default="cosine", description="Learning-rate decay schedule." + ) class ParallelismSpec(BaseModel): diff --git a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md index 2b6e5e0495..a04a5e000b 100644 --- a/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md +++ b/plugins/nemo-customizer/src/nemo_customizer/skills/nemo-customizer/references/hyperparameters.md @@ -117,10 +117,13 @@ Full template: "alpha": 32, "dropout": 0.0, "merge": false, - "target_modules": null + "target_modules": null, + "exclude_modules": null, + "use_triton": true }, "max_seq_length": 2048, "precision": null, + "attn_implementation": "sdpa", "execution_profile": null }, "schedule": { @@ -132,7 +135,9 @@ Full template: "batch": { "global_batch_size": 4, "micro_batch_size": 1, - "sequence_packing": false + "sequence_packing": false, + "sequence_packing_max_samples": 1000, + "split_across_pack": false }, "optimizer": { "learning_rate": 5e-5, @@ -140,6 +145,9 @@ Full template: "weight_decay": 0.01, "adam_beta1": 0.9, "adam_beta2": 0.999, + "adam_eps": 1e-8, + "optimizer": "adam", + "lr_decay_style": "cosine", "warmup_steps": 0 }, "parallelism": { @@ -171,8 +179,11 @@ Full template: | `lora.dropout` | `0.0` | LoRA dropout (0.0–1.0) 
for regularization | | `lora.merge` | `false` | If true with `lora_merged`, output is full weights not adapter | | `lora.target_modules` | `null` | e.g. `["q_proj","v_proj"]`; null = platform default targets | +| `lora.exclude_modules` | `null` | Patterns to exclude from LoRA, e.g. `["*.out_proj"]` | +| `lora.use_triton` | `true` | Use the optimized Triton LoRA kernel | | `max_seq_length` | `2048` | Truncate/pack to this length; lower if OOM | | `precision` | `null` | `bf16` \| `fp16` \| `fp32` \| `fp8`; null auto-detects from the checkpoint | +| `attn_implementation` | `sdpa` | `sdpa` (PyTorch native) \| `flash_attention_2` \| `eager` | | `teacher_model` | — | **Model entity ref** (not HF id). Required for distillation; see below | | `distillation_ratio` | `0.5` | KD blend (0–1) | | `distillation_temperature` | `1.0` | KD temperature | @@ -199,6 +210,8 @@ LoRA block is auto-created when `finetuning_type` is `lora` or `lora_merged`. | `global_batch_size` | `8` (schema) | Effective batch across all GPUs; **≥48 GB LoRA tables → `SKILL.md`** | | `micro_batch_size` | `1` (schema) | **Per GPU**; same SKILL tables for single- and multi-GPU (TP=1) | | `sequence_packing` | `false` | Pack short sequences for throughput (needs compatible data) | +| `sequence_packing_max_samples` | `1000` | Samples analyzed to estimate the optimal pack size (only when packing) | +| `split_across_pack` | `false` | Allow a single sample to span two packs (only when packing) | **Validation:** `global_batch_size` must be divisible by `micro_batch_size × data_parallel_size`, where: @@ -215,6 +228,9 @@ Example: 1 node, 2 GPUs, TP=1 → DP=2 → GBS must be a multiple of `2 × micro | `weight_decay` | `0.01` | L2-style regularization | | `adam_beta1` | `0.9` | Adam optimizer beta1 | | `adam_beta2` | `0.999` | Adam optimizer beta2 | +| `adam_eps` | `1e-8` | Adam/AdamW epsilon for numerical stability | +| `optimizer` | `adam` | `adam` \| `adamw` | +| `lr_decay_style` | `cosine` | `cosine` \| `linear` \| 
`constant` | | `warmup_steps` | `0` | Linear warmup; try ~10% of total steps for long runs | ### `parallelism` @@ -403,7 +419,9 @@ Full template (every section, defaults inline): "load_in_4bit": true, "load_in_8bit": false, "dtype": "auto", - "trust_remote_code": false + "trust_remote_code": false, + "device_map": null, + "rope_scaling": null }, "dataset": { "path": "default/", @@ -422,7 +440,13 @@ Full template (every section, defaults inline): "target_modules": ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"], "bias": "none", "use_rslora": false, - "random_state": 3407 + "random_state": 3407, + "use_dora": false, + "loftq_config": null, + "modules_to_save": null, + "layers_to_transform": null, + "layer_replication": null, + "init_lora_weights": true }, "use_gradient_checkpointing": "unsloth" }, @@ -432,6 +456,7 @@ Full template (every section, defaults inline): "warmup_steps": 0, "warmup_ratio": null, "lr_scheduler_type": "linear", + "lr_scheduler_kwargs": null, "logging_steps": 1, "save_steps": null, "eval_steps": null, @@ -444,7 +469,13 @@ Full template (every section, defaults inline): "optimizer": { "learning_rate": 5e-5, "weight_decay": 0.0, - "optim": "adamw_8bit" + "optim": "adamw_8bit", + "adam_beta1": 0.9, + "adam_beta2": 0.999, + "adam_epsilon": 1e-8, + "max_grad_norm": 1.0, + "label_smoothing_factor": 0.0, + "neftune_noise_alpha": null }, "hardware": { "gpus": "0", @@ -474,6 +505,7 @@ Full template (every section, defaults inline): | `dtype` | `"auto"` | One of `"auto"`, `"bfloat16"`, `"float16"`, `"float32"`. | | `trust_remote_code` | `false` | HF `trust_remote_code` flag for custom model code (required by some hybrid Mamba/MoE models, e.g. Nemotron-H). | | `device_map` | `null` | Placement for `FastLanguageModel.from_pretrained`. `null` pins the whole model to the single visible GPU (`{"": 0}`) — the right default for this single-GPU backend. 
Leave unset unless experimenting; `"auto"`/`"balanced"`/`"sequential"` can spill layers to CPU on unified-memory hosts (GB10 / DGX Spark) and abort 4-bit loads. | +| `rope_scaling` | `null` | RoPE scaling for long-context extension, e.g. `{"type": "linear", "factor": 2.0}`. `null` uses the model's native context length. | **Mutex:** `load_in_4bit` xor `load_in_8bit`. Both quantization flags are also **incompatible with `training.finetuning_type: "all_weights"`** — full SFT must use a non-quantized base. @@ -511,6 +543,12 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules. | `bias` | `"none"` | `"none"` / `"all"` / `"lora_only"`. | | `use_rslora` | `false` | Rank-stabilized LoRA. | | `random_state` | `3407` | Reproducibility seed for the LoRA init. | +| `use_dora` | `false` | DoRA (weight-decomposed LoRA). Better quality at low ranks; adds overhead. | +| `loftq_config` | `null` | LoftQ init config for quantized bases. `null` disables. | +| `modules_to_save` | `null` | Extra non-LoRA modules trained & saved in full, e.g. `["embed_tokens","lm_head"]` (vocab changes / continued pretraining). | +| `layers_to_transform` | `null` | Restrict LoRA to specific layer index(es). `null` = all layers. | +| `layer_replication` | `null` | Layer-replication ranges for stacking, e.g. `[[0,16],[8,24]]`. | +| `init_lora_weights` | `true` | Init scheme. `true` = PEFT default; `"gaussian"`/`"pissa"`/`"olora"`/`"loftq"` for advanced inits. | `lora` is auto-filled with these defaults when `finetuning_type: "lora"` and the user omits the block. Must be `null` / omitted when `finetuning_type: "all_weights"`. @@ -523,6 +561,7 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules. | `warmup_steps` | `0` | Linear warmup. Mutex with `warmup_ratio`. | | `warmup_ratio` | `null` | Fractional warmup over total steps. Mutex with `warmup_steps`. 
| | `lr_scheduler_type` | `"linear"` | `"linear"`, `"cosine"`, `"constant"`, `"constant_with_warmup"`, `"cosine_with_restarts"`. | +| `lr_scheduler_kwargs` | `null` | Extra scheduler kwargs, e.g. `{"num_cycles": 3}` for `cosine_with_restarts`. `null` uses defaults. | | `logging_steps` | `1` | Loss-log cadence. | | `save_steps` | `null` | If set, save checkpoint every N steps. | | `eval_steps` | `null` | If set with `validation_path`, eval every N steps. When `null` and `validation_path` is set, the training driver defaults to **one validation pass per effective epoch** at `max(1, effective_steps - 1)` (same effective-step cap as automodel's default `val_check_interval`). | @@ -546,6 +585,12 @@ See `references/dataset-formats.md` § Unsloth for row-shape rules. | `learning_rate` | `2e-4` (schema default; skill uses `5e-5` for LoRA SFT) | See LR table below. | | `weight_decay` | `0.0` | L2-style regularization. | | `optim` | `"adamw_8bit"` | `"adamw_torch"`, `"adamw_torch_fused"` (Hopper+), `"adamw_8bit"`, `"paged_adamw_8bit"`, `"sgd"`. `adamw_8bit` has the smallest optimizer state and is Unsloth's notebook default. | +| `adam_beta1` | `0.9` | Adam/AdamW beta1. | +| `adam_beta2` | `0.999` | Adam/AdamW beta2. | +| `adam_epsilon` | `1e-8` | Adam/AdamW epsilon. | +| `max_grad_norm` | `1.0` | Gradient-clipping max norm (TRL default). | +| `label_smoothing_factor` | `0.0` | Label smoothing for the CE loss. `0.0` disables. | +| `neftune_noise_alpha` | `null` | NEFTune embedding-noise alpha (quality boost). `null` disables. | `warmup_steps` is on `schedule`, not on `optimizer` (different from the automodel schema). 
diff --git a/services/automodel/src/nmp/automodel/adapter.py b/services/automodel/src/nmp/automodel/adapter.py index fd16abc676..fda1d69d75 100644 --- a/services/automodel/src/nmp/automodel/adapter.py +++ b/services/automodel/src/nmp/automodel/adapter.py @@ -38,6 +38,8 @@ def _build_peft(training: dict[str, Any]) -> LoRAParams | None: dropout=lora.get("dropout", 0.0), merge=ft == "lora_merged" or lora.get("merge", False), target_modules=lora.get("target_modules"), + exclude_modules=lora.get("exclude_modules"), + use_triton=lora.get("use_triton", True), ) @@ -55,6 +57,9 @@ def _build_training_block(spec: dict[str, Any]) -> SFTTraining | DistillationTra "weight_decay": optimizer.get("weight_decay", 0.01), "adam_beta1": optimizer.get("adam_beta1", 0.9), "adam_beta2": optimizer.get("adam_beta2", 0.999), + "adam_eps": optimizer.get("adam_eps", 1e-8), + "optimizer": optimizer.get("optimizer", "adam"), + "lr_decay_style": optimizer.get("lr_decay_style", "cosine"), "warmup_steps": optimizer.get("warmup_steps", 0), "epochs": schedule.get("epochs", 1), "max_steps": schedule.get("max_steps"), @@ -62,8 +67,11 @@ def _build_training_block(spec: dict[str, Any]) -> SFTTraining | DistillationTra "batch_size": batch.get("global_batch_size", 8), "micro_batch_size": batch.get("micro_batch_size", 1), "sequence_packing": batch.get("sequence_packing", False), + "sequence_packing_max_samples": batch.get("sequence_packing_max_samples", 1000), + "split_across_pack": batch.get("split_across_pack", False), "max_seq_length": training.get("max_seq_length", 2048), "precision": training.get("precision"), + "attn_implementation": training.get("attn_implementation", "sdpa"), "seed": schedule.get("seed"), "parallelism": ParallelismParams( num_nodes=parallelism.get("num_nodes", 1), diff --git a/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py b/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py index 620781ba8b..517e38be4f 100644 --- 
a/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py +++ b/services/automodel/src/nmp/automodel/api/v2/jobs/schemas.py @@ -88,6 +88,14 @@ class LoRAParams(_PEFTParams): description="Module name patterns to apply LoRA to (e.g., ['*.q_proj', '*.v_proj']). " "If not set, applies to all '*proj' linear layers.", ) + exclude_modules: Optional[list[str]] = Field( + default=None, + description="Module name patterns to exclude from LoRA (e.g., ['*.out_proj']).", + ) + use_triton: bool = Field( + default=True, + description="Use the optimized Triton LoRA kernel.", + ) merge: bool = Field( default=False, description="Merge LoRA weights into base model after training. " @@ -173,12 +181,20 @@ class _TrainingBase(BaseModel): default=0.999, description="Adam beta2 parameter. Adjust for optimizer tuning.", ) + adam_eps: float = Field( + default=1e-8, + gt=0.0, + description="Adam/AdamW epsilon for numerical stability.", + ) warmup_steps: int = Field( default=0, ge=0, description="Linear warmup steps. Recommended: 10% of total training steps for stable training.", ) - optimizer: Optional[str] = Field(default=None, description="Optimizer name (e.g., 'adamw').") + optimizer: Literal["adam", "adamw"] = Field(default="adam", description="Optimizer algorithm.") + lr_decay_style: Literal["cosine", "linear", "constant"] = Field( + default="cosine", description="Learning-rate decay schedule." + ) # --- Schedule --- epochs: int = Field( @@ -218,6 +234,15 @@ class _TrainingBase(BaseModel): default=False, description="Enable sequence packing for efficiency. 
Can improve training speed.", ) + sequence_packing_max_samples: int = Field( + default=1000, + gt=0, + description="Samples analyzed to estimate the optimal pack size when sequence packing is enabled.", + ) + split_across_pack: bool = Field( + default=False, + description="Allow a single sample to span two packs when sequence packing.", + ) # --- Model --- max_seq_length: int = Field( @@ -229,6 +254,10 @@ class _TrainingBase(BaseModel): default=None, description="Model precision for training. Auto-detected if unset.", ) + attn_implementation: Literal["sdpa", "flash_attention_2", "eager"] = Field( + default="sdpa", + description="Attention backend: 'sdpa' (PyTorch native), 'flash_attention_2', or 'eager'.", + ) seed: Optional[int] = Field( default=None, description="Random seed for reproducibility. Optional.", diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py index 9eb1120cca..93169a7f84 100644 --- a/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py +++ b/services/automodel/src/nmp/automodel/app/jobs/training/compiler.py @@ -157,10 +157,15 @@ def compile_training_step( global_batch_size=training.batch_size, micro_batch_size=training.micro_batch_size, sequence_packing=training.sequence_packing, + sequence_packing_max_samples=training.sequence_packing_max_samples, + split_across_pack=training.split_across_pack, ), optimizer=TrainingStepConfig.OptimizerConfig( + optimizer_name=training.optimizer, + lr_decay_style=training.lr_decay_style, learning_rate=training.learning_rate, min_learning_rate=training.min_learning_rate, + eps=training.adam_eps, weight_decay=training.weight_decay, beta1=training.adam_beta1, beta2=training.adam_beta2, @@ -241,6 +246,7 @@ def _translate_model_config( name=_extract_model_name(job_spec), max_seq_length=training.max_seq_length, precision=training.precision, + attn_implementation=training.attn_implementation, 
trust_remote_code=trust_remote_code, is_embedding_model=is_embedding_model, chat_template=chat_template, @@ -293,7 +299,8 @@ def _translate_lora_config(api_lora: LoRAParams, me: ModelEntity) -> LoRAConfig: alpha=api_lora.alpha, dropout=api_lora.dropout, target_modules=api_lora.target_modules, - use_triton=True, + exclude_modules=api_lora.exclude_modules, + use_triton=api_lora.use_triton, ) if not lora.target_modules: diff --git a/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py b/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py index 6add112c70..b686515231 100644 --- a/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py +++ b/services/automodel/src/nmp/automodel/app/jobs/training/schemas.py @@ -183,9 +183,12 @@ class BatchConfig(BaseModel): micro_batch_size: int = Field(default=1, gt=0) sequence_packing: bool = False sequence_packing_max_samples: int = 1000 + split_across_pack: bool = False class OptimizerConfig(BaseModel): optimizer_type: Optional[OptimizerType] = Field(default=None) + optimizer_name: str = "adam" + lr_decay_style: str = "cosine" learning_rate: float = 1e-4 min_learning_rate: Optional[float] = None eps: float = 1e-8 diff --git a/services/automodel/src/nmp/automodel/tasks/training/backends/config.py b/services/automodel/src/nmp/automodel/tasks/training/backends/config.py index 7b7852a99f..a1d1a4150d 100644 --- a/services/automodel/src/nmp/automodel/tasks/training/backends/config.py +++ b/services/automodel/src/nmp/automodel/tasks/training/backends/config.py @@ -254,8 +254,10 @@ def compile_automodel_config( ) # === Optimizer === + # Map the optimizer choice to its torch class. 
def test_adapter_plumbs_pass2_hyperparameters() -> None:
    """Explicit pass-2 hyperparameters on the plugin spec survive into the v2 ``SFTTraining``."""
    plugin_spec = {
        "model": "meta/llama",
        "dataset": {"training": "default/train"},
        "training": {
            "training_type": "sft",
            "finetuning_type": "lora",
            "attn_implementation": "flash_attention_2",
            "lora": {"rank": 8, "exclude_modules": ["*.out_proj"], "use_triton": False},
        },
        "optimizer": {"adam_eps": 1e-6, "optimizer": "adamw", "lr_decay_style": "linear"},
        "batch": {"sequence_packing": True, "sequence_packing_max_samples": 256, "split_across_pack": True},
        "output": {"name": "out", "type": "adapter", "fileset": "out-fs"},
    }

    training = automodel_spec_to_compiler_output(plugin_spec).training
    assert isinstance(training, SFTTraining)

    # Optimizer / LR-schedule knobs.
    assert training.adam_eps == 1e-6
    assert training.optimizer == "adamw"
    assert training.lr_decay_style == "linear"

    # Model / batching knobs.
    assert training.attn_implementation == "flash_attention_2"
    assert training.sequence_packing_max_samples == 256
    assert training.split_across_pack is True

    # LoRA knobs.
    peft = training.peft
    assert peft is not None
    assert peft.exclude_modules == ["*.out_proj"]
    assert peft.use_triton is False


def test_adapter_pass2_defaults_when_omitted() -> None:
    """A spec that omits every pass-2 field yields the historical hardcoded defaults."""
    minimal_spec = {
        "model": "meta/llama",
        "dataset": {"training": "default/train"},
        "training": {"training_type": "sft", "finetuning_type": "lora"},
        "output": {"name": "out", "type": "adapter", "fileset": "out-fs"},
    }

    training = automodel_spec_to_compiler_output(minimal_spec).training
    assert isinstance(training, SFTTraining)

    # Optimizer / LR-schedule defaults.
    assert training.adam_eps == 1e-8
    assert training.optimizer == "adam"
    assert training.lr_decay_style == "cosine"

    # Model / batching defaults.
    assert training.attn_implementation == "sdpa"
    assert training.sequence_packing_max_samples == 1000
    assert training.split_across_pack is False

    # LoRA defaults: the block is present, but the new knobs are untouched.
    peft = training.peft
    assert peft is not None
    assert peft.exclude_modules is None
    assert peft.use_triton is True
def test_compile_training_step_carries_pass2_fields() -> None:
    """Pass-2 hyperparameters set on the v2 ``SFTTraining`` appear in the compiled step config."""
    from nmp.automodel.app.jobs.training.compiler import compile_training_step

    lora = LoRAParams(rank=8, alpha=32, merge=False, exclude_modules=["*.out_proj"], use_triton=False)
    training = SFTTraining(
        peft=lora,
        learning_rate=1e-4,
        adam_eps=1e-6,
        optimizer="adamw",
        lr_decay_style="linear",
        attn_implementation="flash_attention_2",
        batch_size=4,
        micro_batch_size=1,
        sequence_packing=True,
        sequence_packing_max_samples=256,
        split_across_pack=True,
        max_seq_length=2048,
    )
    job_output = CustomizationJobOutput(
        model="default/test-target",
        dataset="default/my-dataset",
        training=training,
        output=OutputResponse(name="out", type="adapter", fileset="out-fs"),
    )

    step = compile_training_step(job_output, base_env=[], me=_make_mock_model_entity())
    # The step may expose its config as an attribute or as a mapping entry; accept either.
    if hasattr(step, "config"):
        cfg = step.config
    else:
        cfg = step["config"]

    optimizer_cfg = cfg["optimizer"]
    assert optimizer_cfg["optimizer_name"] == "adamw"
    assert optimizer_cfg["lr_decay_style"] == "linear"
    assert optimizer_cfg["eps"] == 1e-6

    assert cfg["model"]["attn_implementation"] == "flash_attention_2"

    batch_cfg = cfg["batch"]
    assert batch_cfg["sequence_packing_max_samples"] == 256
    assert batch_cfg["split_across_pack"] is True

    lora_cfg = cfg["training"]["lora"]
    assert lora_cfg["exclude_modules"] == ["*.out_proj"]
    assert lora_cfg["use_triton"] is False
b/services/unsloth/src/nmp/unsloth/schemas.py index 413be132fa..6ecc5d79cc 100644 --- a/services/unsloth/src/nmp/unsloth/schemas.py +++ b/services/unsloth/src/nmp/unsloth/schemas.py @@ -29,7 +29,7 @@ from __future__ import annotations -from typing import Literal +from typing import Any, Literal from nemo_platform_plugin.integrations import IntegrationsSpec from pydantic import BaseModel, ConfigDict, Field @@ -74,6 +74,14 @@ class ModelLoadSpec(BaseModel): "multi-device experiments." ), ) + rope_scaling: dict[str, Any] | None = Field( + default=None, + description=( + "RoPE scaling config for long-context extension, passed to " + "FastLanguageModel.from_pretrained (e.g. {'type': 'linear', 'factor': 2.0}). " + "None uses the model's native context length." + ), + ) class LoRAParams(BaseModel): @@ -99,6 +107,33 @@ class LoRAParams(BaseModel): bias: Literal["none", "all", "lora_only"] = "none" use_rslora: bool = False random_state: int = 3407 + use_dora: bool = Field( + default=False, + description="DoRA (weight-decomposed LoRA). Improves quality at low ranks; adds training overhead.", + ) + loftq_config: dict[str, Any] | None = Field( + default=None, + description="LoftQ initialization config for quantized bases. None disables LoftQ.", + ) + modules_to_save: list[str] | None = Field( + default=None, + description=( + "Extra non-LoRA modules to train and save in full (e.g. ['embed_tokens', 'lm_head']). " + "Needed for vocab changes / continued pretraining." + ), + ) + layers_to_transform: int | list[int] | None = Field( + default=None, + description="Restrict LoRA to specific layer index(es). None applies to all layers.", + ) + layer_replication: list[list[int]] | None = Field( + default=None, + description="Layer-replication ranges for stacking, e.g. [[0, 16], [8, 24]]. None disables.", + ) + init_lora_weights: bool | Literal["gaussian", "pissa", "olora", "loftq"] = Field( + default=True, + description="LoRA weight init scheme. 
True = PEFT default; 'pissa'/'olora'/'loftq' for advanced inits.", + ) class TrainingSpec(BaseModel): @@ -170,6 +205,13 @@ class ScheduleSpec(BaseModel): save_steps: int | None = Field(default=None, gt=0) eval_steps: int | None = Field(default=None, gt=0) seed: int = 3407 + lr_scheduler_kwargs: dict[str, Any] | None = Field( + default=None, + description=( + "Extra kwargs for the LR scheduler, e.g. {'num_cycles': 3} for cosine_with_restarts. " + "None uses scheduler defaults." + ), + ) class BatchSpec(BaseModel): @@ -194,6 +236,18 @@ class OptimizerSpec(BaseModel): "paged_adamw_8bit", "sgd", ] = "adamw_8bit" + adam_beta1: float = Field(default=0.9, ge=0.0, lt=1.0, description="Adam/AdamW beta1.") + adam_beta2: float = Field(default=0.999, ge=0.0, lt=1.0, description="Adam/AdamW beta2.") + adam_epsilon: float = Field(default=1e-8, gt=0.0, description="Adam/AdamW epsilon for numerical stability.") + max_grad_norm: float = Field(default=1.0, ge=0.0, description="Gradient-clipping max norm (TRL default 1.0).") + label_smoothing_factor: float = Field( + default=0.0, ge=0.0, lt=1.0, description="Label smoothing for the cross-entropy loss. 0.0 disables." + ) + neftune_noise_alpha: float | None = Field( + default=None, + ge=0.0, + description="NEFTune embedding-noise alpha (quality boost). None disables.", + ) class HardwareSpec(BaseModel): diff --git a/services/unsloth/src/nmp/unsloth/tasks/training/backends/unsloth_sft.py b/services/unsloth/src/nmp/unsloth/tasks/training/backends/unsloth_sft.py index 2ca3532560..c26dfcd6fb 100644 --- a/services/unsloth/src/nmp/unsloth/tasks/training/backends/unsloth_sft.py +++ b/services/unsloth/src/nmp/unsloth/tasks/training/backends/unsloth_sft.py @@ -67,7 +67,7 @@ def build_model_load_kwargs(spec: UnslothJobOutput, resolved_model: str) -> dict takes its default LoRA-optimized load path and warns that full finetuning was not requested, leaving the all-weights run mis-configured. 
""" - return { + kwargs: dict[str, Any] = { "model_name": resolved_model, "max_seq_length": spec.model.max_seq_length, "load_in_4bit": spec.model.load_in_4bit, @@ -76,6 +76,45 @@ def build_model_load_kwargs(spec: UnslothJobOutput, resolved_model: str) -> dict "trust_remote_code": spec.model.trust_remote_code, "device_map": spec.model.device_map if spec.model.device_map is not None else {"": 0}, } + # Only pass rope_scaling when set — None lets Unsloth use the model's native context length. + if spec.model.rope_scaling is not None: + kwargs["rope_scaling"] = spec.model.rope_scaling + return kwargs + + +def build_peft_kwargs(spec: UnslothJobOutput, *, gradient_checkpointing: bool | str) -> dict[str, Any]: + """Assemble ``FastLanguageModel.get_peft_model`` kwargs for a LoRA run. + + Torch-free (unit-testable). Caller resolves ``gradient_checkpointing`` from + ``spec.training.use_gradient_checkpointing`` (the JSON literal → ``True`` / + ``False`` / ``"unsloth"`` mapping). Optional knobs (``loftq_config``, + ``modules_to_save``, ``layers_to_transform``, ``layer_replication``) are only + emitted when set so PEFT/Unsloth see absence, not ``None``. 
+ """ + lora = spec.training.lora + assert lora is not None # validated by UnslothJobInput + kwargs: dict[str, Any] = { + "r": lora.rank, + "lora_alpha": lora.alpha, + "lora_dropout": lora.dropout, + "target_modules": list(lora.target_modules), + "bias": lora.bias, + "use_rslora": lora.use_rslora, + "random_state": lora.random_state, + "use_dora": lora.use_dora, + "init_lora_weights": lora.init_lora_weights, + "use_gradient_checkpointing": gradient_checkpointing, + "max_seq_length": spec.model.max_seq_length, + } + if lora.loftq_config is not None: + kwargs["loftq_config"] = lora.loftq_config + if lora.modules_to_save is not None: + kwargs["modules_to_save"] = lora.modules_to_save + if lora.layers_to_transform is not None: + kwargs["layers_to_transform"] = lora.layers_to_transform + if lora.layer_replication is not None: + kwargs["layer_replication"] = lora.layer_replication + return kwargs def train_sft( @@ -183,15 +222,7 @@ def train_sft( gc_value = False model = FastLanguageModel.get_peft_model( model, - r=spec.training.lora.rank, - lora_alpha=spec.training.lora.alpha, - lora_dropout=spec.training.lora.dropout, - target_modules=list(spec.training.lora.target_modules), - bias=spec.training.lora.bias, - use_rslora=spec.training.lora.use_rslora, - random_state=spec.training.lora.random_state, - use_gradient_checkpointing=gc_value, - max_seq_length=spec.model.max_seq_length, + **build_peft_kwargs(spec, gradient_checkpointing=gc_value), ) # All-weights FT: leave `model` as-is. 
`build_model_load_kwargs` passed # `full_finetuning=True`, so `from_pretrained` routed through Unsloth's @@ -251,6 +282,11 @@ def train_sft( "learning_rate": spec.optimizer.learning_rate, "weight_decay": spec.optimizer.weight_decay, "optim": spec.optimizer.optim, + "adam_beta1": spec.optimizer.adam_beta1, + "adam_beta2": spec.optimizer.adam_beta2, + "adam_epsilon": spec.optimizer.adam_epsilon, + "max_grad_norm": spec.optimizer.max_grad_norm, + "label_smoothing_factor": spec.optimizer.label_smoothing_factor, "lr_scheduler_type": spec.schedule.lr_scheduler_type, "warmup_steps": spec.schedule.warmup_steps, "logging_steps": spec.schedule.logging_steps, @@ -263,6 +299,11 @@ def train_sft( "max_length": spec.model.max_seq_length, "packing": spec.dataset.packing, } + # Optional knobs: only set when provided so trl/transformers keep their defaults. + if spec.optimizer.neftune_noise_alpha is not None: + args_kwargs["neftune_noise_alpha"] = spec.optimizer.neftune_noise_alpha + if spec.schedule.lr_scheduler_kwargs is not None: + args_kwargs["lr_scheduler_kwargs"] = spec.schedule.lr_scheduler_kwargs if spec.schedule.warmup_ratio is not None: args_kwargs["warmup_ratio"] = spec.schedule.warmup_ratio # epochs always set (defaults to 1); max_steps, when present, caps/overrides it (trl semantics). diff --git a/services/unsloth/tests/test_model_load_kwargs.py b/services/unsloth/tests/test_model_load_kwargs.py index 6f4c22a0be..0cc812316b 100644 --- a/services/unsloth/tests/test_model_load_kwargs.py +++ b/services/unsloth/tests/test_model_load_kwargs.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -"""Tests for ``build_model_load_kwargs`` in the unsloth SFT driver. +"""Tests for ``build_model_load_kwargs`` / ``build_peft_kwargs`` in the unsloth SFT driver. 
These exercise the torch-free kwargs assembly only, so the module imports fine on a CPU box without ``unsloth``/``torch`` installed. @@ -9,21 +9,30 @@ from __future__ import annotations +from typing import Any + from nmp.unsloth.schemas import ( DatasetSpec, + LoRAParams, ModelLoadSpec, OutputResponse, TrainingSpec, UnslothJobOutput, ) -from nmp.unsloth.tasks.training.backends.unsloth_sft import build_model_load_kwargs +from nmp.unsloth.tasks.training.backends.unsloth_sft import build_model_load_kwargs, build_peft_kwargs -def _spec(*, finetuning_type: str, load_in_4bit: bool) -> UnslothJobOutput: +def _spec( + *, + finetuning_type: str = "lora", + load_in_4bit: bool = True, + model_extra: dict[str, Any] | None = None, + lora: LoRAParams | None = None, +) -> UnslothJobOutput: return UnslothJobOutput( - model=ModelLoadSpec(name="meta/llama-3.1-8b", load_in_4bit=load_in_4bit), + model=ModelLoadSpec(name="meta/llama-3.1-8b", load_in_4bit=load_in_4bit, **(model_extra or {})), dataset=DatasetSpec(path="ws/train"), - training=TrainingSpec(finetuning_type=finetuning_type), + training=TrainingSpec(finetuning_type=finetuning_type, lora=lora), output=OutputResponse(name="out", type="model", save_method="lora", fileset="out"), ) @@ -47,3 +56,44 @@ def test_dtype_not_included() -> None: # dtype mapping needs torch and stays in train_sft; the helper must not emit it. 
def test_rope_scaling_omitted_when_none() -> None:
    """With ``rope_scaling`` left unset, the load kwargs carry no ``rope_scaling`` key."""
    load_kwargs = build_model_load_kwargs(_spec(), "/local/model")
    assert "rope_scaling" not in load_kwargs


def test_rope_scaling_passed_when_set() -> None:
    """A configured ``rope_scaling`` dict is forwarded unchanged in the load kwargs."""
    configured = _spec(model_extra={"rope_scaling": {"type": "linear", "factor": 2.0}})
    load_kwargs = build_model_load_kwargs(configured, "/local/model")
    assert load_kwargs["rope_scaling"] == {"type": "linear", "factor": 2.0}


def test_peft_kwargs_defaults_preserve_behavior() -> None:
    """Default ``LoRAParams`` emit the default knobs and leave the optional keys out entirely."""
    peft_kwargs = build_peft_kwargs(_spec(lora=LoRAParams()), gradient_checkpointing="unsloth")

    assert peft_kwargs["r"] == 16
    assert peft_kwargs["use_dora"] is False
    assert peft_kwargs["init_lora_weights"] is True
    assert peft_kwargs["use_gradient_checkpointing"] == "unsloth"

    # Optional knobs must be absent, not present with a ``None`` value.
    optional_keys = ("loftq_config", "modules_to_save", "layers_to_transform", "layer_replication")
    assert all(key not in peft_kwargs for key in optional_keys)


def test_peft_kwargs_emits_optional_fields_when_set() -> None:
    """Each optional LoRA knob that is set shows up in the kwargs with its configured value."""
    configured_lora = LoRAParams(
        use_dora=True,
        init_lora_weights="pissa",
        modules_to_save=["embed_tokens", "lm_head"],
        layers_to_transform=[0, 1, 2],
        layer_replication=[[0, 16], [8, 24]],
        loftq_config={"loftq_bits": 4},
    )
    peft_kwargs = build_peft_kwargs(_spec(lora=configured_lora), gradient_checkpointing=True)

    # Booleans are checked by identity so a truthy non-bool cannot slip through.
    assert peft_kwargs["use_dora"] is True
    assert peft_kwargs["use_gradient_checkpointing"] is True

    expected_values = {
        "init_lora_weights": "pissa",
        "modules_to_save": ["embed_tokens", "lm_head"],
        "layers_to_transform": [0, 1, 2],
        "layer_replication": [[0, 16], [8, 24]],
        "loftq_config": {"loftq_bits": 4},
    }
    for key, expected in expected_values.items():
        assert peft_kwargs[key] == expected