From d739bb9f938a746f6917e85e25b051ca33ba6a59 Mon Sep 17 00:00:00 2001 From: jwilber Date: Thu, 5 Feb 2026 14:56:40 -0800 Subject: [PATCH 1/4] draft llama3 configs Signed-off-by: jwilber --- .../configs/recipes/llama3_native_te_1b.yaml | 183 ++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml diff --git a/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml new file mode 100644 index 000000000..98dd3916a --- /dev/null +++ b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml @@ -0,0 +1,183 @@ +# @package _global_ +defaults: + - /base + - _self_ + +############################################################ +# lepton job info +############################################################ +node_group: yo-bom-lepton-001 +mount_from: node-nfs:fs1 +num_nodes: 1 +device_type: gpu +num_devices: 8 +gpu_type: h100-sxm +resource_shape: "${device_type}.${num_devices}x${gpu_type}" + +############################################################ +# kratos info: where to log data +############################################################ +kratos_subject: "convergence_tests_v0.0.3" + +############################################################ +# recipe identifiers +# mostly used for logging and observability +############################################################ +recipe_subdir: llama3_native_te +model_type: llama3 +variant: train + +# Core identifiers for filtering +framework: native +precision: bf16 +te_enabled: true +fp8_enabled: false +fp8_recipe: "" +fp8_format: "" +cp_enabled: false +thd_enabled: false + +# Catchall for additional features/configs +extras: [] + +############################################################ +# wandb info (total_gpus used for group name) +############################################################ +total_gpus: ${multiply:${num_devices},${num_nodes}} + +wandb_init_args: + project: "test_convergence__recipes__${sanitize:${branch}}" + group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}" + job_type: "${recipe_subdir}" + name: null + +############################################################ +# task commands +# shared across all products (if not explicitly overridden) +# Matches L2_lingua_1b.yaml defaults +############################################################ +config: L2_lingua_1b +task_cmd: train_fsdp2 + +# Training parameters +num_train_steps: 10_000 +use_torch_compile: false +use_meta_device: true +use_sequence_packing: true + +# Dataset parameters (from L2_lingua_1b) +micro_batch_size: 4 +max_seq_length: 4096 +num_workers: 8 +stride: 512 +buffer_size: 50_000 + +# Optimizer (from L2_lingua_1b) +lr: 0.003 +weight_decay: 0.033 + +# LR scheduler +num_warmup_steps: 1_000 +num_decay_steps: 9_000 + +# Checkpoint controls +ckpt_dir: "" +save_final_model: false +resume_from_checkpoint: false +save_every_n_steps: 10_000 + +# Context parallelism +cp_size: 1 + +############################################################ +# Each product is a different config to run, alongside +# config-specific arguments. Must have a `wandb_name`. +############################################################ +products: + # Lingua 1B baseline - FSDP2 with THD (sequence packing) + - config: L2_lingua_1b + task_cmd: train_fsdp2 + thd_enabled: true + use_sequence_packing: true + fp8_enabled: false + cp_enabled: false + wandb_name: "llama3_lingua_1b__fsdp2__thd__${now:%Y%m%d-%H%M%S}__${gitsha:}" + job_name: "llama3-lingua-1b-fsdp2-thd" + + # Lingua 1B - FSDP2 with FP8 + THD + - config: L2_lingua_1b + task_cmd: train_fsdp2 + thd_enabled: true + use_sequence_packing: true + fp8_enabled: true + fp8_recipe: transformer_engine.common.recipe.DelayedScaling + fp8_format: HYBRID + cp_enabled: false + wandb_name: "llama3_lingua_1b__fsdp2__thd__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}" + job_name: "llama3-lingua-1b-fsdp2-thd-fp8" + + # Lingua 1B - FSDP2 with Context Parallelism + - config: L2_lingua_1b + task_cmd: train_fsdp2_cp + thd_enabled: false + use_sequence_packing: false + fp8_enabled: false + cp_enabled: true + cp_size: 2 + wandb_name: "llama3_lingua_1b__fsdp2__cp__${now:%Y%m%d-%H%M%S}__${gitsha:}" + job_name: "llama3-lingua-1b-fsdp2-cp" + + # Lingua 1B - FSDP2 with Context Parallelism + FP8 + - config: L2_lingua_1b + task_cmd: train_fsdp2_cp + thd_enabled: false + use_sequence_packing: false + fp8_enabled: true + fp8_recipe: transformer_engine.common.recipe.DelayedScaling + fp8_format: HYBRID + cp_enabled: true + cp_size: 2 + wandb_name: "llama3_lingua_1b__fsdp2__cp__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}" + job_name: "llama3-lingua-1b-fsdp2-cp-fp8" + +############################################################ +# run script +# This gets called right after `checkout_script` in the base config. +############################################################ +run_script: | + wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh; + chmod +x init.sh; + source init.sh; + + HYDRA_FULL_ERROR=1 torchrun \ + --nnodes=$NNODES \ + --nproc_per_node=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) \ + --node_rank=$NODE_RANK \ + --master_addr=$MASTER_ADDR \ + --master_port=$MASTER_PORT \ + ${task_cmd}.py \ + --config-name ${config}.yaml \ + +wandb.mode=${wandb_init_args.mode} \ + +wandb.project=${wandb_init_args.project} \ + +wandb.name=${wandb_name} \ + num_train_steps=${num_train_steps} \ + use_torch_compile=${use_torch_compile} \ + use_meta_device=${use_meta_device} \ + use_sequence_packing=${use_sequence_packing} \ + cp_size=${cp_size} \ + dataset.micro_batch_size=${micro_batch_size} \ + dataset.max_seq_length=${max_seq_length} \ + dataset.num_workers=${num_workers} \ + dataset.stride=${stride} \ + dataset.buffer_size=${buffer_size} \ + adamw_kwargs.lr=${lr} \ + adamw_kwargs.weight_decay=${weight_decay} \ + lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \ + lr_scheduler_kwargs.num_decay_steps=${num_decay_steps} \ + checkpoint.ckpt_dir=${ckpt_dir} \ + checkpoint.save_final_model=${save_final_model} \ + checkpoint.resume_from_checkpoint=${resume_from_checkpoint} \ + checkpoint.save_every_n_steps=${save_every_n_steps} \ + fp8_config.enabled=${fp8_enabled} \ + fp8_config.fp8_recipe=${fp8_recipe} \ + fp8_config.fp8_format=${fp8_format} From e6aee3aac449ac76ceb54b56689c46415c141190 Mon Sep 17 00:00:00 2001 From: jwilber Date: Thu, 5 Feb 2026 16:12:43 -0800 Subject: [PATCH 2/4] update per Peter's recs Signed-off-by: jwilber --- .../configs/recipes/llama3_native_te_1b.yaml | 125 +++++------------- 1 file changed, 30 insertions(+), 95 deletions(-) diff --git a/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml index 98dd3916a..699879124 100644 --- a/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml +++ b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml @@ -8,7 +8,7 @@ defaults: ############################################################ node_group: yo-bom-lepton-001 mount_from: node-nfs:fs1 -num_nodes: 1 +num_nodes: 2 device_type: gpu num_devices: 8 gpu_type: h100-sxm @@ -29,13 +29,11 @@ variant: train # Core identifiers for filtering framework: native -precision: bf16 +precision: fp8 te_enabled: true -fp8_enabled: false -fp8_recipe: "" -fp8_format: "" -cp_enabled: false -thd_enabled: false +fp8_enabled: true +cp_enabled: true +thd_enabled: true # Catchall for additional features/configs extras: [] @@ -53,92 +51,38 @@ wandb_init_args: ############################################################ # task commands -# shared across all products (if not explicitly overridden) -# Matches L2_lingua_1b.yaml defaults +# Matches the lingua 1B perf test run ############################################################ config: L2_lingua_1b -task_cmd: train_fsdp2 +task_cmd: train_fsdp2_cp # Training parameters num_train_steps: 10_000 use_torch_compile: false use_meta_device: true use_sequence_packing: true +grad_acc_steps: 4 -# Dataset parameters (from L2_lingua_1b) -micro_batch_size: 4 -max_seq_length: 4096 +# Dataset parameters +micro_batch_size: 8 num_workers: 8 -stride: 512 -buffer_size: 50_000 - -# Optimizer (from L2_lingua_1b) -lr: 0.003 -weight_decay: 0.033 - -# LR scheduler -num_warmup_steps: 1_000 -num_decay_steps: 9_000 +dataset_path: /data/pstjohn/dclm-baseline-1.0-parquet +pad_sequences_to_be_divisible_by: 32 # Checkpoint controls -ckpt_dir: "" -save_final_model: false -resume_from_checkpoint: false -save_every_n_steps: 10_000 +ckpt_dir: null # Context parallelism -cp_size: 1 +cp_size: 2 + +# FP8 config +fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling ############################################################ -# Each product is a different config to run, alongside -# config-specific arguments. Must have a `wandb_name`. +# Job info ############################################################ -products: - # Lingua 1B baseline - FSDP2 with THD (sequence packing) - - config: L2_lingua_1b - task_cmd: train_fsdp2 - thd_enabled: true - use_sequence_packing: true - fp8_enabled: false - cp_enabled: false - wandb_name: "llama3_lingua_1b__fsdp2__thd__${now:%Y%m%d-%H%M%S}__${gitsha:}" - job_name: "llama3-lingua-1b-fsdp2-thd" - - # Lingua 1B - FSDP2 with FP8 + THD - - config: L2_lingua_1b - task_cmd: train_fsdp2 - thd_enabled: true - use_sequence_packing: true - fp8_enabled: true - fp8_recipe: transformer_engine.common.recipe.DelayedScaling - fp8_format: HYBRID - cp_enabled: false - wandb_name: "llama3_lingua_1b__fsdp2__thd__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}" - job_name: "llama3-lingua-1b-fsdp2-thd-fp8" - - # Lingua 1B - FSDP2 with Context Parallelism - - config: L2_lingua_1b - task_cmd: train_fsdp2_cp - thd_enabled: false - use_sequence_packing: false - fp8_enabled: false - cp_enabled: true - cp_size: 2 - wandb_name: "llama3_lingua_1b__fsdp2__cp__${now:%Y%m%d-%H%M%S}__${gitsha:}" - job_name: "llama3-lingua-1b-fsdp2-cp" - - # Lingua 1B - FSDP2 with Context Parallelism + FP8 - - config: L2_lingua_1b - task_cmd: train_fsdp2_cp - thd_enabled: false - use_sequence_packing: false - fp8_enabled: true - fp8_recipe: transformer_engine.common.recipe.DelayedScaling - fp8_format: HYBRID - cp_enabled: true - cp_size: 2 - wandb_name: "llama3_lingua_1b__fsdp2__cp__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}" - job_name: "llama3-lingua-1b-fsdp2-cp-fp8" +job_name: "llama3-lingua-1b-fsdp2-cp-fp8" +wandb_name: "llama3_lingua_1b__fsdp2__cp__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}" ############################################################ # run script @@ -151,33 +95,24 @@ run_script: | HYDRA_FULL_ERROR=1 torchrun \ --nnodes=$NNODES \ - --nproc_per_node=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) \ + --nproc_per_node=8 \ --node_rank=$NODE_RANK \ --master_addr=$MASTER_ADDR \ - --master_port=$MASTER_PORT \ + --master_port=29400 \ ${task_cmd}.py \ --config-name ${config}.yaml \ +wandb.mode=${wandb_init_args.mode} \ +wandb.project=${wandb_init_args.project} \ + +wandb.job_type=${wandb_init_args.job_type} \ +wandb.name=${wandb_name} \ - num_train_steps=${num_train_steps} \ - use_torch_compile=${use_torch_compile} \ - use_meta_device=${use_meta_device} \ - use_sequence_packing=${use_sequence_packing} \ - cp_size=${cp_size} \ - dataset.micro_batch_size=${micro_batch_size} \ - dataset.max_seq_length=${max_seq_length} \ + dataset.load_dataset_kwargs.path=${dataset_path} \ dataset.num_workers=${num_workers} \ - dataset.stride=${stride} \ - dataset.buffer_size=${buffer_size} \ - adamw_kwargs.lr=${lr} \ - adamw_kwargs.weight_decay=${weight_decay} \ - lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \ - lr_scheduler_kwargs.num_decay_steps=${num_decay_steps} \ + dataset.micro_batch_size=${micro_batch_size} \ + dataset.pad_sequences_to_be_divisible_by=${pad_sequences_to_be_divisible_by} \ + num_train_steps=${num_train_steps} \ + grad_acc_steps=${grad_acc_steps} \ + +cp_size=${cp_size} \ checkpoint.ckpt_dir=${ckpt_dir} \ - checkpoint.save_final_model=${save_final_model} \ - checkpoint.resume_from_checkpoint=${resume_from_checkpoint} \ - checkpoint.save_every_n_steps=${save_every_n_steps} \ fp8_config.enabled=${fp8_enabled} \ fp8_config.fp8_recipe=${fp8_recipe} \ - fp8_config.fp8_format=${fp8_format} + hydra.verbose=True From 75459fdd16ba6bc9bbd9d4a993e1ccdfa33e03df Mon Sep 17 00:00:00 2001 From: jwilber Date: Thu, 5 Feb 2026 16:53:54 -0800 Subject: [PATCH 3/4] add llama3 and update schedules Signed-off-by: jwilber --- .github/workflows/convergence-tests.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/convergence-tests.yml b/.github/workflows/convergence-tests.yml index a1af8d71f..a249ddfd8 100644 --- a/.github/workflows/convergence-tests.yml +++ b/.github/workflows/convergence-tests.yml @@ -22,6 +22,7 @@ on: - esm2_native_te_3b - esm2_native_te_15b - codonfm_ptl_te + - llama3_native_te_1b branch: description: "Branch to use (ignored if commit SHA is provided)" required: true @@ -32,14 +33,16 @@ on: required: false type: string schedule: - - cron: "0 8 * * *" # everyday at 1am PST + - cron: "0 8 * * 1,3,5" # Mon/Wed/Fri at 1am PST (esm2) + - cron: "0 8 * * 2,4" # Tue/Thu at 1am PST (llama3, codonfm) jobs: submit-lepton-jobs: runs-on: ubuntu-latest strategy: matrix: - model_config: ${{ github.event_name == 'schedule' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b", "codonfm_ptl_te"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }} + # Mon/Wed/Fri runs esm2, Tue/Thu runs llama3 and codonfm + model_config: ${{ github.event_name == 'schedule' && github.event.schedule == '0 8 * * 2,4' && fromJSON('["llama3_native_te_1b", "codonfm_ptl_te"]') || github.event_name == 'schedule' && github.event.schedule == '0 8 * * 1,3,5' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }} fail-fast: false steps: - name: Checkout From fb520ab472c49c5328c44d3163ef40e6fd3b494d Mon Sep 17 00:00:00 2001 From: jwilber Date: Thu, 5 Feb 2026 16:55:07 -0800 Subject: [PATCH 4/4] add schedules to readme Signed-off-by: jwilber --- ci/lepton/README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ci/lepton/README.md b/ci/lepton/README.md index 13527f944..f715dac99 100644 --- a/ci/lepton/README.md +++ b/ci/lepton/README.md @@ -4,6 +4,24 @@ This directory holds code required for triggering automated partial-convergence/ The dashboards may be viewd at the (internal only) url: [nv/bionemo-dashboards](https://nv/bionemo-dashboards). +They currently run on this schedule: + + ┌─────────────────────┬───────────────────────┐ + │ Model │ Schedule │ + ├─────────────────────┼───────────────────────┤ + │ esm2_native_te_650m │ Mon/Wed/Fri (1am PST) │ + ├─────────────────────┼───────────────────────┤ + │ esm2_native_te_15b │ Mon/Wed/Fri (1am PST) │ + ├─────────────────────┼───────────────────────┤ + │ llama3_native_te_1b │ Tue/Thu (1am PST) │ + ├─────────────────────┼───────────────────────┤ + │ codonfm_ptl_te │ Tue/Thu (1am PST) │ + └─────────────────────┴───────────────────────┘ + +with scdl-dataloader running nightly on a cpu runner. + + + ## Overview Currently, there are two ongoing benchmark runs, each triggered nightly: