From d739bb9f938a746f6917e85e25b051ca33ba6a59 Mon Sep 17 00:00:00 2001
From: jwilber <jwilber@nvidia.com>
Date: Thu, 5 Feb 2026 14:56:40 -0800
Subject: [PATCH 1/4] draft llama3 configs

Signed-off-by: jwilber <jwilber@nvidia.com>
---
 .../configs/recipes/llama3_native_te_1b.yaml  | 183 ++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml

diff --git a/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml
new file mode 100644
index 000000000..98dd3916a
--- /dev/null
+++ b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml
@@ -0,0 +1,183 @@
+# @package _global_
+defaults:
+  - /base
+  - _self_
+
+############################################################
+# lepton job info
+############################################################
+node_group: yo-bom-lepton-001
+mount_from: node-nfs:fs1
+num_nodes: 1
+device_type: gpu
+num_devices: 8
+gpu_type: h100-sxm
+resource_shape: "${device_type}.${num_devices}x${gpu_type}"
+
+############################################################
+# kratos info: where to log data
+############################################################
+kratos_subject: "convergence_tests_v0.0.3"
+
+############################################################
+# recipe identifiers
+# mostly used for logging and observability
+############################################################
+recipe_subdir: llama3_native_te
+model_type: llama3
+variant: train
+
+# Core identifiers for filtering
+framework: native
+precision: bf16
+te_enabled: true
+fp8_enabled: false
+fp8_recipe: ""
+fp8_format: ""
+cp_enabled: false
+thd_enabled: false
+
+# Catchall for additional features/configs
+extras: []
+
+############################################################
+# wandb info (total_gpus used for group name)
+############################################################
+total_gpus: ${multiply:${num_devices},${num_nodes}}
+
+wandb_init_args:
+  project: "test_convergence__recipes__${sanitize:${branch}}"
+  group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
+  job_type: "${recipe_subdir}"
+  name: null
+
+############################################################
+# task commands
+# shared across all products (if not explicitly overridden)
+# Matches L2_lingua_1b.yaml defaults
+############################################################
+config: L2_lingua_1b
+task_cmd: train_fsdp2
+
+# Training parameters
+num_train_steps: 10_000
+use_torch_compile: false
+use_meta_device: true
+use_sequence_packing: true
+
+# Dataset parameters (from L2_lingua_1b)
+micro_batch_size: 4
+max_seq_length: 4096
+num_workers: 8
+stride: 512
+buffer_size: 50_000
+
+# Optimizer (from L2_lingua_1b)
+lr: 0.003
+weight_decay: 0.033
+
+# LR scheduler
+num_warmup_steps: 1_000
+num_decay_steps: 9_000
+
+# Checkpoint controls
+ckpt_dir: ""
+save_final_model: false
+resume_from_checkpoint: false
+save_every_n_steps: 10_000
+
+# Context parallelism
+cp_size: 1
+
+############################################################
+# Each product is a different config to run, alongside
+# config-specific arguments. Must have a `wandb_name`.
+############################################################
+products:
+  # Lingua 1B baseline - FSDP2 with THD (sequence packing)
+  - config: L2_lingua_1b
+    task_cmd: train_fsdp2
+    thd_enabled: true
+    use_sequence_packing: true
+    fp8_enabled: false
+    cp_enabled: false
+    wandb_name: "llama3_lingua_1b__fsdp2__thd__${now:%Y%m%d-%H%M%S}__${gitsha:}"
+    job_name: "llama3-lingua-1b-fsdp2-thd"
+
+  # Lingua 1B - FSDP2 with FP8 + THD
+  - config: L2_lingua_1b
+    task_cmd: train_fsdp2
+    thd_enabled: true
+    use_sequence_packing: true
+    fp8_enabled: true
+    fp8_recipe: transformer_engine.common.recipe.DelayedScaling
+    fp8_format: HYBRID
+    cp_enabled: false
+    wandb_name: "llama3_lingua_1b__fsdp2__thd__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"
+    job_name: "llama3-lingua-1b-fsdp2-thd-fp8"
+
+  # Lingua 1B - FSDP2 with Context Parallelism
+  - config: L2_lingua_1b
+    task_cmd: train_fsdp2_cp
+    thd_enabled: false
+    use_sequence_packing: false
+    fp8_enabled: false
+    cp_enabled: true
+    cp_size: 2
+    wandb_name: "llama3_lingua_1b__fsdp2__cp__${now:%Y%m%d-%H%M%S}__${gitsha:}"
+    job_name: "llama3-lingua-1b-fsdp2-cp"
+
+  # Lingua 1B - FSDP2 with Context Parallelism + FP8
+  - config: L2_lingua_1b
+    task_cmd: train_fsdp2_cp
+    thd_enabled: false
+    use_sequence_packing: false
+    fp8_enabled: true
+    fp8_recipe: transformer_engine.common.recipe.DelayedScaling
+    fp8_format: HYBRID
+    cp_enabled: true
+    cp_size: 2
+    wandb_name: "llama3_lingua_1b__fsdp2__cp__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"
+    job_name: "llama3-lingua-1b-fsdp2-cp-fp8"
+
+############################################################
+# run script
+# This gets called right after `checkout_script` in the base config.
+############################################################
+run_script: |
+  wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh;
+  chmod +x init.sh;
+  source init.sh;
+
+  HYDRA_FULL_ERROR=1 torchrun \
+    --nnodes=$NNODES \
+    --nproc_per_node=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --master_port=$MASTER_PORT \
+    ${task_cmd}.py \
+    --config-name ${config}.yaml \
+    +wandb.mode=${wandb_init_args.mode} \
+    +wandb.project=${wandb_init_args.project} \
+    +wandb.name=${wandb_name} \
+    num_train_steps=${num_train_steps} \
+    use_torch_compile=${use_torch_compile} \
+    use_meta_device=${use_meta_device} \
+    use_sequence_packing=${use_sequence_packing} \
+    cp_size=${cp_size} \
+    dataset.micro_batch_size=${micro_batch_size} \
+    dataset.max_seq_length=${max_seq_length} \
+    dataset.num_workers=${num_workers} \
+    dataset.stride=${stride} \
+    dataset.buffer_size=${buffer_size} \
+    adamw_kwargs.lr=${lr} \
+    adamw_kwargs.weight_decay=${weight_decay} \
+    lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \
+    lr_scheduler_kwargs.num_decay_steps=${num_decay_steps} \
+    checkpoint.ckpt_dir=${ckpt_dir} \
+    checkpoint.save_final_model=${save_final_model} \
+    checkpoint.resume_from_checkpoint=${resume_from_checkpoint} \
+    checkpoint.save_every_n_steps=${save_every_n_steps} \
+    fp8_config.enabled=${fp8_enabled} \
+    fp8_config.fp8_recipe=${fp8_recipe} \
+    fp8_config.fp8_format=${fp8_format}

From e6aee3aac449ac76ceb54b56689c46415c141190 Mon Sep 17 00:00:00 2001
From: jwilber <jwilber@nvidia.com>
Date: Thu, 5 Feb 2026 16:12:43 -0800
Subject: [PATCH 2/4] update per Peter's recs

Signed-off-by: jwilber <jwilber@nvidia.com>
---
 .../configs/recipes/llama3_native_te_1b.yaml  | 125 +++++-------------
 1 file changed, 30 insertions(+), 95 deletions(-)

diff --git a/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml
index 98dd3916a..699879124 100644
--- a/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml
+++ b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml
@@ -8,7 +8,7 @@ defaults:
 ############################################################
 node_group: yo-bom-lepton-001
 mount_from: node-nfs:fs1
-num_nodes: 1
+num_nodes: 2
 device_type: gpu
 num_devices: 8
 gpu_type: h100-sxm
@@ -29,13 +29,11 @@ variant: train
 
 # Core identifiers for filtering
 framework: native
-precision: bf16
+precision: fp8
 te_enabled: true
-fp8_enabled: false
-fp8_recipe: ""
-fp8_format: ""
-cp_enabled: false
-thd_enabled: false
+fp8_enabled: true
+cp_enabled: true
+thd_enabled: true
 
 # Catchall for additional features/configs
 extras: []
@@ -53,92 +51,38 @@ wandb_init_args:
 
 ############################################################
 # task commands
-# shared across all products (if not explicitly overridden)
-# Matches L2_lingua_1b.yaml defaults
+# Matches the lingua 1B perf test run
 ############################################################
 config: L2_lingua_1b
-task_cmd: train_fsdp2
+task_cmd: train_fsdp2_cp
 
 # Training parameters
 num_train_steps: 10_000
 use_torch_compile: false
 use_meta_device: true
 use_sequence_packing: true
+grad_acc_steps: 4
 
-# Dataset parameters (from L2_lingua_1b)
-micro_batch_size: 4
-max_seq_length: 4096
+# Dataset parameters
+micro_batch_size: 8
 num_workers: 8
-stride: 512
-buffer_size: 50_000
-
-# Optimizer (from L2_lingua_1b)
-lr: 0.003
-weight_decay: 0.033
-
-# LR scheduler
-num_warmup_steps: 1_000
-num_decay_steps: 9_000
+dataset_path: /data/pstjohn/dclm-baseline-1.0-parquet
+pad_sequences_to_be_divisible_by: 32
 
 # Checkpoint controls
-ckpt_dir: ""
-save_final_model: false
-resume_from_checkpoint: false
-save_every_n_steps: 10_000
+ckpt_dir: "null" # quoted: interpolated into checkpoint.ckpt_dir=${ckpt_dir}; a YAML null would render as the string "None" (TODO confirm launcher rendering)
 
 # Context parallelism
-cp_size: 1
+cp_size: 2
+
+# FP8 config
+fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling
 
 ############################################################
-# Each product is a different config to run, alongside
-# config-specific arguments. Must have a `wandb_name`.
+# Job info
 ############################################################
-products:
-  # Lingua 1B baseline - FSDP2 with THD (sequence packing)
-  - config: L2_lingua_1b
-    task_cmd: train_fsdp2
-    thd_enabled: true
-    use_sequence_packing: true
-    fp8_enabled: false
-    cp_enabled: false
-    wandb_name: "llama3_lingua_1b__fsdp2__thd__${now:%Y%m%d-%H%M%S}__${gitsha:}"
-    job_name: "llama3-lingua-1b-fsdp2-thd"
-
-  # Lingua 1B - FSDP2 with FP8 + THD
-  - config: L2_lingua_1b
-    task_cmd: train_fsdp2
-    thd_enabled: true
-    use_sequence_packing: true
-    fp8_enabled: true
-    fp8_recipe: transformer_engine.common.recipe.DelayedScaling
-    fp8_format: HYBRID
-    cp_enabled: false
-    wandb_name: "llama3_lingua_1b__fsdp2__thd__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"
-    job_name: "llama3-lingua-1b-fsdp2-thd-fp8"
-
-  # Lingua 1B - FSDP2 with Context Parallelism
-  - config: L2_lingua_1b
-    task_cmd: train_fsdp2_cp
-    thd_enabled: false
-    use_sequence_packing: false
-    fp8_enabled: false
-    cp_enabled: true
-    cp_size: 2
-    wandb_name: "llama3_lingua_1b__fsdp2__cp__${now:%Y%m%d-%H%M%S}__${gitsha:}"
-    job_name: "llama3-lingua-1b-fsdp2-cp"
-
-  # Lingua 1B - FSDP2 with Context Parallelism + FP8
-  - config: L2_lingua_1b
-    task_cmd: train_fsdp2_cp
-    thd_enabled: false
-    use_sequence_packing: false
-    fp8_enabled: true
-    fp8_recipe: transformer_engine.common.recipe.DelayedScaling
-    fp8_format: HYBRID
-    cp_enabled: true
-    cp_size: 2
-    wandb_name: "llama3_lingua_1b__fsdp2__cp__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"
-    job_name: "llama3-lingua-1b-fsdp2-cp-fp8"
+job_name: "llama3-lingua-1b-fsdp2-cp-fp8"
+wandb_name: "llama3_lingua_1b__fsdp2__cp__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"
 
 ############################################################
 # run script
@@ -151,33 +95,24 @@ run_script: |
 
   HYDRA_FULL_ERROR=1 torchrun \
     --nnodes=$NNODES \
-    --nproc_per_node=$(nvidia-smi --query-gpu=gpu_name --format=csv,noheader | wc -l) \
+    --nproc_per_node=${num_devices} \
     --node_rank=$NODE_RANK \
     --master_addr=$MASTER_ADDR \
-    --master_port=$MASTER_PORT \
+    --master_port=29400 \
     ${task_cmd}.py \
     --config-name ${config}.yaml \
     +wandb.mode=${wandb_init_args.mode} \
     +wandb.project=${wandb_init_args.project} \
+    +wandb.job_type=${wandb_init_args.job_type} \
     +wandb.name=${wandb_name} \
-    num_train_steps=${num_train_steps} \
-    use_torch_compile=${use_torch_compile} \
-    use_meta_device=${use_meta_device} \
-    use_sequence_packing=${use_sequence_packing} \
-    cp_size=${cp_size} \
-    dataset.micro_batch_size=${micro_batch_size} \
-    dataset.max_seq_length=${max_seq_length} \
+    dataset.load_dataset_kwargs.path=${dataset_path} \
     dataset.num_workers=${num_workers} \
-    dataset.stride=${stride} \
-    dataset.buffer_size=${buffer_size} \
-    adamw_kwargs.lr=${lr} \
-    adamw_kwargs.weight_decay=${weight_decay} \
-    lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \
-    lr_scheduler_kwargs.num_decay_steps=${num_decay_steps} \
+    dataset.micro_batch_size=${micro_batch_size} \
+    dataset.pad_sequences_to_be_divisible_by=${pad_sequences_to_be_divisible_by} \
+    num_train_steps=${num_train_steps} \
+    grad_acc_steps=${grad_acc_steps} \
+    +cp_size=${cp_size} \
     checkpoint.ckpt_dir=${ckpt_dir} \
-    checkpoint.save_final_model=${save_final_model} \
-    checkpoint.resume_from_checkpoint=${resume_from_checkpoint} \
-    checkpoint.save_every_n_steps=${save_every_n_steps} \
     fp8_config.enabled=${fp8_enabled} \
     fp8_config.fp8_recipe=${fp8_recipe} \
-    fp8_config.fp8_format=${fp8_format}
+    hydra.verbose=True

From 75459fdd16ba6bc9bbd9d4a993e1ccdfa33e03df Mon Sep 17 00:00:00 2001
From: jwilber <jwilber@nvidia.com>
Date: Thu, 5 Feb 2026 16:53:54 -0800
Subject: [PATCH 3/4] add llama3 and update schedules

Signed-off-by: jwilber <jwilber@nvidia.com>
---
 .github/workflows/convergence-tests.yml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/convergence-tests.yml b/.github/workflows/convergence-tests.yml
index a1af8d71f..a249ddfd8 100644
--- a/.github/workflows/convergence-tests.yml
+++ b/.github/workflows/convergence-tests.yml
@@ -22,6 +22,7 @@ on:
           - esm2_native_te_3b
           - esm2_native_te_15b
           - codonfm_ptl_te
+          - llama3_native_te_1b
       branch:
         description: "Branch to use (ignored if commit SHA is provided)"
         required: true
@@ -32,14 +33,16 @@ on:
         required: false
         type: string
   schedule:
-    - cron: "0 8 * * *" # everyday at 1am PST
+    - cron: "0 8 * * 1,3,5" # Mon/Wed/Fri at 08:00 UTC = 12am PST / 1am PDT (esm2)
+    - cron: "0 8 * * 2,4" # Tue/Thu at 08:00 UTC = 12am PST / 1am PDT (llama3, codonfm)
 
 jobs:
   submit-lepton-jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        model_config: ${{ github.event_name == 'schedule' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b", "codonfm_ptl_te"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }}
+        # Mon/Wed/Fri runs esm2, Tue/Thu runs llama3 and codonfm
+        model_config: ${{ github.event_name == 'schedule' && github.event.schedule == '0 8 * * 2,4' && fromJSON('["llama3_native_te_1b", "codonfm_ptl_te"]') || github.event_name == 'schedule' && github.event.schedule == '0 8 * * 1,3,5' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }}
       fail-fast: false
     steps:
       - name: Checkout

From fb520ab472c49c5328c44d3163ef40e6fd3b494d Mon Sep 17 00:00:00 2001
From: jwilber <jwilber@nvidia.com>
Date: Thu, 5 Feb 2026 16:55:07 -0800
Subject: [PATCH 4/4] add schedules to readme

Signed-off-by: jwilber <jwilber@nvidia.com>
---
 ci/lepton/README.md | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/ci/lepton/README.md b/ci/lepton/README.md
index 13527f944..f715dac99 100644
--- a/ci/lepton/README.md
+++ b/ci/lepton/README.md
@@ -4,6 +4,24 @@ This directory holds code required for triggering automated partial-convergence/
 
 The dashboards may be viewd at the (internal only) url: [nv/bionemo-dashboards](https://nv/bionemo-dashboards).
 
+The scheduled convergence tests currently run as follows (GitHub Actions cron times are in UTC):
+
+```text
+┌─────────────────────┬───────────────────────┐
+│        Model        │       Schedule        │
+├─────────────────────┼───────────────────────┤
+│ esm2_native_te_650m │ Mon/Wed/Fri (8am UTC) │
+├─────────────────────┼───────────────────────┤
+│ esm2_native_te_15b  │ Mon/Wed/Fri (8am UTC) │
+├─────────────────────┼───────────────────────┤
+│ llama3_native_te_1b │ Tue/Thu (8am UTC)     │
+├─────────────────────┼───────────────────────┤
+│ codonfm_ptl_te      │ Tue/Thu (8am UTC)     │
+└─────────────────────┴───────────────────────┘
+```
+
+In addition, scdl-dataloader runs nightly on a CPU runner.
+
 ## Overview
 
 Currently, there are two ongoing benchmark runs, each triggered nightly: