Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions .github/workflows/convergence-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ on:
- esm2_native_te_3b
- esm2_native_te_15b
- codonfm_ptl_te
- llama3_native_te_1b
branch:
description: "Branch to use (ignored if commit SHA is provided)"
required: true
Expand All @@ -32,14 +33,16 @@ on:
required: false
type: string
schedule:
- cron: "0 8 * * *" # everyday at 1am PST
- cron: "0 8 * * 1,3,5" # Mon/Wed/Fri at 1am PST (esm2)
- cron: "0 8 * * 2,4" # Tue/Thu at 1am PST (llama3, codonfm)

jobs:
submit-lepton-jobs:
runs-on: ubuntu-latest
strategy:
matrix:
model_config: ${{ github.event_name == 'schedule' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b", "codonfm_ptl_te"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }}
# Mon/Wed/Fri runs esm2, Tue/Thu runs llama3 and codonfm
model_config: ${{ github.event_name == 'schedule' && github.event.schedule == '0 8 * * 2,4' && fromJSON('["llama3_native_te_1b", "codonfm_ptl_te"]') || github.event_name == 'schedule' && github.event.schedule == '0 8 * * 1,3,5' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }}
fail-fast: false
steps:
- name: Checkout
Expand Down
18 changes: 18 additions & 0 deletions ci/lepton/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,24 @@ This directory holds code required for triggering automated partial-convergence/

The dashboards may be viewed at the (internal-only) URL: [nv/bionemo-dashboards](https://nv/bionemo-dashboards).

They currently run on this schedule:

┌─────────────────────┬───────────────────────┐
│ Model │ Schedule │
├─────────────────────┼───────────────────────┤
│ esm2_native_te_650m │ Mon/Wed/Fri (1am PST) │
├─────────────────────┼───────────────────────┤
│ esm2_native_te_15b │ Mon/Wed/Fri (1am PST) │
├─────────────────────┼───────────────────────┤
│ llama3_native_te_1b │ Tue/Thu (1am PST) │
├─────────────────────┼───────────────────────┤
│ codonfm_ptl_te │ Tue/Thu (1am PST) │
└─────────────────────┴───────────────────────┘

with scdl-dataloader running nightly on a CPU runner.



## Overview

Currently, there are two ongoing benchmark runs, each triggered on the schedule above:
Expand Down
118 changes: 118 additions & 0 deletions ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# @package _global_
# Convergence-test recipe for the Llama3 1B model: native Transformer Engine,
# FSDP2 + context parallelism + FP8, submitted as a Lepton job.
# Values here override the shared /base config (merged via the defaults list).
defaults:
  - /base
  - _self_

############################################################
# lepton job info
############################################################
node_group: yo-bom-lepton-001
mount_from: node-nfs:fs1
num_nodes: 2
device_type: gpu
num_devices: 8  # GPUs per node; 2 nodes x 8 = 16 GPUs total (see total_gpus below)
gpu_type: h100-sxm
# Lepton resource-shape string, e.g. "gpu.8xh100-sxm"
resource_shape: "${device_type}.${num_devices}x${gpu_type}"

############################################################
# kratos info: where to log data
############################################################
kratos_subject: "convergence_tests_v0.0.3"

############################################################
# recipe identifiers
# mostly used for logging and observability
############################################################
recipe_subdir: llama3_native_te
model_type: llama3
variant: train

# Core identifiers for filtering
framework: native
precision: fp8
te_enabled: true
fp8_enabled: true
cp_enabled: true
thd_enabled: true

# Catchall for additional features/configs
extras: []

############################################################
# wandb info (total_gpus used for group name)
############################################################
total_gpus: ${multiply:${num_devices},${num_nodes}}

wandb_init_args:
  project: "test_convergence__recipes__${sanitize:${branch}}"
  group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
  job_type: "${recipe_subdir}"
  name: null
  # NOTE(review): run_script below reads ${wandb_init_args.mode}, which is not
  # set in this file — presumably inherited from /base; confirm it is defined.

############################################################
# task commands
# Matches the lingua 1B perf test run
############################################################
config: L2_lingua_1b
task_cmd: train_fsdp2_cp

# Training parameters
num_train_steps: 10_000  # underscore-grouped int (YAML 1.1 / PyYAML); equals 10000
use_torch_compile: false
use_meta_device: true
use_sequence_packing: true
grad_acc_steps: 4

# Dataset parameters
micro_batch_size: 8
num_workers: 8  # dataloader worker processes
dataset_path: /data/pstjohn/dclm-baseline-1.0-parquet
pad_sequences_to_be_divisible_by: 32

# Checkpoint controls
ckpt_dir: null  # null is passed through to checkpoint.ckpt_dir — TODO confirm consumer treats it as "disabled/default"

# Context parallelism
cp_size: 2

# FP8 config
fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling

############################################################
# Job info
############################################################
job_name: "llama3-lingua-1b-fsdp2-cp-fp8"
wandb_name: "llama3_lingua_1b__fsdp2__cp__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"

############################################################
# run script
# This gets called right after `checkout_script` in the base config.
# NOTE(review): --nproc_per_node is hard-coded to 8; keep in sync with
# num_devices above if the resource shape ever changes.
############################################################
run_script: |
  wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh;
  chmod +x init.sh;
  source init.sh;

  HYDRA_FULL_ERROR=1 torchrun \
    --nnodes=$NNODES \
    --nproc_per_node=8 \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --master_port=29400 \
    ${task_cmd}.py \
    --config-name ${config}.yaml \
    +wandb.mode=${wandb_init_args.mode} \
    +wandb.project=${wandb_init_args.project} \
    +wandb.job_type=${wandb_init_args.job_type} \
    +wandb.name=${wandb_name} \
    dataset.load_dataset_kwargs.path=${dataset_path} \
    dataset.num_workers=${num_workers} \
    dataset.micro_batch_size=${micro_batch_size} \
    dataset.pad_sequences_to_be_divisible_by=${pad_sequences_to_be_divisible_by} \
    num_train_steps=${num_train_steps} \
    grad_acc_steps=${grad_acc_steps} \
    +cp_size=${cp_size} \
    checkpoint.ckpt_dir=${ckpt_dir} \
    fp8_config.enabled=${fp8_enabled} \
    fp8_config.fp8_recipe=${fp8_recipe} \
    hydra.verbose=True