diff --git a/.github/workflows/convergence-tests.yml b/.github/workflows/convergence-tests.yml
index a1af8d71f..a249ddfd8 100644
--- a/.github/workflows/convergence-tests.yml
+++ b/.github/workflows/convergence-tests.yml
@@ -22,6 +22,7 @@ on:
           - esm2_native_te_3b
           - esm2_native_te_15b
           - codonfm_ptl_te
+          - llama3_native_te_1b
       branch:
         description: "Branch to use (ignored if commit SHA is provided)"
         required: true
@@ -32,14 +33,16 @@ on:
         required: false
         type: string
   schedule:
-    - cron: "0 8 * * *" # everyday at 1am PST
+    - cron: "0 8 * * 1,3,5" # Mon/Wed/Fri at 1am PDT / midnight PST (esm2)
+    - cron: "0 8 * * 2,4" # Tue/Thu at 1am PDT / midnight PST (llama3, codonfm)
 
 jobs:
   submit-lepton-jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        model_config: ${{ github.event_name == 'schedule' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b", "codonfm_ptl_te"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }}
+        # Mon/Wed/Fri runs esm2, Tue/Thu runs llama3 and codonfm
+        model_config: ${{ github.event_name == 'schedule' && github.event.schedule == '0 8 * * 2,4' && fromJSON('["llama3_native_te_1b", "codonfm_ptl_te"]') || github.event_name == 'schedule' && github.event.schedule == '0 8 * * 1,3,5' && fromJSON('["esm2_native_te_650m", "esm2_native_te_15b"]') || fromJSON(format('["{0}"]', github.event.inputs.model_config)) }}
       fail-fast: false
     steps:
       - name: Checkout
diff --git a/ci/lepton/README.md b/ci/lepton/README.md
index 13527f944..f715dac99 100644
--- a/ci/lepton/README.md
+++ b/ci/lepton/README.md
@@ -4,6 +4,24 @@ This directory holds code required for triggering automated partial-convergence/
 
 The dashboards may be viewd at the (internal only) url: [nv/bionemo-dashboards](https://nv/bionemo-dashboards).
 
+They currently run on this schedule:
+
+ ┌─────────────────────┬───────────────────────┐
+ │ Model               │ Schedule              │
+ ├─────────────────────┼───────────────────────┤
+ │ esm2_native_te_650m │ Mon/Wed/Fri (1am PDT) │
+ ├─────────────────────┼───────────────────────┤
+ │ esm2_native_te_15b  │ Mon/Wed/Fri (1am PDT) │
+ ├─────────────────────┼───────────────────────┤
+ │ llama3_native_te_1b │ Tue/Thu (1am PDT)     │
+ ├─────────────────────┼───────────────────────┤
+ │ codonfm_ptl_te      │ Tue/Thu (1am PDT)     │
+ └─────────────────────┴───────────────────────┘
+
+with scdl-dataloader running nightly on a CPU runner.
+
+
+
 ## Overview
 
 Currently, there are two ongoing benchmark runs, each triggered nightly:
diff --git a/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml
new file mode 100644
index 000000000..699879124
--- /dev/null
+++ b/ci/lepton/model_convergence/configs/recipes/llama3_native_te_1b.yaml
@@ -0,0 +1,118 @@
+# @package _global_
+defaults:
+  - /base
+  - _self_
+
+############################################################
+# lepton job info
+############################################################
+node_group: yo-bom-lepton-001
+mount_from: node-nfs:fs1
+num_nodes: 2
+device_type: gpu
+num_devices: 8
+gpu_type: h100-sxm
+resource_shape: "${device_type}.${num_devices}x${gpu_type}"
+
+############################################################
+# kratos info: where to log data
+############################################################
+kratos_subject: "convergence_tests_v0.0.3"
+
+############################################################
+# recipe identifiers
+# mostly used for logging and observability
+############################################################
+recipe_subdir: llama3_native_te
+model_type: llama3
+variant: train
+
+# Core identifiers for filtering
+framework: native
+precision: fp8
+te_enabled: true
+fp8_enabled: true
+cp_enabled: true
+thd_enabled: true
+
+# Catchall for additional features/configs
+extras: []
+
+############################################################
+# wandb info (total_gpus used for group name)
+############################################################
+total_gpus: ${multiply:${num_devices},${num_nodes}}
+
+wandb_init_args:
+  project: "test_convergence__recipes__${sanitize:${branch}}"
+  group: "${model_type}__${task_cmd}__${total_gpus}gpus__${sanitize:${gpu_type}}"
+  job_type: "${recipe_subdir}"
+  name: null
+
+############################################################
+# task commands
+# Matches the lingua 1B perf test run
+############################################################
+config: L2_lingua_1b
+task_cmd: train_fsdp2_cp
+
+# Training parameters
+# NOTE: write the step count without the YAML 1.1 underscore form (10_000);
+# under a YAML 1.2 loader that would silently become the string "10_000".
+num_train_steps: 10000
+use_torch_compile: false
+use_meta_device: true
+use_sequence_packing: true
+grad_acc_steps: 4
+
+# Dataset parameters
+micro_batch_size: 8
+num_workers: 8
+dataset_path: /data/pstjohn/dclm-baseline-1.0-parquet
+pad_sequences_to_be_divisible_by: 32
+
+# Checkpoint controls
+ckpt_dir: null
+
+# Context parallelism
+cp_size: 2
+
+# FP8 config
+fp8_recipe: transformer_engine.common.recipe.Float8BlockScaling
+
+############################################################
+# Job info
+############################################################
+job_name: "llama3-lingua-1b-fsdp2-cp-fp8"
+wandb_name: "llama3_lingua_1b__fsdp2__cp__fp8__${now:%Y%m%d-%H%M%S}__${gitsha:}"
+
+############################################################
+# run script
+# This gets called right after `checkout_script` in the base config.
+############################################################
+run_script: |
+  wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh;
+  chmod +x init.sh;
+  source init.sh;
+
+  HYDRA_FULL_ERROR=1 torchrun \
+    --nnodes=$NNODES \
+    --nproc_per_node=${num_devices} \
+    --node_rank=$NODE_RANK \
+    --master_addr=$MASTER_ADDR \
+    --master_port=29400 \
+    ${task_cmd}.py \
+    --config-name ${config}.yaml \
+    +wandb.mode=${wandb_init_args.mode} \
+    +wandb.project=${wandb_init_args.project} \
+    +wandb.job_type=${wandb_init_args.job_type} \
+    +wandb.name=${wandb_name} \
+    dataset.load_dataset_kwargs.path=${dataset_path} \
+    dataset.num_workers=${num_workers} \
+    dataset.micro_batch_size=${micro_batch_size} \
+    dataset.pad_sequences_to_be_divisible_by=${pad_sequences_to_be_divisible_by} \
+    num_train_steps=${num_train_steps} \
+    grad_acc_steps=${grad_acc_steps} \
+    +cp_size=${cp_size} \
+    checkpoint.ckpt_dir=${ckpt_dir} \
+    fp8_config.enabled=${fp8_enabled} \
+    fp8_config.fp8_recipe=${fp8_recipe} \
+    hydra.verbose=True