From 7ccfecc4f554d334d29d873006714009fced4da2 Mon Sep 17 00:00:00 2001 From: jwilber Date: Tue, 23 Dec 2025 11:12:41 -0800 Subject: [PATCH 1/2] Use shared storage dir Signed-off-by: jwilber --- .../configs/recipes/esm2_native_te_15b.yaml | 13 ++++++++++++- .../configs/recipes/esm2_native_te_3b.yaml | 7 ++++++- .../configs/recipes/esm2_native_te_650m.yaml | 5 ++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml index ada591aaf4..aa17f47ab4 100644 --- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml +++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml @@ -13,6 +13,7 @@ device_type: gpu num_devices: 8 gpu_type: h100-sxm resource_shape: "${device_type}.${num_devices}x${gpu_type}" +log_to_kratos: False ############################################################ # kratos info: where to log data @@ -62,9 +63,11 @@ model_tag: nvidia/esm2_t48_15B_UR50D num_train_steps: 20_000 # dataset commands micro_batch_size: 8 -load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data +load_dataset_kwargs_path: parquet load_dataset_kwargs_streaming: true load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret +load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret + num_workers: 1 # lr commands @@ -129,6 +132,10 @@ products: # This gets called right after `checkout_script` in the base config. ############################################################ run_script: | + echo "looking in data dir" + ls -la /data + + wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh; chmod +x init.sh; source init.sh; @@ -148,6 +155,10 @@ run_script: | wandb_init_args.name=${wandb_name} \ num_train_steps=${num_train_steps} \ dataset.micro_batch_size=${micro_batch_size} \ + dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \ + dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \ + +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \ + +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \ use_sequence_packing=${thd_enabled} \ dataset.num_workers=${num_workers} \ lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \ diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml index 731884f24c..1780ca0a4b 100644 --- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml +++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml @@ -61,9 +61,10 @@ model_tag: nvidia/esm2_t36_3B_UR50D num_train_steps: 20_000 # dataset commands micro_batch_size: 16 -load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data +load_dataset_kwargs_path: parquet load_dataset_kwargs_streaming: true load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret +load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret # lr commands num_warmup_steps: 2_000 @@ -110,6 +111,9 @@ products: # This gets called right after `checkout_script` in the base config. ############################################################ run_script: | + echo "looking in data dir" + ls -la /data + wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh; chmod +x init.sh; source init.sh; @@ -133,6 +137,7 @@ run_script: | dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \ dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \ +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \ + +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \ lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \ checkpoint.ckpt_dir=${ckpt_dir} \ checkpoint.save_final_model=${save_final_model} \ diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml index b3a1d47b1f..8e4dcbe6d8 100644 --- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml +++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml @@ -61,9 +61,11 @@ model_tag: nvidia/esm2_t36_650M_UR50D num_train_steps: 20_000 # dataset commands micro_batch_size: 16 -load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data +load_dataset_kwargs_path: parquet load_dataset_kwargs_streaming: true load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret +load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret + num_workers: 1 # lr commands @@ -130,6 +132,7 @@ run_script: | dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \ dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \ +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \ + +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \ dataset.num_workers=${num_workers} \ lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \ checkpoint.ckpt_dir=${ckpt_dir} \ From fd6b599ed17d258d4360937b8da5fbf1eb27e8ab Mon Sep 17 00:00:00 2001 From: jwilber Date: Tue, 23 Dec 2025 11:14:44 -0800 Subject: [PATCH 2/2] remove ls statement Signed-off-by: jwilber --- .../configs/recipes/esm2_native_te_15b.yaml | 5 ----- .../model_convergence/configs/recipes/esm2_native_te_3b.yaml | 3 --- 2 files changed, 8 deletions(-) diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml index aa17f47ab4..b193e273e0 100644 --- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml +++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml @@ -13,7 +13,6 @@ device_type: gpu num_devices: 8 gpu_type: h100-sxm resource_shape: "${device_type}.${num_devices}x${gpu_type}" -log_to_kratos: False ############################################################ # kratos info: where to log data @@ -132,10 +131,6 @@ products: # This gets called right after `checkout_script` in the base config. ############################################################ run_script: | - echo "looking in data dir" - ls -la /data - - wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh; chmod +x init.sh; source init.sh; diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml index 1780ca0a4b..7b1525fba7 100644 --- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml +++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml @@ -111,9 +111,6 @@ products: # This gets called right after `checkout_script` in the base config. ############################################################ run_script: | - echo "looking in data dir" - ls -la /data - wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh; chmod +x init.sh; source init.sh;