From 7ccfecc4f554d334d29d873006714009fced4da2 Mon Sep 17 00:00:00 2001
From: jwilber <jwilber@nvidia.com>
Date: Tue, 23 Dec 2025 11:12:41 -0800
Subject: [PATCH 1/2] Load ESM2 pretraining dataset from shared storage dir

Signed-off-by: jwilber <jwilber@nvidia.com>
---
 .../configs/recipes/esm2_native_te_15b.yaml         | 13 ++++++++++++-
 .../configs/recipes/esm2_native_te_3b.yaml          |  7 ++++++-
 .../configs/recipes/esm2_native_te_650m.yaml        |  5 ++++-
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml
index ada591aaf4..aa17f47ab4 100644
--- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml
+++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml
@@ -13,6 +13,7 @@ device_type: gpu
 num_devices: 8
 gpu_type: h100-sxm
 resource_shape: "${device_type}.${num_devices}x${gpu_type}"
+log_to_kratos: False
 
 ############################################################
 # kratos info: where to log data
@@ -62,9 +63,11 @@ model_tag: nvidia/esm2_t48_15B_UR50D
 num_train_steps: 20_000
 # dataset commands
 micro_batch_size: 8
-load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data
+load_dataset_kwargs_path: parquet
 load_dataset_kwargs_streaming: true
 load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret
+load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995  # pragma: allowlist secret
+
 num_workers: 1
 
 # lr commands
@@ -129,6 +132,10 @@ products:
 # This gets called right after `checkout_script` in the base config.
 ############################################################
 run_script: |
+  echo "looking in data dir"
+  ls -la /data
+
+
   wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh;
   chmod +x init.sh;
   source init.sh;
@@ -148,6 +155,10 @@ run_script: |
     wandb_init_args.name=${wandb_name} \
     num_train_steps=${num_train_steps} \
     dataset.micro_batch_size=${micro_batch_size} \
+    dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \
+    dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \
+    +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \
+    +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \
     use_sequence_packing=${thd_enabled} \
     dataset.num_workers=${num_workers} \
     lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \
diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml
index 731884f24c..1780ca0a4b 100644
--- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml
+++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml
@@ -61,9 +61,10 @@ model_tag: nvidia/esm2_t36_3B_UR50D
 num_train_steps: 20_000
 # dataset commands
 micro_batch_size: 16
-load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data
+load_dataset_kwargs_path: parquet
 load_dataset_kwargs_streaming: true
 load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret
+load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret
 
 # lr commands
 num_warmup_steps: 2_000
@@ -110,6 +111,9 @@ products:
 # This gets called right after `checkout_script` in the base config.
 ############################################################
 run_script: |
+  echo "looking in data dir"
+  ls -la /data
+
   wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh;
   chmod +x init.sh;
   source init.sh;
@@ -133,6 +137,7 @@ run_script: |
     dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \
     dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \
     +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \
+    +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \
     lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \
     checkpoint.ckpt_dir=${ckpt_dir} \
     checkpoint.save_final_model=${save_final_model} \
diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml
index b3a1d47b1f..8e4dcbe6d8 100644
--- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml
+++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_650m.yaml
@@ -61,9 +61,11 @@ model_tag: nvidia/esm2_t36_650M_UR50D
 num_train_steps: 20_000
 # dataset commands
 micro_batch_size: 16
-load_dataset_kwargs_path: nvidia/esm2_uniref_pretraining_data
+load_dataset_kwargs_path: parquet
 load_dataset_kwargs_streaming: true
 load_dataset_kwargs_revision: 4ac1d2973567e46b8ca95901f4b4793a21305995 # pragma: allowlist secret
+load_dataset_kwargs_data_dir: /data/esm2/cache/hub/datasets--nvidia--esm2_uniref_pretraining_data/snapshots/4ac1d2973567e46b8ca95901f4b4793a21305995  # pragma: allowlist secret
+
 num_workers: 1
 
 # lr commands
@@ -130,6 +132,7 @@ run_script: |
     dataset.load_dataset_kwargs.path=${load_dataset_kwargs_path} \
     dataset.load_dataset_kwargs.streaming=${load_dataset_kwargs_streaming} \
     +dataset.load_dataset_kwargs.revision=${load_dataset_kwargs_revision} \
+    +dataset.load_dataset_kwargs.data_dir=${load_dataset_kwargs_data_dir} \
     dataset.num_workers=${num_workers} \
     lr_scheduler_kwargs.num_warmup_steps=${num_warmup_steps} \
     checkpoint.ckpt_dir=${ckpt_dir} \

From fd6b599ed17d258d4360937b8da5fbf1eb27e8ab Mon Sep 17 00:00:00 2001
From: jwilber <jwilber@nvidia.com>
Date: Tue, 23 Dec 2025 11:14:44 -0800
Subject: [PATCH 2/2] Remove debug ls statements and log_to_kratos override

Signed-off-by: jwilber <jwilber@nvidia.com>
---
 .../configs/recipes/esm2_native_te_15b.yaml                  | 5 -----
 .../model_convergence/configs/recipes/esm2_native_te_3b.yaml | 3 ---
 2 files changed, 8 deletions(-)

diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml
index aa17f47ab4..b193e273e0 100644
--- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml
+++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_15b.yaml
@@ -13,7 +13,6 @@ device_type: gpu
 num_devices: 8
 gpu_type: h100-sxm
 resource_shape: "${device_type}.${num_devices}x${gpu_type}"
-log_to_kratos: False
 
 ############################################################
 # kratos info: where to log data
@@ -132,10 +131,6 @@ products:
 # This gets called right after `checkout_script` in the base config.
 ############################################################
 run_script: |
-  echo "looking in data dir"
-  ls -la /data
-
-
   wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh;
   chmod +x init.sh;
   source init.sh;
diff --git a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml
index 1780ca0a4b..7b1525fba7 100644
--- a/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml
+++ b/ci/lepton/model_convergence/configs/recipes/esm2_native_te_3b.yaml
@@ -111,9 +111,6 @@ products:
 # This gets called right after `checkout_script` in the base config.
 ############################################################
 run_script: |
-  echo "looking in data dir"
-  ls -la /data
-
   wget -O init.sh https://raw.githubusercontent.com/leptonai/scripts/main/lepton_env_to_pytorch.sh;
   chmod +x init.sh;
   source init.sh;