diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh
deleted file mode 100644
index 892961cb..00000000
--- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/recipe_launch_command.sh
+++ /dev/null
@@ -1 +0,0 @@
-helm install joeywan-ubench-6wsw . -f values.yaml --set-file workload_launcher=launcher.sh --set-file workload_config=/tmp/ubench_recipe/joeywan-ubench-6wsw/custom_setup_experiment.py --set workload.image=nvcr.io/nvidia/nemo:26.02 --set volumes.gcsMounts[0].bucketName=ubench-logs --set volumes.gcsMounts[0].mountPath=/job-logs --set workload.envs[0].value=/job-logs/joeywan-ubench-6wsw --set queue=a4
\ No newline at end of file
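Note: the deleted launch command hardcoded a user-specific Helm release name (joeywan-ubench-6wsw) and log bucket. For reference, the same launch pattern with placeholder values — release name, bucket, and config path below are illustrative, not taken from this change — is:

    # Sketch of the deleted launch pattern; RELEASE_NAME and LOG_BUCKET are
    # hypothetical placeholders, everything else mirrors the removed command.
    RELEASE_NAME=my-deepseek-run
    LOG_BUCKET=my-training-logs
    helm install "${RELEASE_NAME}" . -f values.yaml \
      --set-file workload_launcher=launcher.sh \
      --set-file workload_config=custom_setup_experiment.py \
      --set workload.image=nvcr.io/nvidia/nemo:26.02 \
      --set volumes.gcsMounts[0].bucketName="${LOG_BUCKET}" \
      --set volumes.gcsMounts[0].mountPath=/job-logs \
      --set "workload.envs[0].value=/job-logs/${RELEASE_NAME}" \
      --set queue=a4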
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/Chart.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/Chart.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/Chart.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/Chart.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/README.md
similarity index 98%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/README.md
index aa487339..c5095c27 100644
--- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/README.md
+++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/README.md
@@ -75,7 +75,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder.
git clone https://github.com/ai-hypercomputer/gpu-recipes.git
cd gpu-recipes
export REPO_ROOT=`git rev-parse --show-toplevel`
-export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe
+export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe
cd $RECIPE_ROOT
```
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/custom_setup_experiment.py b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/custom_setup_experiment.py
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/launcher.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/launcher.sh
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/recipe_launch_command.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/recipe_launch_command.sh
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/recipe_launch_command.sh
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/recipe_launch_command.sh
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-config-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-config-configmap.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-config-configmap.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-config-configmap.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-job.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-job.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-launcher-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-launcher-configmap.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-launcher-configmap.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-launcher-configmap.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-svc.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-svc.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/templates/workload-svc.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-svc.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/values.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO25.11/recipe/values.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2511/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/Chart.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/Chart.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/Chart.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/Chart.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/README.md
similarity index 98%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/README.md
index fc9352fb..f3954712 100644
--- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/README.md
+++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/README.md
@@ -75,7 +75,7 @@ Clone the `gpu-recipes` repository and set a reference to the recipe folder.
git clone https://github.com/ai-hypercomputer/gpu-recipes.git
cd gpu-recipes
export REPO_ROOT=`git rev-parse --show-toplevel`
-export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe
+export RECIPE_ROOT=$REPO_ROOT/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe
cd $RECIPE_ROOT
```
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py
similarity index 51%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py
index 369cfa0a..2337fdec 100644
--- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/custom_setup_experiment.py
+++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/custom_setup_experiment.py
@@ -1,19 +1,3 @@
-#!/usr/bin/env python3
-
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
import glob
import logging
import os
@@ -60,109 +44,6 @@
logger = logging.getLogger(__name__)
-def check_training_finished(log_file_path: str) -> bool:
- """Check if training is finished."""
- with open(log_file_path, "r") as f:
- log_lines = f.readlines()
- log = "\n".join(log_lines)
- return "StopIteration" in log or "after training is done" in log or "exiting program at iteration" in log
-
-
-def check_slurm_timeout(log_file_path: str) -> bool:
- """Check if Slurm job timed out."""
- with open(log_file_path, "r") as f:
- log_lines = f.readlines()
- log = "\n".join(log_lines)
- return "DUE TO TIME LIMIT" in log
-
-
-def is_flaky_failure(log_file_path: str) -> bool:
- """Check if Slurm job failed due to flaky failure."""
- with open(log_file_path, "r") as f:
- log_lines = f.readlines()
- log = "\n".join(log_lines)
-
- return (
- "The server socket has failed to listen on any local network address." in log
- or "Some NCCL operations have failed or timed out." in log
- or "uncorrectable ECC error encountered" in log
- or "illegal memory access" in log
- or "illegal instruction" in log
- or "torch.distributed.DistNetworkError" in log
- or "Segmentation fault" in log
- or "found NaN in" in log
- or "For debugging consider passing CUDA_LAUNCH_BLOCKING=1" in log
- or "double free or corruption" in log
- or "Call to CUDA function failed." in log
- or "Connection reset by peer" in log
- or "invalid pointer" in log
- or "malloc(): unaligned tcache chunk detected" in log
- or "zmq.error.ZMQError: Address already in use" in log
- or "We couldn't connect to 'https://huggingface.co'" in log
- or "Unpack failed: incomplete input" in log
- or "unspecified launch failure" in log
- or "free(): corrupted unsorted chunks" in log
- or "Segfault encountered" in log
- or "Fatal glibc error" in log
- or "EOFError: No data left in file" in log
- )
-
-
-def build_performance_config(args) -> Optional[Dict[str, Any]]:
- """Build performance configuration from command-line arguments.
-
- Args:
- args: Parsed command-line arguments
-
- Returns:
- Dictionary with performance configuration or None if performance is disabled
- """
- config = {}
-
- performance_params = {
- "timing_threshold": args.timing_threshold,
- "skip_first_percent_time": args.skip_first_percent_time,
- }
-
- for key, value in performance_params.items():
- if value is not None:
- config[key] = value
-
- return config if config else None
-
-
-def ensure_logs_where_written(log_file_paths: List[str]):
- """Ensure logs were written to disk."""
- if len(log_file_paths) != 1:
- raise FileNotFoundError(
- f"Unexpected number of log files found: {log_file_paths}. Expected 1, got {len(log_file_paths)}"
- )
-
-
-def get_job_dir_and_status_from_run(exp_name: str):
- """Get job directory and status from run."""
- result_dict = run.Experiment.from_title(exp_name).status(return_dict=True)
- _, job_dict = list(result_dict.items())[0]
- job_dir = job_dict["local_dir"]
- job_status = str(job_dict["status"])
- return job_dir, job_status
-
-
-def maybe_increase_n_attempts_on_flaky_failure(
- n_attempts: int,
- max_retries: int,
- is_finished_experiment: bool,
- is_long_convergence_run: bool,
- log_file_paths: List[str],
-):
- """Maybe increase number of attempts."""
- if not is_finished_experiment and not is_long_convergence_run:
- if is_flaky_failure(log_file_paths[-1]):
- n_attempts += 1
- else:
- n_attempts = max_retries # On non-flaky failures, we don't need to restart the experiment.
-
- return n_attempts
def main(
@@ -336,151 +217,14 @@ def main(
logger.info("Will launch the following command with Nemo-Run: %s", " ".join(nemorun_script.to_command()))
- is_finished_experiment = False # An experiment might consist of multiple training runs, due to restarts.
- is_testing_passed = False # Whether the testing passed convergence and performance validation.
- error_msg = None
- n_attempts = 0
- exp_name = (
- exp_name[:37] if dgxc_cluster is not None else exp_name
- ) # Some k8s clusters have a limit on the length of the experiment name.
- wandb_run_id = None
- while n_attempts <= max_retries:
- while is_finished_experiment is False:
- if HAVE_WANDB:
- wandb_run_id = (
- (wandb_run_id or wandb.util.generate_id()) if is_long_convergence_run else wandb.util.generate_id()
- )
- executor.env_vars.update(
- {
- "WANDB_RUN_ID": wandb_run_id,
- "WANDB_RESUME": "allow",
- }
- )
- if wandb_key is not None:
- executor.env_vars["WANDB_API_KEY"] = wandb_key
-
- run.run(
- nemorun_script,
- executor=executor,
- plugins=plugins,
- dryrun=dryrun,
- detach=detach,
- name=exp_name,
- )
- if dryrun:
- logger.info("dryrun requested: exiting")
- return
-
- def _copy_logs_to_gcp(job_dir_path):
- import shutil
- import glob
-
- artifact_dir = os.environ.get("ARTIFACT_DIR", "/tmp/artifacts")
- dest_logs_dir = os.path.join(artifact_dir, "logs")
- os.makedirs(dest_logs_dir, exist_ok=True)
-
- try:
- log_files = glob.glob(f"{job_dir_path}/log-*.out") + glob.glob(f"{job_dir_path}/log-*.err")
- for log_f in log_files:
- shutil.copy(log_f, dest_logs_dir)
- msg = f"Copied {log_f} to {dest_logs_dir}"
- print(msg)
- logger.info(msg)
- except Exception as e:
- print(f"Failed to copy logs to GCP: {e}")
- logger.error(f"Failed to copy logs to GCP: {e}")
-
-
- job_dir, job_status = get_job_dir_and_status_from_run(exp_name)
-
- if job_status not in ["SUCCEEDED", "SUBMITTED", "PENDING", "RUNNING"]:
- _copy_logs_to_gcp(job_dir)
- raise Exception(f"Experiment failed for {exp_name} with status: {job_status}.")
-
- if detach:
- is_finished_experiment = True
- is_testing_passed = True
- break
-
- log_file_paths = list(Path(f"{job_dir}").glob("log-*_0.out"))
- ensure_logs_where_written(log_file_paths)
-
- is_finished_experiment = (
- check_training_finished(log_file_paths[-1]) if is_long_convergence_run else (job_status == "SUCCEEDED")
- )
-
- n_attempts = maybe_increase_n_attempts_on_flaky_failure(
- n_attempts=n_attempts,
- max_retries=max_retries,
- is_finished_experiment=is_finished_experiment,
- is_long_convergence_run=is_long_convergence_run,
- log_file_paths=log_file_paths,
- )
-
- if not is_finished_experiment and n_attempts <= max_retries:
- logger.error(f"Starting attempt {n_attempts + 1} of {max_retries + 1} for {exp_name}")
-
- if not is_finished_experiment:
- break
-
- if is_finished_experiment is True and detach is False:
- log_paths = sorted(
- list(glob.glob(f"{get_nemorun_home()}/experiments/{exp_name}/{exp_name}_*/{exp_name}/log-*_0.out"))
- )
-
- if not is_long_convergence_run:
- log_paths = [log_paths[-1]]
-
- logger.info(f"Starting convergence check for {model_family_name}_{model_recipe_name}")
- wandb_run = None
- if HAVE_WANDB and wandb_key:
- wandb_run = wandb.init(
- project=wandb_project_name, entity=wandb_entity_name, id=wandb_run_id, resume="allow"
- )
-
- logger.info("Waiting 10 seconds for I/O to settle")
- time.sleep(10)
-
- is_testing_passed, error_msg = calc_convergence_and_performance(
- model_family_name=model_family_name,
- model_recipe_name=model_recipe_name,
- assets_dir=os.path.join(job_dir, exp_name),
- log_paths=log_paths,
- loss_metric="lm loss",
- timing_metric="elapsed time per iteration (ms)",
- alloc_metric="alloc",
- max_alloc_metric="max_alloc",
- golden_values_path=golden_values_path,
- convergence_config=convergence_params,
- performance_config=performance_params,
- memory_config=memory_params,
- wandb_run=wandb_run,
- )
-
- if wandb_run:
- wandb_run.finish()
- wandb.teardown(exit_code=int(not is_testing_passed))
-
- if not is_long_convergence_run:
- n_attempts = max_retries
- is_finished_experiment = True
- if not is_testing_passed:
- _copy_logs_to_gcp(job_dir)
- break
-
- if is_finished_experiment and is_testing_passed:
- break
-
- if not is_testing_passed and error_msg is not None:
- raise AssertionError(error_msg)
- if is_testing_passed and error_msg is not None:
- logger.warning(error_msg)
-
- if not is_finished_experiment:
- _copy_logs_to_gcp(job_dir)
- raise Exception("Megatron-Bridge CI test job failed")
- elif is_finished_experiment and not detach:
- logger.info("Megatron-Bridge CI test job completed successfully!")
+ run.run(
+ nemorun_script,
+ executor=executor,
+ plugins=plugins,
+ dryrun=dryrun,
+ detach=detach,
+ name=exp_name,
+ )
if __name__ == "__main__":
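Note: the block removed above was the CI harness — W&B run bookkeeping, retry-on-flaky-failure, and convergence/performance validation — so the recipe now hands the experiment directly to NeMo-Run. A minimal way to exercise the simplified path without scheduling pods, assuming the script still exposes a --dryrun flag wired to main(dryrun=...) (not shown in this diff), might be:

    # Sketch only: --dryrun is assumed to map to main(dryrun=...), alongside
    # the recipe arguments shown in launcher.sh. With the harness removed,
    # run.run() is the single submission point, so a dry run should print the
    # planned command and exit without launching anything.
    cd /opt/Megatron-Bridge
    python scripts/performance/custom_setup_experiment.py \
      --model_recipe_name deepseek_v3 \
      --dryrun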
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh
similarity index 68%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh
index 3cb08b61..dd15b2d0 100644
--- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/launcher.sh
+++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/launcher.sh
@@ -7,7 +7,7 @@ EOF
}
parse_args() {
- while [ "$1" != "" ]; do
+ while [[ "$1" != "" ]]; do
case $(grep -o "=" <<< "$1" | wc -l) in
1 )
config_overrides+=("$1")
@@ -25,15 +25,15 @@ parse_args() {
config_overrides=()
parse_args "$@"
-if [ -z "${config_overrides}" ]; then
+if [[ -z "${config_overrides[*]}" ]]; then
echo "No NeMo config overrides specified"
else
echo "NeMo config overrides:"
echo " ${config_overrides}"
fi
-export LD_LIBRARY_PATH="$NCCL_PLUGIN_PATH:/usr/local/nvidia/lib64"
-ldconfig $LD_LIBRARY_PATH
+export LD_LIBRARY_PATH="/usr/local/cuda/compat/lib:$NCCL_PLUGIN_PATH:$LD_LIBRARY_PATH"
+ldconfig "$LD_LIBRARY_PATH"
echo "Added $LD_LIBRARY_PATH to ldconfig:"
ldconfig -p | grep libcuda | sed 's/^/ /'
echo ""
@@ -47,7 +47,7 @@ echo "Logging to ${explicit_log_dir}"
if [[ -n "${TOKENIZER_PATH}" ]]; then
echo "Getting tokenizer files"
- cp ${TOKENIZER_PATH}/* .
+ cp "${TOKENIZER_PATH}"/* .
echo ""
fi
@@ -56,14 +56,22 @@ echo "Launching Torch distributed on the node rank $JOB_COMPLETION_INDEX out of
pip install git+https://github.com/NVIDIA/dllogger#egg=dllogger
# Create the nsys directory.
-mkdir -p ${explicit_log_dir}/nsys
-
-if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
- echo "--- DEBUG libnccl-env.so ---"
- ls -la /usr/local/gib/lib/libnccl-env.so || echo "libnccl-env.so not found"
- ls -lh /usr/local/gib/lib
- echo "----------------------------"
+mkdir -p "${explicit_log_dir}/nsys"
+
+# Collect version diagnostics into a single JSON line
+kv="\"kernel_version\": \"$(uname --kernel-release)\""
+if command -v nvidia-smi &> /dev/null; then
+  cuda_v=$(nvidia-smi -q -x | grep -Po '(?<=<cuda_version>).*(?=</cuda_version>)' || true)
+  driver_v=$(nvidia-smi -q -x | grep -Po '(?<=<driver_version>).*(?=</driver_version>)' || true)
+  vbios_v=$(nvidia-smi -q -x | grep -Po '(?<=<vbios_version>).*(?=</vbios_version>)' | head -n1 || true)
+ kv="${kv}, \"cuda_version\": \"${cuda_v}\""
+ kv="${kv}, \"driver_version\": \"${driver_v}\""
+ kv="${kv}, \"vbios_version\": \"${vbios_v}\""
fi
+echo "VERSION_DIAGNOSTICS: {${kv}}"
+
+
+export HF_TOKEN=YOUR_HF_TOKEN
cd /opt
rm -rf Megatron-Bridge
@@ -71,17 +79,13 @@ git clone https://github.com/NVIDIA-NeMo/Megatron-Bridge.git
cd Megatron-Bridge
git checkout f7a9428f301fa17ac374d5e7166a63b0aa4771af
git submodule update --init --recursive
-sed -i -e '/return config/i \ config.dist.distributed_timeout_minutes = 30' scripts/performance/run_recipe.py
+sed -i -e '/pretrain(config=recipe/i \ recipe.dist.distributed_timeout_minutes = 60' scripts/performance/run_script.py
ls
cp $CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH scripts/performance/
worker_command=$(cat <<- EOM
if [ "\$RANK" -eq "0" ]; then
- echo "--- LOCATING MEGATRON LIBRARIES ---" ;
- python -c "import megatron.core; print('megatron.core:', megatron.core.__file__)" || echo "megatron.core not found" ;
- python -c "import megatron.bridge; print('megatron.bridge:', megatron.bridge.__file__)" || echo "megatron.bridge not found" ;
- echo "-----------------------------------" ;
echo "Worker 0 is stalling for a few seconds.." ;
sleep 3 ;
echo "The detected environment within worker rank 0 is:" ;
@@ -89,9 +93,8 @@ worker_command=$(cat <<- EOM
fi ;
cd /opt/Megatron-Bridge ;
- export PYTHONPATH="/opt/Megatron-Bridge:/opt/Megatron-Bridge/3rdparty/Megatron-LM:\$PYTHONPATH" ;
- exec numactl \
+ numactl \
--cpunodebind=\$((LOCAL_RANK/4)) \
--membind=\$((LOCAL_RANK/4)) nsys profile \
-t nvtx,cuda \
@@ -100,7 +103,7 @@ worker_command=$(cat <<- EOM
--capture-range=cudaProfilerApi \
--capture-range-end=stop \
--kill none \
- -o /${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK \
+ -o "/${explicit_log_dir}/$JOB_IDENTIFIER/rank-\$RANK" \
--force-overwrite true \
--session-new "nsys-\$RANDOM-\$RANK" \
nice -10 \
@@ -110,16 +113,19 @@ worker_command=$(cat <<- EOM
--model_recipe_name deepseek_v3 \
--gpus_per_node 8 \
--num_gpus 256 \
+ --compute_dtype bf16 \
+ --seq_length 4096 \
--global_batch_size 2048 \
--micro_batch_size 1 \
- --seq_length 4096 \
--tensor_model_parallel_size 1 \
--pipeline_model_parallel_size 16 \
+ --expert_model_parallel_size 8 \
+ --expert_tensor_parallel_size 1 \
--context_parallel_size 1 \
--virtual_pipeline_model_parallel_size None \
- --expert_model_parallel_size 8 \
- --compute_dtype bf16 \
- --max_steps 30 dist.distributed_timeout_minutes=30
+ --recompute_modules mla_up_proj \
+ --moe_a2a_overlap False \
+ --max_steps 30
EOM
)
@@ -138,10 +144,10 @@ torchrun \
if [[ "$JOB_COMPLETION_INDEX" == "0" ]]; then
- mkdir -p ${ARTIFACT_DIR}
- cp -r ${explicit_log_dir}/* ${ARTIFACT_DIR}/
- env > ${ARTIFACT_DIR}/environ.txt
- ls ${ARTIFACT_DIR}
+ mkdir -p "${ARTIFACT_DIR}"
+ cp -r "${explicit_log_dir}"/* "${ARTIFACT_DIR}/"
+ env > "${ARTIFACT_DIR}/environ.txt"
+ ls "${ARTIFACT_DIR}"
fi
echo "Training completed"
echo "Pod on $(hostname --fqdn) is exiting"
\ No newline at end of file
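Two notes on the launcher changes. The switch to [[ -z "${config_overrides[*]}" ]] matters because the old [ -z "${config_overrides}" ] only inspected the first array element; the [*] expansion joins every element, so the empty branch now fires only when no overrides were passed at all. Separately, the new diagnostics block parses the XML output of nvidia-smi -q -x with fixed-length look-around patterns; a standalone sanity check of that logic, safe on nodes without GPUs, could look like this sketch:

    # Mirrors the diagnostics block added above; the version numbers printed
    # are whatever the node reports, the shape of the line is what matters.
    kv="\"kernel_version\": \"$(uname --kernel-release)\""
    if command -v nvidia-smi &> /dev/null; then
      cuda_v=$(nvidia-smi -q -x | grep -Po '(?<=<cuda_version>).*(?=</cuda_version>)' || true)
      kv="${kv}, \"cuda_version\": \"${cuda_v}\""
    fi
    echo "VERSION_DIAGNOSTICS: {${kv}}"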
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-config-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-config-configmap.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-config-configmap.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-config-configmap.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml
similarity index 98%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml
index 54efbb6b..b4ffa210 100644
--- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-job.yaml
+++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-job.yaml
@@ -62,7 +62,7 @@ spec:
gke-parallelstore/memory-limit: "0"
{{- end }}
{{- if and $root.Values.queue $root.Values.tasSettings.topologyRequest }}
- kueue.x-k8s.io/podset-preferred-topology: {{ .Values.tasSettings.topologyRequest | default "kubernetes.io/hostname" }}
+ {{- toYaml .Values.tasSettings.topologyRequest | nindent 14 }}
{{- end }}
{{- if and $root.Values.queue $root.Values.dwsSettings.maxRunDurationSeconds }}
provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{ $root.Values.dwsSettings.maxRunDurationSeconds }}"
@@ -245,7 +245,7 @@ spec:
value: "{{ $gpusPerNode }}"
- name: NCCL_PLUGIN_PATH
- value: /usr/local/gib/lib64
+ value: /usr/local/gib/lib64:/usr/local/nvidia/lib64
{{ if $root.Values.network.gibVersion }}
- name: NCCL_INIT_SCRIPT
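With toYaml, whatever map sits under tasSettings.topologyRequest in values.yaml is emitted verbatim as pod annotations, rather than being forced into the single podset-preferred-topology key; the block is still gated on both queue and tasSettings.topologyRequest being set. A local render to confirm the annotation survives the change — the release name here is illustrative — could be:

    # Render the chart without installing and look for the topology annotation.
    helm template check . -f values.yaml \
      --set-file workload_launcher=launcher.sh \
      --set-file workload_config=custom_setup_experiment.py \
      --set queue=a4 \
      | grep 'podset-preferred-topology'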
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-launcher-configmap.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-launcher-configmap.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-launcher-configmap.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-launcher-configmap.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-svc.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-svc.yaml
similarity index 100%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/templates/workload-svc.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/templates/workload-svc.yaml
diff --git a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml
similarity index 64%
rename from training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml
rename to training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml
index 05e98e12..cb73da9b 100644
--- a/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/32node-BF16-SEQ4096-GBS2048-NEMO26.02/recipe/values.yaml
+++ b/training/a4/deepseek_v3/megatron-bridge-pretraining-gke/nemo2602/32node-BF16-SEQ4096-GBS2048/recipe/values.yaml
@@ -1,35 +1,33 @@
+queue: null
dwsSettings:
maxRunDurationSeconds: null
-network:
- gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1
- hostNetwork: true
- ncclSettings:
- - name: NCCL_DEBUG
- value: INFO
- - name: NCCL_TIMEOUT
- value: '7200000'
- subnetworks[]: null
-queue: null
tasSettings:
topologyRequest:
kueue.x-k8s.io/podset-preferred-topology: kubernetes.io/hostname
volumes:
- gcsMounts:
- - bucketName: null
- mountPath: null
gcsVolumes: true
psVolumes: false
+ gcsMounts:
+ - bucketName: null
+ mountPath: null
workload:
+ gpus: 256
+ image: nvcr.io/nvidia/nemo:26.02
+ defaultArguments[]: null
arguments[]: null
configFile: custom_setup_experiment.py
configPath: /workload/configs/
- defaultArguments[]: null
envs:
- - name: ARTIFACT_DIR
- value: null
- - name: GLOO_SOCKET_IFNAME
- value: eth0
- - name: CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH
- value: /workload/configs/custom_setup_experiment.py
- gpus: 256
- image: nvcr.io/nvidia/nemo:26.02
+ - name: ARTIFACT_DIR
+ value: null
+ - name: GLOO_SOCKET_IFNAME
+ value: eth0
+ - name: CUSTOM_SETUP_EXPERIMENT_SCRIPT_PATH
+ value: /workload/configs/custom_setup_experiment.py
+network:
+ hostNetwork: true
+ subnetworks[]: null
+ gibVersion: us-docker.pkg.dev/gce-ai-infra/gpudirect-gib/nccl-plugin-gib:v1.1.1
+ ncclSettings:
+ - name: NCCL_DEBUG
+ value: WARN
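The default NCCL log level drops from INFO to WARN here, and the NCCL_TIMEOUT override is removed entirely. When chasing fabric issues, verbosity can be restored per-install without editing the chart — index 0 targets the NCCL_DEBUG entry as ordered above; the release name is illustrative:

    # One-off override of the NCCL debug level at install time.
    helm upgrade --install my-deepseek-run . -f values.yaml \
      --set-file workload_launcher=launcher.sh \
      --set-file workload_config=custom_setup_experiment.py \
      --set network.ncclSettings[0].value=INFO \
      --set queue=a4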